In [1]:
import pandas as pd
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

In [2]:
df = pd.read_csv('UcdpPrioConflict_v23_1.csv', encoding='latin-1')

In [3]:
df.shape

(2626, 28)

In [4]:
df.head()

Unnamed: 0,conflict_id,location,side_a,side_a_id,side_a_2nd,side_b,side_b_id,side_b_2nd,incompatibility,territory_name,...,ep_end,ep_end_date,ep_end_prec,gwno_a,gwno_a_2nd,gwno_b,gwno_b_2nd,gwno_loc,region,version
0,11342,India,Government of India,141,,GNLA,1163,,1,Garoland,...,1,2012-12-21,,750,,,,750,3,23.1
1,11342,India,Government of India,141,,GNLA,1163,,1,Garoland,...,1,2014-11-27,,750,,,,750,3,23.1
2,11343,"Egypt, Israel",Government of Egypt,117,,Government of Israel,121,,1,Suez/Sinai,...,1,1967-06-10,,651,,666.0,,"651, 666",2,23.1
3,11343,"Egypt, Israel",Government of Egypt,117,,Government of Israel,121,,1,Suez/Sinai,...,0,,,651,,666.0,,"651, 666",2,23.1
4,11343,"Egypt, Israel",Government of Egypt,117,,Government of Israel,121,,1,Suez/Sinai,...,1,1970-08-07,,651,,666.0,,"651, 666",2,23.1


In [5]:
df.columns

Index(['conflict_id', 'location', 'side_a', 'side_a_id', 'side_a_2nd',
       'side_b', 'side_b_id', 'side_b_2nd', 'incompatibility',
       'territory_name', 'year', 'intensity_level', 'cumulative_intensity',
       'type_of_conflict', 'start_date', 'start_prec', 'start_date2',
       'start_prec2', 'ep_end', 'ep_end_date', 'ep_end_prec', 'gwno_a',
       'gwno_a_2nd', 'gwno_b', 'gwno_b_2nd', 'gwno_loc', 'region', 'version'],
      dtype='object')

In [6]:
target_strings = ["Nigeria", "Ethiopia", "Egypt", "DR Congo", "Tanzania", "South Africa",
    "Kenya", "Uganda", "Sudan", "Algeria", "Morocco", "Angola", "Ghana",
    "Mozambique", "Madagascar", "Ivory Coast", "Cameroon", "Niger", "Mali",
    "Burkina Faso", "Malawi", "Zambia", "Chad", "Somalia", "Senegal", "Zimbabwe",
    "Guinea", "Rwanda", "Benin", "Burundi", "Tunisia", "South Sudan", "Togo",
    "Sierra Leone", "Libya", "Republic of the Congo", "Central African Republic",
    "Liberia", "Mauritania", "Eritrea", "Gambia", "Botswana", "Namibia", "Gabon",
    "Lesotho", "Guinea-Bissau", "Equatorial Guinea", "Mauritius", "Eswatini",
    "Djibouti", "Comoros", "Cape Verde", "Western Sahara",
    "São Tomé and Príncipe", "Seychelles"]

In [7]:
filtered_df = df[df['location'].isin(target_strings)]

In [8]:
filtered_df.head()

Unnamed: 0,conflict_id,location,side_a,side_a_id,side_a_2nd,side_b,side_b_id,side_b_2nd,incompatibility,territory_name,...,ep_end,ep_end_date,ep_end_prec,gwno_a,gwno_a_2nd,gwno_b,gwno_b_2nd,gwno_loc,region,version
6,11344,Sudan,Government of Sudan,112,,Republic of South Sudan,1129,,1,Abyei,...,1,2011-06-15,,625,,,,625,4,23.1
7,11345,South Sudan,Government of South Sudan,113,,"SSDM/A, SSLM/A","1108, 1124",,2,,...,0,,,626,,,,626,4,23.1
8,11345,South Sudan,Government of South Sudan,113,,SSLM/A,1124,,2,,...,0,,,626,,,,626,4,23.1
9,11345,South Sudan,Government of South Sudan,113,,"SPLM/A - IO, SSDM/A - Cobra Faction","3563, 4226",,2,,...,0,,,626,,,,626,4,23.1
10,11345,South Sudan,Government of South Sudan,113,Government of Uganda,SPLM/A - IO,4226,,2,,...,0,,,626,500.0,,,626,4,23.1


In [9]:
filtered_df.shape

(790, 28)

In [10]:
null_count = filtered_df["ep_end_date"].isnull().sum()
null_count

632

In [11]:
df2 = pd.read_csv('Nonstate_v23_1.csv', encoding='latin-1')

In [12]:
filtered_df2 = df2[df2['location'].isin(target_strings)]

In [13]:
filtered_df2.head()

Unnamed: 0,conflict_id,dyad_id,org,side_a_name,side_a_name_fulltext,side_a_name_mothertongue,side_a_id,side_a_components,side_a_2nd,gwno_a_2nd,...,ep_end_date,ep_end_prec,year,best_fatality_estimate,low_fatality_estimate,high_fatality_estimate,location,gwno_location,region,version
0,9488,10098,3,Fulani,Fulani,Fulani,607,,,,...,2021-06-09,1.0,2021,41,41,81,Nigeria,475,4,23.1
1,9637,10247,3,Konianke,Konianke,Konianke,1713,,,,...,2013-07-17,2.0,2013,98,98,98,Guinea,438,4,23.1
2,9654,10264,3,Arab,Arab,Arab,1012,,,,...,,0.0,2022,28,28,28,Sudan,625,4,23.1
3,9668,10278,3,Arab,Arab,Arab,1012,,,,...,,,2021,412,412,463,Sudan,625,4,23.1
4,9668,10278,3,Arab,Arab,Arab,1012,,,,...,,0.0,2022,231,231,231,Sudan,625,4,23.1


In [14]:
filtered_df2.shape

(784, 32)

In [15]:
selected_columns = ['location', 'start_date', 'start_date2', 'start_prec', 'start_prec2', 'ep_end_date']
column_filtered_df1 = filtered_df[selected_columns]
column_filtered_df1.head()

Unnamed: 0,location,start_date,start_date2,start_prec,start_prec2,ep_end_date
6,Sudan,2011-05-01,2011-05-19,1,1,2011-06-15
7,South Sudan,2011-08-20,2011-08-20,2,2,
8,South Sudan,2011-08-20,2011-08-20,2,2,
9,South Sudan,2011-08-20,2011-08-20,2,2,
10,South Sudan,2011-08-20,2011-08-20,2,2,


In [16]:
selected_columns = ['location', 'start_date', 'start_date2', 'start_prec', 'start_prec2', 'ep_end_date']
column_filtered_df2 = filtered_df2[selected_columns]
column_filtered_df2.head()

Unnamed: 0,location,start_date,start_date2,start_prec,start_prec2,ep_end_date
0,Nigeria,1998-01-18,2021-06-05,2,1,2021-06-09
1,Guinea,2013-07-15,2013-07-17,1,2,2013-07-17
2,Sudan,1989-09-30,2022-08-04,3,1,
3,Sudan,2020-07-19,2021-01-17,1,2,
4,Sudan,2020-07-19,2021-01-17,1,2,


In [17]:
combined_df = pd.concat([column_filtered_df2, column_filtered_df1], ignore_index=True)

In [18]:
combined_df.head()

Unnamed: 0,location,start_date,start_date2,start_prec,start_prec2,ep_end_date
0,Nigeria,1998-01-18,2021-06-05,2,1,2021-06-09
1,Guinea,2013-07-15,2013-07-17,1,2,2013-07-17
2,Sudan,1989-09-30,2022-08-04,3,1,
3,Sudan,2020-07-19,2021-01-17,1,2,
4,Sudan,2020-07-19,2021-01-17,1,2,


In [19]:
combined_df.shape

(1574, 6)

In [20]:
combined_df['start_date'] = pd.to_datetime(combined_df['start_date'])
combined_df['ep_end_date'] = pd.to_datetime(combined_df['ep_end_date'])
combined_df['duration'] = (combined_df['ep_end_date'] - combined_df['start_date']).dt.days

In [21]:
# Define labels based on duration
def label_duration(duration):
    if duration < 120:
        return 'short'
    elif duration < 200:
        return 'medium'
    else:
        return 'long'

In [22]:
combined_df['duration_label'] = combined_df['duration'].apply(label_duration)

In [40]:
X = combined_df[['start_prec', 'start_prec2']]
y = combined_df['duration_label']
y.value_counts()

long      1278
short      266
medium      30
Name: duration_label, dtype: int64

In [24]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [25]:
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

In [26]:
y_pred = model.predict(X_test)

In [27]:
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

        long       0.82      1.00      0.90       259
      medium       0.00      0.00      0.00         3
       short       0.00      0.00      0.00        53

    accuracy                           0.82       315
   macro avg       0.27      0.33      0.30       315
weighted avg       0.68      0.82      0.74       315



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Saving the model

In [28]:
import joblib

In [30]:
model_file = "my_model.joblib"
joblib.dump(model, model_file)

['my_model.joblib']

In [38]:
y_pred

array(['long', 'long', 'long', 'long', 'long', 'long', 'long', 'long',
       'long', 'long', 'long', 'long', 'long', 'long', 'long', 'long',
       'long', 'long', 'long', 'long', 'long', 'long', 'long', 'long',
       'long', 'long', 'long', 'long', 'long', 'long', 'long', 'long',
       'long', 'long', 'long', 'long', 'long', 'long', 'long', 'long',
       'long', 'long', 'long', 'long', 'long', 'long', 'long', 'long',
       'long', 'long', 'long', 'long', 'long', 'long', 'long', 'long',
       'long', 'long', 'long', 'long', 'long', 'long', 'long', 'long',
       'long', 'long', 'long', 'long', 'long', 'long', 'long', 'long',
       'long', 'long', 'long', 'long', 'long', 'long', 'long', 'long',
       'long', 'long', 'long', 'long', 'long', 'long', 'long', 'long',
       'long', 'long', 'long', 'long', 'long', 'long', 'long', 'long',
       'long', 'long', 'long', 'long', 'long', 'long', 'long', 'long',
       'long', 'long', 'long', 'long', 'long', 'long', 'long', 'long',
      