In [23]:
'''Year, COA, age group, number of applications to get COO
'''

'Year, COA, age group, number of applications to get COO\n'

In [2]:
import requests
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, ConfusionMatrixDisplay
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from scipy.stats import randint

# Tree Visualisation
from sklearn.tree import export_graphviz
from IPython.display import Image

In [3]:
def get_data(url, timeout=120):
    """Fetch one JSON payload and return its ``items`` records as a DataFrame.

    Parameters
    ----------
    url : str
        Endpoint to request; it is coerced with ``str()`` before use.
    timeout : float or tuple, optional
        Passed to ``requests.get`` so that a stalled connection raises
        instead of hanging the kernel. Defaults to 120 seconds.

    Returns
    -------
    pandas.DataFrame
        One row per element of the response's ``"items"`` list.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    KeyError
        If the decoded JSON object has no ``"items"`` key.
    """
    response = requests.get(str(url), timeout=timeout)
    # Fail loudly on HTTP errors rather than trying to parse an error body.
    response.raise_for_status()
    payload = response.json()
    return pd.DataFrame.from_records(payload["items"])

In [4]:
# Pull demographics rows from the UNHCR population API for 2010-2025 with the
# coo_all / coa_all flags set (presumably "all countries of origin / asylum").
# NOTE(review): the very large `limit` looks intended to fetch everything in a
# single page — confirm against the API's paging rules.
demographics = get_data('https://api.unhcr.org/population/v1/demographics/?&yearFrom=2010&yearTo=2025&coo_all=TRUE&limit=10000000&coa_all=TRUE')

In [5]:
# Show the frame exactly as built from the API response, before any cleaning.
demographics

Unnamed: 0,year,coo_id,coo_name,coo,coo_iso,coa_id,coa_name,coa,coa_iso,f_0_4,...,f_other,f_total,m_0_4,m_5_11,m_12_17,m_18_59,m_60,m_other,m_total,total
0,2010,2,Afghanistan,AFG,AFG,2,Afghanistan,AFG,AFG,70435,...,18,642683,72389,187679,114272,265496,23938,31,663805,1311554
1,2010,91,Iran (Islamic Rep. of),IRN,IRN,2,Afghanistan,AFG,AFG,0,...,0,16,0,5,5,20,0,0,30,51
2,2010,92,Iraq,IRQ,IRQ,2,Afghanistan,AFG,AFG,0,...,0,0,6,0,0,0,0,0,6,6
3,2010,147,Pakistan,PAK,PAK,2,Afghanistan,AFG,AFG,0,...,0,7,0,0,0,9,0,0,9,6407
4,2010,8,Egypt,ARE,EGY,3,Albania,ALB,ALB,5,...,0,5,0,0,0,0,0,0,0,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64455,2022,207,Venezuela (Bolivarian Republic of),VEN,VEN,224,Aruba,ABW,ABW,0,...,9000,9000,0,0,0,0,0,8000,8000,17000
64456,2022,224,Aruba,ABW,ABW,224,Aruba,ABW,ABW,0,...,901,901,0,0,0,0,0,799,799,1700
64457,2022,207,Venezuela (Bolivarian Republic of),VEN,VEN,254,Curacao,CUW,CUW,0,...,8000,8000,0,0,0,0,0,6000,6000,14000
64458,2022,254,Curacao,CUW,CUW,254,Curacao,CUW,CUW,0,...,901,901,0,0,0,0,0,799,799,1700


In [6]:
# Columns that must hold integers; listed once so the cast cannot drift out of
# sync with itself.
integer_columns = [
    'year',
    'f_0_4', 'f_5_11', 'f_12_17', 'f_18_59', 'f_60', 'f_other', 'f_total',
    'm_0_4', 'm_5_11', 'm_12_17', 'm_18_59', 'm_60', 'm_other', 'm_total',
    'total',
]
# Code columns removed here; the *_id and *_name columns are kept.
code_columns = ['coo', 'coo_iso', 'coa', 'coa_iso']

demographics = (
    demographics
    .astype({column: int for column in integer_columns})
    # errors="ignore" keeps this cell re-runnable once the columns are gone.
    .drop(columns=code_columns, errors="ignore")
    .dropna()
)
demographics

Unnamed: 0,year,coo_id,coo_name,coa_id,coa_name,f_0_4,f_5_11,f_12_17,f_18_59,f_60,f_other,f_total,m_0_4,m_5_11,m_12_17,m_18_59,m_60,m_other,m_total,total
0,2010,2,Afghanistan,2,Afghanistan,70435,180578,103913,272816,14923,18,642683,72389,187679,114272,265496,23938,31,663805,1311554
1,2010,91,Iran (Islamic Rep. of),2,Afghanistan,0,0,0,16,0,0,16,0,5,5,20,0,0,30,51
2,2010,92,Iraq,2,Afghanistan,0,0,0,0,0,0,0,6,0,0,0,0,0,6,6
3,2010,147,Pakistan,2,Afghanistan,0,0,0,7,0,0,7,0,0,0,9,0,0,9,6407
4,2010,8,Egypt,3,Albania,5,0,0,0,0,0,5,0,0,0,0,0,0,0,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64455,2022,207,Venezuela (Bolivarian Republic of),224,Aruba,0,0,0,0,0,9000,9000,0,0,0,0,0,8000,8000,17000
64456,2022,224,Aruba,224,Aruba,0,0,0,0,0,901,901,0,0,0,0,0,799,799,1700
64457,2022,207,Venezuela (Bolivarian Republic of),254,Curacao,0,0,0,0,0,8000,8000,0,0,0,0,0,6000,6000,14000
64458,2022,254,Curacao,254,Curacao,0,0,0,0,0,901,901,0,0,0,0,0,799,799,1700


In [7]:
# Discard every row whose coo_id equals its coa_id.
ids_differ = demographics["coo_id"].ne(demographics["coa_id"])
demographics = demographics.loc[ids_differ]

In [8]:
# Show the frame after rows with coo_id == coa_id were removed.
demographics

Unnamed: 0,year,coo_id,coo_name,coa_id,coa_name,f_0_4,f_5_11,f_12_17,f_18_59,f_60,f_other,f_total,m_0_4,m_5_11,m_12_17,m_18_59,m_60,m_other,m_total,total
1,2010,91,Iran (Islamic Rep. of),2,Afghanistan,0,0,0,16,0,0,16,0,5,5,20,0,0,30,51
2,2010,92,Iraq,2,Afghanistan,0,0,0,0,0,0,0,6,0,0,0,0,0,6,6
3,2010,147,Pakistan,2,Afghanistan,0,0,0,7,0,0,7,0,0,0,9,0,0,9,6407
4,2010,8,Egypt,3,Albania,5,0,0,0,0,0,5,0,0,0,0,0,0,0,5
5,2010,37,China,3,Albania,0,0,0,0,0,0,0,0,0,0,6,0,0,6,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64453,2022,81,Haiti,223,Anguilla,0,0,0,0,0,0,0,0,0,0,0,0,0,0,14
64454,2022,207,Venezuela (Bolivarian Republic of),223,Anguilla,0,0,0,0,0,0,0,0,0,0,0,0,0,0,25
64455,2022,207,Venezuela (Bolivarian Republic of),224,Aruba,0,0,0,0,0,9000,9000,0,0,0,0,0,8000,8000,17000
64457,2022,207,Venezuela (Bolivarian Republic of),254,Curacao,0,0,0,0,0,8000,8000,0,0,0,0,0,6000,6000,14000


In [9]:
# Reshape wide -> long: one row per (year, coo_name, sex/age column).
# The *_other and *_total columns are not melted, and coa_name is not carried
# into the long table.
age_bands = ("0_4", "5_11", "12_17", "18_59", "60")
sex_age_columns = [f"{sex}_{band}" for sex in ("f", "m") for band in age_bands]

df_melted = (
    demographics
    .melt(
        id_vars=["year", "coo_name"],
        value_vars=sex_age_columns,
        var_name="age_group",
        value_name="number",
    )
    .assign(
        # The first character of the melted column name is the sex marker.
        gender=lambda frame: frame["age_group"].str[0].str.upper(),
        # Everything after the "f_" / "m_" prefix is the age band.
        age_group=lambda frame: frame["age_group"].str[2:],
    )
    .loc[:, ["year", "coo_name", "gender", "age_group", "number"]]
)
df_melted

Unnamed: 0,year,coo_name,gender,age_group,number
0,2010,Iran (Islamic Rep. of),F,0_4,0
1,2010,Iraq,F,0_4,0
2,2010,Pakistan,F,0_4,0
3,2010,Egypt,F,0_4,5
4,2010,China,F,0_4,0
...,...,...,...,...,...
638025,2022,Haiti,M,60,0
638026,2022,Venezuela (Bolivarian Republic of),M,60,0
638027,2022,Venezuela (Bolivarian Republic of),M,60,0
638028,2022,Venezuela (Bolivarian Republic of),M,60,0


In [10]:
# Persist the long-format table; index=False leaves the row index out of the file.
df_melted.to_csv('demo_final_2.csv', index=False, header=True)

In [11]:
# Reload the long-format table from disk; from here on the notebook works on `df`.
df = pd.read_csv('demo_final_2.csv')

In [12]:
# Encode gender numerically: F -> 0, M -> 1. Series.map turns any value that
# is not a key into NaN, so this cell must not be run twice on the same frame.
gender_codes = {'F': 0, 'M': 1}
df = df.assign(gender=df['gender'].map(gender_codes))
df

Unnamed: 0,year,coo_name,gender,age_group,number
0,2010,Iran (Islamic Rep. of),0,0_4,0
1,2010,Iraq,0,0_4,0
2,2010,Pakistan,0,0_4,0
3,2010,Egypt,0,0_4,5
4,2010,China,0,0_4,0
...,...,...,...,...,...
638025,2022,Haiti,1,60,0
638026,2022,Venezuela (Bolivarian Republic of),1,60,0
638027,2022,Venezuela (Bolivarian Republic of),1,60,0
638028,2022,Venezuela (Bolivarian Republic of),1,60,0


In [13]:
# Distinct coo_name values, in order of first appearance.
a = df['coo_name'].unique()
a

array(['Iran (Islamic Rep. of)', 'Iraq', 'Pakistan', 'Egypt', 'China',
       'Palestinian', 'Serbia and Kosovo: S/RES/1244 (1999)', 'Türkiye',
       'Angola', 'Benin', 'Chad', 'Cameroon', 'Congo',
       'Dem. Rep. of the Congo', 'Guinea', "Cote d'Ivoire", 'Liberia',
       'Libya', 'Niger', 'Nigeria', 'Somalia', 'Sudan', 'Western Sahara',
       'Burundi', 'Central African Rep.', 'Eritrea', 'Ethiopia',
       'Guinea-Bissau', 'Mauritania', 'Rwanda', 'Senegal', 'Sierra Leone',
       'United Rep. of Tanzania', 'Unknown ', 'Algeria', 'Djibouti',
       'Kazakhstan', 'Mali', 'Russian Federation', 'Saudi Arabia',
       'Syrian Arab Rep.', 'Tajikistan', 'Turkmenistan', 'Tunisia',
       'Uganda', 'Uzbekistan', 'Yemen', 'Zimbabwe', 'Albania', 'Armenia',
       'Bangladesh', 'Bolivia (Plurinational State of)',
       'Bosnia and Herzegovina', 'Chile', 'Colombia', 'Costa Rica',
       'Cuba', 'Dominican Rep.', 'Ecuador', 'Georgia', 'Ghana', 'Haiti',
       'Honduras', 'Hungary', 'India', '

In [14]:
# One-hot encode age_group. drop_first=True omits the first level, so a row
# with all age_group_* indicators at 0 belongs to the omitted level (0_4,
# judging by the indicator columns shown in the next cell's output).
df = pd.get_dummies(df, columns=['age_group'], drop_first=True)


In [15]:
# Show the frame after one-hot encoding age_group.
df

Unnamed: 0,year,coo_name,gender,number,age_group_12_17,age_group_18_59,age_group_5_11,age_group_60
0,2010,Iran (Islamic Rep. of),0,0,0,0,0,0
1,2010,Iraq,0,0,0,0,0,0
2,2010,Pakistan,0,0,0,0,0,0
3,2010,Egypt,0,5,0,0,0,0
4,2010,China,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...
638025,2022,Haiti,1,0,0,0,0,1
638026,2022,Venezuela (Bolivarian Republic of),1,0,0,0,0,1
638027,2022,Venezuela (Bolivarian Republic of),1,0,0,0,0,1
638028,2022,Venezuela (Bolivarian Republic of),1,0,0,0,0,1


In [16]:
# NOTE(review): 210 presumably is the number of distinct coo_name values (the
# next cell reports len(a) == 210), which would make this the accuracy of a
# uniform random guess — TODO derive it from the data instead of hard-coding.
1/210

0.004761904761904762

In [17]:
# Recompute the distinct coo_name values (rebinding `a`) and show how many there are.
a = df['coo_name'].unique()
len(a)

210

In [18]:
# Swap each coo_name string for an integer code. `factor` is the
# (codes, uniques) pair returned by pd.factorize; `indx` holds the uniques,
# i.e. position -> original name.
factor = pd.factorize(df['coo_name'])
name_codes, indx = factor
df['coo_name'] = name_codes

In [19]:
# Show the frame with coo_name replaced by its integer code.
df

Unnamed: 0,year,coo_name,gender,number,age_group_12_17,age_group_18_59,age_group_5_11,age_group_60
0,2010,0,0,0,0,0,0,0
1,2010,1,0,0,0,0,0,0
2,2010,2,0,0,0,0,0,0
3,2010,3,0,5,0,0,0,0
4,2010,4,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...
638025,2022,61,1,0,0,0,0,1
638026,2022,75,1,0,0,0,0,1
638027,2022,75,1,0,0,0,0,1
638028,2022,75,1,0,0,0,0,1


In [20]:
from sklearn.preprocessing import StandardScaler

In [22]:
# Fixed seed so the split, the forest and therefore the reported accuracy are
# reproducible under Restart & Run All.
RANDOM_STATE = 42

# Features: every column except the target. Target: the integer code that
# replaced coo_name.
X = df.drop('coo_name', axis=1)
y = df['coo_name']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE
)

# Fit the scaler on the training split only, then apply it to the test split.
# NOTE(review): tree ensembles split on thresholds, so scaling is not expected
# to change the result; it is kept so X_train / X_test stay NumPy arrays.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

rf = RandomForestClassifier(random_state=RANDOM_STATE)
rf.fit(X_train, y_train)

y_pred = rf.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.01618262464147454
