### Plan of the notebook

1. Use kNN to find k most similar countries for North Korea

In [120]:
import pandas as pd
import numpy as  np

from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler

In [121]:
k_nearest_countries = 6
country_to_estimate = 'Korea, Dem. People’s Rep.'

In [122]:
data_raw = pd.read_csv('countries_data_preprocessed.csv')

In [123]:
data_raw

Unnamed: 0,name,longitude,latitude,East Asia & Pacific,Europe & Central Asia,Latin America & Caribbean,Middle East & North Africa,North America,South Asia,Sub-Saharan Africa,income_level
0,Andorra,1.5218,42.5075,0,1,0,0,0,0,0,4
1,United Arab Emirates,54.3705,24.4764,0,0,0,1,0,0,0,4
2,Afghanistan,69.1761,34.5228,0,0,0,0,0,1,0,1
3,Antigua and Barbuda,-61.8456,17.1175,0,0,1,0,0,0,0,4
4,Albania,19.8172,41.3317,0,1,0,0,0,0,0,3
...,...,...,...,...,...,...,...,...,...,...,...
209,Serbia,20.4656,44.8024,0,1,0,0,0,0,0,3
210,South Africa,28.1871,-25.7460,0,0,0,0,0,0,1,3
211,Zambia,28.2937,-15.3982,0,0,0,0,0,0,1,2
212,"Congo, Dem. Rep.",15.3222,-4.3250,0,0,0,0,0,0,1,1


In [124]:
conutry_to_estimate_index = data_raw[data_raw['name'] == country_to_estimate].index.asi8[0]

In [125]:
conutry_to_estimate_index

99

#### 1. Use kNN to find k most similar countries for North Korea

In [126]:
country_names = data_raw['name']

In [127]:
data = data_raw.drop(['name'], axis = 1)

In [128]:
data

Unnamed: 0,longitude,latitude,East Asia & Pacific,Europe & Central Asia,Latin America & Caribbean,Middle East & North Africa,North America,South Asia,Sub-Saharan Africa,income_level
0,1.5218,42.5075,0,1,0,0,0,0,0,4
1,54.3705,24.4764,0,0,0,1,0,0,0,4
2,69.1761,34.5228,0,0,0,0,0,1,0,1
3,-61.8456,17.1175,0,0,1,0,0,0,0,4
4,19.8172,41.3317,0,1,0,0,0,0,0,3
...,...,...,...,...,...,...,...,...,...,...
209,20.4656,44.8024,0,1,0,0,0,0,0,3
210,28.1871,-25.7460,0,0,0,0,0,0,1,3
211,28.2937,-15.3982,0,0,0,0,0,0,1,2
212,15.3222,-4.3250,0,0,0,0,0,0,1,1


In [129]:
standardizer = StandardScaler()

In [130]:
data_standardized = standardizer.fit_transform(data)

In [131]:
data_standardized

array([[-0.2563548 ,  0.9847195 , -0.46466019, ..., -0.18389243,
        -0.54494926,  1.0999858 ],
       [ 0.49844538,  0.2385692 , -0.46466019, ..., -0.18389243,
        -0.54494926,  1.0999858 ],
       [ 0.70990318,  0.65430226, -0.46466019, ...,  5.4379618 ,
        -0.54494926, -1.70235897],
       ...,
       [ 0.1260091 , -1.41149345, -0.46466019, ..., -0.18389243,
         1.83503323, -0.76824405],
       [-0.05925356, -0.95327008, -0.46466019, ..., -0.18389243,
         1.83503323, -1.70235897],
       [ 0.16562102, -1.51217415, -0.46466019, ..., -0.18389243,
         1.83503323, -0.76824405]])

In [132]:
data_standardized.shape

(214, 10)

In [133]:
# keep and remove the country we want to estimate for
record_of_country_to_estimate = data_standardized[conutry_to_estimate_index]
data_standardized_no_country_to_estimate = np.delete(data_standardized, conutry_to_estimate_index, 0)

In [134]:
record_of_country_to_estimate

array([ 1.51796486,  0.84089467,  2.15211035, -0.60254238, -0.47207748,
       -0.32986102, -0.1192393 , -0.18389243, -0.54494926, -1.70235897])

In [135]:
data_standardized_no_country_to_estimate.shape

(213, 10)

In [136]:
nearest_countries_result = NearestNeighbors(n_neighbors=k_nearest_countries).fit(data_standardized_no_country_to_estimate)

In [137]:
distances, indices = nearest_countries_result.kneighbors([record_of_country_to_estimate])

In [138]:
distances

array([[1.03916251, 1.22880003, 1.24718273, 1.30462482, 1.37961748,
        1.50142684]])

In [139]:
indices

array([[123, 203, 122, 103, 149,  95]])

In [140]:
nearest_countries_list = country_names[np.array(indices[0])]

In [141]:
nearest_countries_list

123                  Myanmar
203    Virgin Islands (U.S.)
122                     Mali
103               Kazakhstan
149         Papua New Guinea
95                  Cambodia
Name: name, dtype: object

In [142]:
nearest_countries_list.to_csv('nearest_countries.csv', index=False)