## Импорт библиотек

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
%matplotlib inline 
sns.set(style="ticks")

import country_converter as coco

## Загрузка данных

In [2]:
original_dataset = pd.read_csv('../worldcitiespop.csv', sep=",", low_memory=False)

## Кодирование категориальных признаков

In [3]:
enc = LabelEncoder()
original_dataset["Country_encoded"] = enc.fit_transform(original_dataset["Country"])

## Обработка пропусков данных

In [4]:
original_dataset.isnull().sum()

Country                  0
City                     6
AccentCity               0
Region                   8
Population         3125978
Latitude                 0
Longitude                0
Country_encoded          0
dtype: int64

In [5]:
cleansed_dataset = original_dataset.drop("Population", axis=1).dropna()
cleansed_dataset.isnull().sum()

Country            0
City               0
AccentCity         0
Region             0
Latitude           0
Longitude          0
Country_encoded    0
dtype: int64

## Уменьшение размера выборки

In [6]:
reduced_dataset = cleansed_dataset.sample(n=1000, random_state=10)
reduced_dataset.shape

(1000, 7)

## Разбиение выборки на обучающую и тестовою

In [7]:
X = reduced_dataset[["Latitude", "Longitude"]]
Y = reduced_dataset["Country_encoded"]
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=10)
X_train.shape, X_test.shape, Y_train.shape, Y_test.shape

((700, 2), (300, 2), (700,), (300,))

## Использование библиотеки TPOT для автоматического подбора модели

In [12]:
from tpot import TPOTClassifier
pipeline_optimizer = TPOTClassifier(generations=10, population_size=20, cv=3, random_state=10, verbosity=2)

In [13]:
%%time
pipeline_optimizer.fit(X_train, Y_train)


Generation 1 - Current best internal CV score: 0.7471112578408716

Generation 2 - Current best internal CV score: 0.7471112578408716

Generation 3 - Current best internal CV score: 0.7599867943215584

Generation 4 - Current best internal CV score: 0.7599867943215584

Generation 5 - Current best internal CV score: 0.7628480246505998

Generation 6 - Current best internal CV score: 0.7628480246505998

Generation 7 - Current best internal CV score: 0.7857134123228544

Generation 8 - Current best internal CV score: 0.7857134123228544

Generation 9 - Current best internal CV score: 0.7857134123228544

Generation 10 - Current best internal CV score: 0.7857134123228544
                                                                                
Best pipeline: KNeighborsClassifier(input_matrix, n_neighbors=14, p=1, weights=distance)
Wall time: 5h 56min 34s


TPOTClassifier(cv=3, generations=10, population_size=20, random_state=10,
               verbosity=2)

In [22]:
from sklearn.neighbors import KNeighborsClassifier
knc = KNeighborsClassifier(n_neighbors=14, p=1, weights='distance')
knc.fit(X_train, Y_train)

KNeighborsClassifier(n_neighbors=14, p=1, weights='distance')

In [27]:
Y_pred = knc.predict(X_test)
precision = precision_score(Y_pred, Y_test, average="weighted", zero_division=0)
recall = recall_score(Y_pred, Y_test, average="weighted", zero_division=0)
f1 = f1_score(Y_pred, Y_test, average="weighted")
precision, recall, f1

(0.8610216450216451, 0.7666666666666667, 0.7972308903602368)