In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import export_graphviz
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

In [2]:
# read datasets 
month_lab = pd.read_csv('monthly-data-labelled.csv')
month_noLab = pd.read_csv('monthly-data-unlabelled.csv')

In [3]:
# saperate features and target in dfs: X, y 
X = month_lab.loc[:, month_lab.columns != 'city']
y = month_lab['city']
# split X & y values to training and validation sets
X_train, X_valid, y_train, y_valid = train_test_split(X, y)


In [4]:
# train and validate a ml model on lablled daatset 
model1 = make_pipeline(
    StandardScaler(),
    RandomForestClassifier(n_estimators=250, max_depth=10)
)
model1.fit(X_train, y_train)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('randomforestclassifier',
                 RandomForestClassifier(max_depth=10, n_estimators=250))])

In [5]:
# print model's val-score
print("Validation score:", model1.score(X_valid, y_valid).round(3))

Validation score: 0.738


In [6]:
# train and validate a ml model on lablled daatset 
model2 = make_pipeline(
    StandardScaler(),
    KNeighborsClassifier(n_neighbors=15)
)
model2.fit(X_train, y_train)
model2.score(X_valid, y_valid)

0.7137931034482758

In [7]:
# predict non-labelled cities from other dataset
temp = month_noLab[month_noLab['year'] == 2016]
X_test = temp.loc[:, temp.columns != 'city']
y_test = model1.predict(X_test)

In [8]:
y_test

array(['Miami', 'Vancouver', 'Denver', 'Seattle', 'Atlantic City',
       'Atlanta', 'New Orleans', 'Portland', 'San Francisco', 'Chicago',
       'Calgary', 'Los Angeles'], dtype=object)

In [9]:
# save to file 
result = pd.Series(y_test)
result.to_csv('labels.csv', index=False, header=False)

In [10]:
df = pd.DataFrame({'truth': y_valid, 'prediction': model1.predict(X_valid)})
print(df[df['truth'] != df['prediction']])

          truth     prediction
877   Saskatoon       Winnipeg
907   Saskatoon         Regina
670    Portland        Seattle
255      Denver  Atlantic City
1084   Victoria      Vancouver
...         ...            ...
887   Saskatoon         Regina
1053  Vancouver       Victoria
917     Seattle       Victoria
882   Saskatoon         Regina
1132   Winnipeg      Saskatoon

[76 rows x 2 columns]


In [14]:
df['prediction'].shape

(290,)