In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sys
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import make_pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Load the labelled dataset; its 'city' column is used as the classification
# target and every column except 'city' and 'year' as a feature.
data = pd.read_csv('monthly-data-labelled.csv')

In [3]:
# Feature matrix: every column except the label ('city') and 'year'.
# NOTE(review): an earlier draft sliced 'tmax-01':'snwd-12' instead --
# presumably the same set of columns; confirm against the CSV header.
X = (
    data
    .drop(columns=['city', 'year'])
    .values
)

In [4]:
# Target vector: the 'city' label of each labelled row, as a NumPy array.
y = data['city'].values

In [5]:
# Load the unlabelled rows; the fitted models are later used to predict a city for each.
unknown = pd.read_csv('monthly-data-unlabelled.csv')

In [6]:
# Hold out part of the labelled data for validation (split proportions are
# train_test_split's defaults). random_state is fixed so that the split -- and
# therefore every validation score computed from it -- is reproducible under
# Restart & Run All; without it each run shuffles differently.
# NOTE: outputs recorded in this notebook came from an unseeded run and may
# differ slightly after re-execution.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, random_state=42)

In [7]:
# k-nearest-neighbours classifier (k=4). Features are standardized first
# because KNN is distance-based and would otherwise be dominated by the
# columns with the largest numeric range.
knn_model = make_pipeline(
    StandardScaler(),
    KNeighborsClassifier(n_neighbors=4),
)

In [8]:
# Fit the scaler and the KNN classifier on the training split only.
# fit() returns the pipeline itself, which is what this cell displays.
knn_model.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('standardscaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('kneighborsclassifier',
                 KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                      metric='minkowski', metric_params=None,
                                      n_jobs=None, n_neighbors=4, p=2,
                                      weights='uniform'))],
         verbose=False)

In [9]:
# Evaluate on the held-out validation split (score() of an sklearn classifier
# is mean accuracy).
knn_model.score(X_valid, y_valid)

0.6137931034482759

In [10]:
# Feature matrix for the unlabelled rows, built the same way as X:
# drop 'city' and 'year' and keep everything else.
# NOTE(review): an earlier draft sliced 'tmax-01':'snwd-12' instead --
# presumably the same set of columns; confirm against the CSV header.
unknown_x = (
    unknown
    .drop(columns=['city', 'year'])
    .values
)

In [11]:
# Predict a city for every unlabelled row with the fitted KNN pipeline.
predictions = knn_model.predict(unknown_x)

In [12]:
# Display the KNN predictions (one label per unlabelled row).
predictions

array(['Miami', 'Vancouver', 'Denver', 'Seattle', 'Atlantic City',
       'Atlanta', 'New Orleans', 'Portland', 'San Francisco',
       'Atlantic City', 'Anchorage', 'Los Angeles'], dtype=object)

In [13]:
# Linear-kernel support vector classifier with regularization strength C=0.1,
# applied to standardized features.
svc_model = make_pipeline(
    StandardScaler(),
    SVC(kernel='linear', C=0.1),
)

In [14]:
# Fit the scaler and the SVC on the training split only.
# fit() returns the pipeline itself, which is what this cell displays.
svc_model.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('standardscaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('svc',
                 SVC(C=0.1, cache_size=200, class_weight=None, coef0=0.0,
                     decision_function_shape='ovr', degree=3,
                     gamma='auto_deprecated', kernel='linear', max_iter=-1,
                     probability=False, random_state=None, shrinking=True,
                     tol=0.001, verbose=False))],
         verbose=False)

In [15]:
# Evaluate the SVC pipeline on the held-out validation split (mean accuracy).
svc_model.score(X_valid, y_valid)

0.7793103448275862

In [16]:
# Predict a city for every unlabelled row with the fitted SVC pipeline.
predictions_svc = svc_model.predict(unknown_x)

In [17]:
# Display the SVC predictions (one label per unlabelled row).
predictions_svc

array(['Miami', 'Vancouver', 'Denver', 'Seattle', 'Atlantic City',
       'Raleigh Durham', 'New Orleans', 'Portland', 'San Francisco',
       'Chicago', 'Anchorage', 'Los Angeles'], dtype=object)

In [18]:
# Gaussian naive Bayes baseline on standardized features.
# Pipeline.fit() returns the pipeline, so construction and fitting are chained.
bayes_model = make_pipeline(StandardScaler(), GaussianNB()).fit(X_train, y_train)

# Last expression: validation score (mean accuracy) shown as the cell output.
bayes_model.score(X_valid, y_valid)

0.6551724137931034

In [19]:
# Random forest classifier: 400 trees, depth capped at 10, at least 5 samples
# per leaf. random_state pins the forest's internal randomness (bootstrap
# samples and per-split feature subsets) so that, for a given train/validation
# split, the score and predictions are repeatable across kernel restarts.
# StandardScaler is kept so this pipeline has the same shape as the other models.
rf_model = make_pipeline(
    StandardScaler(),
    RandomForestClassifier(
        n_estimators=400,
        max_depth=10,
        min_samples_leaf=5,
        random_state=42,
    ),
)
rf_model.fit(X_train, y_train)

# Last expression: validation score (mean accuracy) shown as the cell output.
rf_model.score(X_valid, y_valid)

0.7344827586206897

In [20]:
# Predict a city for every unlabelled row with the fitted random-forest pipeline.
predictions_rf = rf_model.predict(unknown_x)

In [21]:
# Display the random-forest predictions (one label per unlabelled row).
predictions_rf

array(['Miami', 'Vancouver', 'Denver', 'Seattle', 'Atlantic City',
       'Raleigh Durham', 'New Orleans', 'Portland', 'San Francisco',
       'Chicago', 'Calgary', 'Los Angeles'], dtype=object)