In [7]:
import random
import pandas as pd
import numpy as np
import scipy.stats as ss
import seaborn as sns
import plotly.graph_objs as go
from plotly.offline import plot
from feature_engine.encoding import OrdinalEncoder
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold, cross_val_score

In [2]:
data = pd.read_pickle('../data/aus_weather_cln.pkl')
data

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,2008-12-01,Albury,13.4,22.900000,0.6,1.4,9.5,0.073518,44.0,0.062725,...,71.0,22.0,1007.7,1007.1,8.0,4.0,16.9,21.8,No,No
1,2008-12-02,Albury,7.4,25.100000,0.0,0.8,0.3,0.060821,44.0,0.059164,...,44.0,25.0,1010.6,1007.8,7.0,0.0,17.2,24.3,No,No
2,2008-12-03,Albury,12.9,25.700000,0.0,6.2,4.7,0.067063,46.0,0.062725,...,38.0,30.0,1007.6,1008.7,0.0,2.0,21.0,23.2,No,No
3,2008-12-04,Albury,9.2,28.000000,0.0,3.8,3.0,0.052729,24.0,0.068892,...,45.0,16.0,1017.6,1012.8,4.0,3.0,18.1,26.5,No,No
4,2008-12-05,Albury,17.5,32.300000,1.0,3.6,11.6,0.073518,41.0,0.058243,...,82.0,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,No,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
145455,2017-06-21,Uluru,2.8,23.400000,0.0,5.0,10.8,0.068074,31.0,0.068892,...,51.0,24.0,1024.6,1020.3,8.0,7.0,10.1,22.4,No,No
145456,2017-06-22,Uluru,3.6,25.300000,0.0,5.4,5.6,0.048900,22.0,0.068892,...,56.0,21.0,1023.5,1019.1,4.0,7.0,10.9,24.5,No,No
145457,2017-06-23,Uluru,5.4,26.900000,0.0,3.2,0.0,0.068747,37.0,0.068892,...,53.0,24.0,1021.0,1016.8,4.0,0.0,12.5,26.1,No,No
145458,2017-06-24,Uluru,7.8,27.000000,0.0,1.6,6.6,0.069847,28.0,0.067737,...,51.0,24.0,1019.4,1016.5,3.0,2.0,15.1,26.0,No,No


In [3]:
data['Date'] = pd.to_datetime(data['Date'])

In [4]:
data['RainToday'] = data['RainToday'].apply(lambda x: 1 if x == 'Yes' else 0)
data['RainTomorrow'] = data['RainTomorrow'].apply(lambda x: 1 if x == 'Yes' else 0)
data[['RainToday', 'RainTomorrow']]

Unnamed: 0,RainToday,RainTomorrow
0,0,0
1,0,0
2,0,0
3,0,0
4,0,0
...,...,...
145455,0,0
145456,0,0
145457,0,0
145458,0,0


In [5]:
data['Year'] = data['Date'].dt.year
data['Month'] = data['Date'].dt.month
data['Quarter'] = data['Date'].dt.quarter
data

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow,Year,Month,Quarter
0,2008-12-01,Albury,13.4,22.900000,0.6,1.4,9.5,0.073518,44.0,0.062725,...,1007.1,8.0,4.0,16.9,21.8,0,0,2008,12,4
1,2008-12-02,Albury,7.4,25.100000,0.0,0.8,0.3,0.060821,44.0,0.059164,...,1007.8,7.0,0.0,17.2,24.3,0,0,2008,12,4
2,2008-12-03,Albury,12.9,25.700000,0.0,6.2,4.7,0.067063,46.0,0.062725,...,1008.7,0.0,2.0,21.0,23.2,0,0,2008,12,4
3,2008-12-04,Albury,9.2,28.000000,0.0,3.8,3.0,0.052729,24.0,0.068892,...,1012.8,4.0,3.0,18.1,26.5,0,0,2008,12,4
4,2008-12-05,Albury,17.5,32.300000,1.0,3.6,11.6,0.073518,41.0,0.058243,...,1006.0,7.0,8.0,17.8,29.7,0,0,2008,12,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
145455,2017-06-21,Uluru,2.8,23.400000,0.0,5.0,10.8,0.068074,31.0,0.068892,...,1020.3,8.0,7.0,10.1,22.4,0,0,2017,6,2
145456,2017-06-22,Uluru,3.6,25.300000,0.0,5.4,5.6,0.048900,22.0,0.068892,...,1019.1,4.0,7.0,10.9,24.5,0,0,2017,6,2
145457,2017-06-23,Uluru,5.4,26.900000,0.0,3.2,0.0,0.068747,37.0,0.068892,...,1016.8,4.0,0.0,12.5,26.1,0,0,2017,6,2
145458,2017-06-24,Uluru,7.8,27.000000,0.0,1.6,6.6,0.069847,28.0,0.067737,...,1016.5,3.0,2.0,15.1,26.0,0,0,2017,6,2


In [6]:
data.set_index('Date', inplace=True)
data

Unnamed: 0_level_0,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,...,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow,Year,Month,Quarter
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2008-12-01,Albury,13.4,22.900000,0.6,1.4,9.5,0.073518,44.0,0.062725,0.062739,...,1007.1,8.0,4.0,16.9,21.8,0,0,2008,12,4
2008-12-02,Albury,7.4,25.100000,0.0,0.8,0.3,0.060821,44.0,0.059164,0.067469,...,1007.8,7.0,0.0,17.2,24.3,0,0,2008,12,4
2008-12-03,Albury,12.9,25.700000,0.0,6.2,4.7,0.067063,46.0,0.062725,0.067469,...,1008.7,0.0,2.0,21.0,23.2,0,0,2008,12,4
2008-12-04,Albury,9.2,28.000000,0.0,3.8,3.0,0.052729,24.0,0.068892,0.059989,...,1012.8,4.0,3.0,18.1,26.5,0,0,2008,12,4
2008-12-05,Albury,17.5,32.300000,1.0,3.6,11.6,0.073518,41.0,0.058243,0.060876,...,1006.0,7.0,8.0,17.8,29.7,0,0,2008,12,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2017-06-21,Uluru,2.8,23.400000,0.0,5.0,10.8,0.068074,31.0,0.068892,0.055685,...,1020.3,8.0,7.0,10.1,22.4,0,0,2017,6,2
2017-06-22,Uluru,3.6,25.300000,0.0,5.4,5.6,0.048900,22.0,0.068892,0.062849,...,1019.1,4.0,7.0,10.9,24.5,0,0,2017,6,2
2017-06-23,Uluru,5.4,26.900000,0.0,3.2,0.0,0.068747,37.0,0.068892,0.062739,...,1016.8,4.0,0.0,12.5,26.1,0,0,2017,6,2
2017-06-24,Uluru,7.8,27.000000,0.0,1.6,6.6,0.069847,28.0,0.067737,0.062849,...,1016.5,3.0,2.0,15.1,26.0,0,0,2017,6,2


In [8]:
location_encoder = OrdinalEncoder(encoding_method='arbitrary',
                                  variables=['Location'])
location_encoder.fit(data)
# location_encoder.encoder_dict_
data = location_encoder.transform(data)
data

Unnamed: 0_level_0,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,...,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow,Year,Month,Quarter
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2008-12-01,0,13.4,22.900000,0.6,1.4,9.5,0.073518,44.0,0.062725,0.062739,...,1007.1,8.0,4.0,16.9,21.8,0,0,2008,12,4
2008-12-02,0,7.4,25.100000,0.0,0.8,0.3,0.060821,44.0,0.059164,0.067469,...,1007.8,7.0,0.0,17.2,24.3,0,0,2008,12,4
2008-12-03,0,12.9,25.700000,0.0,6.2,4.7,0.067063,46.0,0.062725,0.067469,...,1008.7,0.0,2.0,21.0,23.2,0,0,2008,12,4
2008-12-04,0,9.2,28.000000,0.0,3.8,3.0,0.052729,24.0,0.068892,0.059989,...,1012.8,4.0,3.0,18.1,26.5,0,0,2008,12,4
2008-12-05,0,17.5,32.300000,1.0,3.6,11.6,0.073518,41.0,0.058243,0.060876,...,1006.0,7.0,8.0,17.8,29.7,0,0,2008,12,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2017-06-21,48,2.8,23.400000,0.0,5.0,10.8,0.068074,31.0,0.068892,0.055685,...,1020.3,8.0,7.0,10.1,22.4,0,0,2017,6,2
2017-06-22,48,3.6,25.300000,0.0,5.4,5.6,0.048900,22.0,0.068892,0.062849,...,1019.1,4.0,7.0,10.9,24.5,0,0,2017,6,2
2017-06-23,48,5.4,26.900000,0.0,3.2,0.0,0.068747,37.0,0.068892,0.062739,...,1016.8,4.0,0.0,12.5,26.1,0,0,2017,6,2
2017-06-24,48,7.8,27.000000,0.0,1.6,6.6,0.069847,28.0,0.067737,0.062849,...,1016.5,3.0,2.0,15.1,26.0,0,0,2017,6,2


### regression feature selection

In [9]:
temp = data.drop('RainTomorrow', axis=1)
y = data['RainTomorrow']
features = SelectKBest(score_func=f_regression, k=10)
selected_features = features.fit_transform(temp, y)
selected_features

array([[ 0.6,  9.5, 44. , ...,  8. ,  4. ,  0. ],
       [ 0. ,  0.3, 44. , ...,  7. ,  0. ,  0. ],
       [ 0. ,  4.7, 46. , ...,  0. ,  2. ,  0. ],
       ...,
       [ 0. ,  0. , 37. , ...,  4. ,  0. ,  0. ],
       [ 0. ,  6.6, 28. , ...,  3. ,  2. ,  0. ],
       [ 0. , 10.9, 54. , ...,  8. ,  8. ,  0. ]])

In [10]:
input_feature = ['Location', 'Rainfall', 'Sunshine', 'WindGustSpeed', 'Humidity9am', 'Humidity3pm',
                 'Pressure9am', 'Pressure3pm', 'Cloud9am', 'Cloud3pm', 'RainToday']
train_x, test_x, train_y, test_y = train_test_split(temp[input_feature], y, test_size=0.2, random_state=64)
train_x.shape, test_x.shape

((116368, 11), (29092, 11))

### model building

In [11]:
# baseline model - n_neighbors = 3
model_b = KNN(n_neighbors=3)
model_b.fit(train_x,train_y)

KNeighborsClassifier(n_neighbors=3)

In [13]:
# score with train set
model_b.score(train_x,train_y)

0.8911900178743297

In [14]:
# score with test set
model_b.score(test_x,test_y)

0.8158600302488657