In [1]:
import pandas as pd
import matplotlib as plt
import scipy.stats as stats
import numpy as np
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer

In [2]:
# Load the raw Australian weather observations and preview the first rows.
weather_csv = 'data/weatherAUS.csv'
df = pd.read_csv(weather_csv)
df.head()

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,2008-12-01,Albury,13.4,22.9,0.6,,,W,44.0,W,...,71.0,22.0,1007.7,1007.1,8.0,,16.9,21.8,No,No
1,2008-12-02,Albury,7.4,25.1,0.0,,,WNW,44.0,NNW,...,44.0,25.0,1010.6,1007.8,,,17.2,24.3,No,No
2,2008-12-03,Albury,12.9,25.7,0.0,,,WSW,46.0,W,...,38.0,30.0,1007.6,1008.7,,2.0,21.0,23.2,No,No
3,2008-12-04,Albury,9.2,28.0,0.0,,,NE,24.0,SE,...,45.0,16.0,1017.6,1012.8,,,18.1,26.5,No,No
4,2008-12-05,Albury,17.5,32.3,1.0,,,W,41.0,ENE,...,82.0,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,No,No


In [3]:
# Parse the Date strings once, then store both the parsed dates and the
# calendar month derived from them.
parsed_dates = pd.to_datetime(df["Date"])
df["Date"] = parsed_dates
df["Month"] = parsed_dates.dt.month

In [4]:
# Rows without a RainTomorrow label cannot be used as targets; discard them.
df = df.dropna(subset=['RainTomorrow'])

In [5]:
# Remove Evaporation, Date and Sunshine from the working frame
# (the Month column was already derived from Date).
df = df.drop(columns=['Evaporation', 'Date', 'Sunshine'])

In [6]:
# Preview the first rows after the Evaporation, Date and Sunshine columns were dropped.
df.head()

Unnamed: 0,Location,MinTemp,MaxTemp,Rainfall,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,WindSpeed9am,WindSpeed3pm,...,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow,Month
0,Albury,13.4,22.9,0.6,W,44.0,W,WNW,20.0,24.0,...,22.0,1007.7,1007.1,8.0,,16.9,21.8,No,No,12
1,Albury,7.4,25.1,0.0,WNW,44.0,NNW,WSW,4.0,22.0,...,25.0,1010.6,1007.8,,,17.2,24.3,No,No,12
2,Albury,12.9,25.7,0.0,WSW,46.0,W,WSW,19.0,26.0,...,30.0,1007.6,1008.7,,2.0,21.0,23.2,No,No,12
3,Albury,9.2,28.0,0.0,NE,24.0,SE,E,11.0,9.0,...,16.0,1017.6,1012.8,,,18.1,26.5,No,No,12
4,Albury,17.5,32.3,1.0,W,41.0,ENE,NW,7.0,20.0,...,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,No,No,12


In [7]:
# Show per-column dtypes and non-null counts for the remaining columns.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 142193 entries, 0 to 145458
Data columns (total 21 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   Location       142193 non-null  object 
 1   MinTemp        141556 non-null  float64
 2   MaxTemp        141871 non-null  float64
 3   Rainfall       140787 non-null  float64
 4   WindGustDir    132863 non-null  object 
 5   WindGustSpeed  132923 non-null  float64
 6   WindDir9am     132180 non-null  object 
 7   WindDir3pm     138415 non-null  object 
 8   WindSpeed9am   140845 non-null  float64
 9   WindSpeed3pm   139563 non-null  float64
 10  Humidity9am    140419 non-null  float64
 11  Humidity3pm    138583 non-null  float64
 12  Pressure9am    128179 non-null  float64
 13  Pressure3pm    128212 non-null  float64
 14  Cloud9am       88536 non-null   float64
 15  Cloud3pm       85099 non-null   float64
 16  Temp9am        141289 non-null  float64
 17  Temp3pm        139467 non-nul

In [8]:
# Independent copy of df; the filtering cells that follow operate on df2.
df2 = df.copy()

In [9]:
# Numeric columns whose upper tail is trimmed at the 99th percentile.
features = ['MinTemp', 'MaxTemp', 'Rainfall', 'WindGustSpeed', 'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am',
            'Humidity3pm', 'Pressure9am', 'Pressure3pm', 'Temp9am', 'Temp3pm']
# Columns are filtered one after another, so each cutoff is computed on the
# rows that survived the filters of the preceding columns.
for feature in features:
    column_values = df2[feature]
    qupper = column_values.quantile(0.99)
    # Keep missing entries (they are imputed in later cells) and values
    # strictly below the cutoff; rows equal to the cutoff are dropped too.
    keep_mask = column_values.isna() | column_values.lt(qupper)
    df2 = df2.loc[keep_mask]

In [10]:
# Count the missing values per column after the quantile filtering.
df2.isna().sum()

Location             0
MinTemp            581
MaxTemp            295
Rainfall          1239
WindGustDir       8494
WindGustSpeed     8442
WindDir9am        8591
WindDir3pm        3372
WindSpeed9am      1241
WindSpeed3pm      2430
Humidity9am       1570
Humidity3pm       3118
Pressure9am      12545
Pressure3pm      12518
Cloud9am         46856
Cloud3pm         49824
Temp9am            796
Temp3pm           2321
RainToday         1239
RainTomorrow         0
Month                0
dtype: int64

In [11]:
# Separate the RainTomorrow label (y) from the predictor columns (X).
target_column = 'RainTomorrow'
y = df2[target_column]
X = df2.drop(columns=[target_column])

In [12]:
# train_test_split returns the partitions grouped per input array, i.e.
# (X_train, X_test, y_train, y_test). Unpack in that order so the feature
# and label partitions are not swapped with each other.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [13]:
# Categorical columns whose gaps are filled with the most frequent training value.
cols_simp = ['WindGustDir', 'WindDir9am', 'WindDir3pm', 'RainToday']
simputer = SimpleImputer(strategy='most_frequent')
# Learn the per-column modes on the training rows, then fill the same rows.
simputer.fit(X_train[cols_simp])
simputed = simputer.transform(X_train[cols_simp])
# Wrap the returned array in a frame aligned to X_train's index so it can be
# joined back column-wise.
df_simp_tranformed = pd.DataFrame(data=simputed, columns=cols_simp, index=X_train.index)
df_simp_tranformed

Unnamed: 0,WindGustDir,WindDir9am,WindDir3pm,RainToday
2867,NNW,SE,NE,No
110045,W,SW,SE,No
55289,N,NE,NNE,No
39713,W,WNW,WNW,No
114443,S,ESE,SSE,No
...,...,...,...,...
139707,WNW,WSW,WNW,No
121459,SE,SE,SE,No
940,E,NW,ESE,No
18212,S,SW,SSE,No


In [14]:
# Remove the original (un-imputed) categorical columns from the training set.
# Rebind instead of dropping in place: X_train comes out of train_test_split,
# and mutating it in place emits pandas' SettingWithCopyWarning.
X_train = X_train.drop(columns=cols_simp)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [15]:
# Append the imputed categorical columns to the remaining training columns
# (the two frames share X_train's index).
X_train_imp = pd.concat([X_train, df_simp_tranformed], axis="columns")

In [51]:
# Display the training features with the imputed categorical columns appended.
X_train_imp

Unnamed: 0,Location,MinTemp,MaxTemp,Rainfall,WindGustSpeed,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,Month,WindGustDir,WindDir9am,WindDir3pm,RainToday
2867,Albury,16.2,33.0,0.0,26.0,13.0,9.0,53.0,29.0,1018.4,1015.4,,,23.9,31.2,1,NNW,SE,NE,No
110045,Albany,14.3,22.0,0.0,,6.0,13.0,71.0,61.0,1025.4,1021.9,7.0,2.0,17.5,20.5,3,W,SW,SE,No
55289,Ballarat,2.0,9.9,0.0,33.0,11.0,13.0,91.0,94.0,1013.0,1009.2,7.0,8.0,6.6,8.1,6,N,NE,NNE,No
39713,Williamtown,5.4,17.7,0.0,48.0,15.0,28.0,63.0,43.0,1015.6,1012.0,6.0,,11.0,17.3,6,W,WNW,WNW,No
114443,Witchcliffe,12.5,23.4,0.2,43.0,20.0,26.0,59.0,48.0,1017.8,1017.1,,,19.0,22.5,12,S,ESE,SSE,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
139707,Darwin,25.4,33.1,0.0,33.0,4.0,22.0,78.0,63.0,1010.2,1006.9,7.0,5.0,29.9,32.7,2,WNW,WSW,WNW,No
121459,Perth,18.2,25.0,0.0,33.0,9.0,9.0,41.0,34.0,1011.1,1009.6,7.0,7.0,19.2,24.4,9,SE,SE,SE,No
940,Albury,-1.4,14.7,0.0,17.0,2.0,9.0,92.0,52.0,1028.9,1024.7,,,3.6,14.1,7,E,NW,ESE,No
18212,NorahHead,14.3,20.8,0.0,44.0,19.0,31.0,70.0,75.0,1026.5,1024.6,,,18.6,19.5,4,S,SW,SSE,No


In [52]:
# Columns that are expanded into one indicator column per category.
cols_encode = ['Location', 'WindGustDir', 'WindDir9am', 'WindDir3pm', 'RainToday', 'Month']
# Newer scikit-learn releases name the dense-output switch `sparse_output`;
# older ones only know `sparse`. Try the current keyword first and fall back
# so the cell runs on either.
try:
    ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
except TypeError:
    ohe = OneHotEncoder(sparse=False, handle_unknown='ignore')
ohe_fit = ohe.fit_transform(X_train_imp[cols_encode])
# Same story for the column names: `get_feature_names_out` replaces the
# older `get_feature_names`, which is absent from newer releases.
if hasattr(ohe, 'get_feature_names_out'):
    ohe_columns = ohe.get_feature_names_out(cols_encode)
else:
    ohe_columns = ohe.get_feature_names(cols_encode)
# Re-attach the training index so the indicators can be joined back by row.
ohe_df = pd.DataFrame(ohe_fit, columns=ohe_columns, index=X_train_imp.index)
ohe_df

Unnamed: 0,Location_Adelaide,Location_Albany,Location_Albury,Location_AliceSprings,Location_BadgerysCreek,Location_Ballarat,Location_Bendigo,Location_Brisbane,Location_Cairns,Location_Canberra,...,Month_3,Month_4,Month_5,Month_6,Month_7,Month_8,Month_9,Month_10,Month_11,Month_12
2867,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
110045,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
55289,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
39713,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
114443,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
139707,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
121459,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
940,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
18212,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
# Keep only the columns that were not one-hot encoded.
X_train_int = X_train_imp.drop(columns=cols_encode)

In [19]:
# Place the one-hot indicator columns next to the remaining training columns.
X_train_ohe = pd.concat([X_train_int, ohe_df], axis="columns")
X_train_ohe

Unnamed: 0,MinTemp,MaxTemp,Rainfall,WindGustSpeed,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,...,Month_3,Month_4,Month_5,Month_6,Month_7,Month_8,Month_9,Month_10,Month_11,Month_12
2867,16.2,33.0,0.0,26.0,13.0,9.0,53.0,29.0,1018.4,1015.4,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
110045,14.3,22.0,0.0,,6.0,13.0,71.0,61.0,1025.4,1021.9,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
55289,2.0,9.9,0.0,33.0,11.0,13.0,91.0,94.0,1013.0,1009.2,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
39713,5.4,17.7,0.0,48.0,15.0,28.0,63.0,43.0,1015.6,1012.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
114443,12.5,23.4,0.2,43.0,20.0,26.0,59.0,48.0,1017.8,1017.1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
139707,25.4,33.1,0.0,33.0,4.0,22.0,78.0,63.0,1010.2,1006.9,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
121459,18.2,25.0,0.0,33.0,9.0,9.0,41.0,34.0,1011.1,1009.6,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
940,-1.4,14.7,0.0,17.0,2.0,9.0,92.0,52.0,1028.9,1024.7,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
18212,14.3,20.8,0.0,44.0,19.0,31.0,70.0,75.0,1026.5,1024.6,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [20]:
# Fill the remaining gaps from the 5 nearest training rows, weighted equally.
# NOTE(review): no scaling step is visible before this cell, so neighbour
# distances are computed on the columns' raw units; confirm this is intended.
imputer = KNNImputer(weights="uniform", n_neighbors=5)
imputed = imputer.fit_transform(X_train_ohe)
# Restore the row index and column labels around the returned array.
X_train_KNNI = pd.DataFrame(
    data=imputed,
    columns=X_train_ohe.columns,
    index=X_train_ohe.index,
)
X_train_KNNI.head()

Unnamed: 0,MinTemp,MaxTemp,Rainfall,WindGustSpeed,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,...,Month_3,Month_4,Month_5,Month_6,Month_7,Month_8,Month_9,Month_10,Month_11,Month_12
2867,16.2,33.0,0.0,26.0,13.0,9.0,53.0,29.0,1018.4,1015.4,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
110045,14.3,22.0,0.0,40.8,6.0,13.0,71.0,61.0,1025.4,1021.9,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
55289,2.0,9.9,0.0,33.0,11.0,13.0,91.0,94.0,1013.0,1009.2,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
39713,5.4,17.7,0.0,48.0,15.0,28.0,63.0,43.0,1015.6,1012.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
114443,12.5,23.4,0.2,43.0,20.0,26.0,59.0,48.0,1017.8,1017.1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [21]:
# Summarise the size and dtypes of the KNN-imputed training frame.
X_train_KNNI.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 93008 entries, 2867 to 142484
Columns: 125 entries, MinTemp to Month_12
dtypes: float64(125)
memory usage: 89.4 MB


In [48]:
# Frequency of each Cloud9am value in the KNN-imputed training frame.
X_train_KNNI['Cloud9am'].value_counts()

1.0    18799
7.0    14037
2.0    11470
3.0     9498
8.0     9445
6.0     7910
4.0     7335
0.0     7281
5.0     7231
9.0        2
Name: Cloud9am, dtype: int64

In [40]:
# Snap Cloud9am to the nearest whole number in the imputed training frame.
X_train_KNNI['Cloud9am'] = X_train_KNNI['Cloud9am'].round()

In [46]:
# Frequency of each Cloud9am value, placed after the rounding cell to check its result.
X_train_KNNI['Cloud9am'].value_counts()

1.0    18799
7.0    14037
2.0    11470
3.0     9498
8.0     9445
6.0     7910
4.0     7335
0.0     7281
5.0     7231
9.0        2
Name: Cloud9am, dtype: int64