In [51]:
# Fraction of missing values per numerical column of the training set.
# The per-column null share is computed once, then only columns that
# actually contain missing values are printed (rounded to 4 decimals).
missing_share = X_train[numerical].isnull().mean()
for col, share in missing_share[missing_share > 0].items():
    print(col, round(share, 4))

MinTemp 0.0175
MaxTemp 0.0175
Rainfall 0.0305
Evaporation 0.5694
Sunshine 0.6101
WindGustSpeed 0.0735
WindSpeed9am 0.0209
WindSpeed3pm 0.038
Humidity9am 0.0232
Humidity3pm 0.0423
Pressure9am 0.1109
Pressure3pm 0.1108
Cloud9am 0.4527
Cloud3pm 0.4759
Temp9am 0.0175
Temp3pm 0.037


# Assumption

I assume that the data are missing completely at random (MCAR). Two common methods for imputing missing values are mean/median imputation and random sample imputation. Because this dataset contains outliers, I will use median imputation: unlike the mean, the median is robust to outliers.

I will impute missing values with an appropriate statistic of the data, in this case the median. The statistic must be computed on the training set only and then applied to both the training set and the test set. This prevents information from the test set from leaking into the training process (data leakage), which would make the evaluation over-optimistic.


In [52]:
# Median imputation of the numerical columns.
# The medians are taken from the training set only and reused for the test
# set, so no information from the test set leaks into the imputation.
train_medians = X_train[numerical].median()

for df1 in [X_train, X_test]:
    # Assign the filled columns back explicitly. Calling
    # `df1[col].fillna(..., inplace=True)` operates on a column selection
    # (chained assignment): pandas 2.x warns about it and under
    # Copy-on-Write it no longer updates `df1` at all.
    df1[numerical] = df1[numerical].fillna(train_medians)

In [53]:
# Sanity check: remaining missing values per numerical column (training set).
X_train[numerical].isna().sum()

MinTemp          0
MaxTemp          0
Rainfall         0
Evaporation      0
Sunshine         0
WindGustSpeed    0
WindSpeed9am     0
WindSpeed3pm     0
Humidity9am      0
Humidity3pm      0
Pressure9am      0
Pressure3pm      0
Cloud9am         0
Cloud3pm         0
Temp9am          0
Temp3pm          0
Year             0
dtype: int64

In [54]:
# Sanity check: remaining missing values per numerical column (test set).
X_test[numerical].isna().sum()

MinTemp          0
MaxTemp          0
Rainfall         0
Evaporation      0
Sunshine         0
WindGustSpeed    0
WindSpeed9am     0
WindSpeed3pm     0
Humidity9am      0
Humidity3pm      0
Pressure9am      0
Pressure3pm      0
Cloud9am         0
Cloud3pm         0
Temp9am          0
Temp3pm          0
Year             0
dtype: int64

# Engineering outliers in numerical variables

In [55]:
def max_value(
    df3,
    variable,
    top
) -> np.ndarray:
    """Cap one column at an upper bound (top-coding).

    Parameters
    ----------
    df3 : pandas.DataFrame
        Frame that contains the column to cap. It is not modified.
    variable : str
        Label of the column to cap.
    top : float
        Upper bound; every value strictly greater than ``top`` is replaced
        by ``top``.

    Returns
    -------
    numpy.ndarray
        Array with one entry per row of ``df3``. Entries for which
        ``value > top`` is false (this includes NaN) are passed through
        unchanged.
    """
    return np.where(df3[variable] > top, top, df3[variable])


# Upper caps applied to the outlier-heavy columns, in the order they are
# processed. NOTE(review): the thresholds are hard-coded here; presumably
# they come from the outlier analysis earlier in the notebook - confirm.
outlier_caps = {
    'Rainfall': 3.2,
    'Evaporation': 21.8,
    'WindSpeed9am': 55,
    'WindSpeed3pm': 57,
}

# The same caps are applied to the training and the test set.
for df3 in [X_train, X_test]:
    for column, cap in outlier_caps.items():
        df3[column] = max_value(df3, column, cap)

In [56]:
X_train.Rainfall.max(), X_test.Rainfall.max()

(np.float64(3.2), np.float64(3.2))

In [57]:
X_train.Evaporation.max(), X_test.Evaporation.max()

(np.float64(21.8), np.float64(21.8))

In [58]:
# NOTE(review): this cell repeats the Evaporation check of the previous cell
# verbatim; it can be dropped once the notebook is re-run.
X_train['Evaporation'].max(), X_test['Evaporation'].max()

(np.float64(21.8), np.float64(21.8))

In [59]:
X_train.WindSpeed9am.max(), X_test.WindSpeed9am.max()


(np.float64(55.0), np.float64(55.0))

In [60]:
X_train.WindSpeed3pm.max(), X_test.WindSpeed3pm.max()

(np.float64(57.0), np.float64(57.0))

In [61]:


X_train[numerical].describe()



Unnamed: 0,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustSpeed,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,Year
count,207926.0,207926.0,207926.0,207926.0,207926.0,207926.0,207926.0,207926.0,207926.0,207926.0,207926.0,207926.0,207926.0,207926.0,207926.0,207926.0,207926.0
mean,12.111055,23.256256,0.654489,5.074938,8.160694,39.897247,14.004213,18.465646,69.103066,51.486514,1017.701934,1015.275418,5.313126,4.848499,16.950539,21.730975,2016.639949
std,6.370066,7.058832,1.171529,2.429626,2.396442,12.981801,8.725112,8.481789,19.152669,20.551627,6.678939,6.63599,2.232251,2.006459,6.468363,6.852871,4.828522
min,-8.7,-4.1,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,978.6,972.0,0.0,0.0,-6.0,-5.1,2007.0
25%,7.6,18.1,0.0,4.8,8.5,31.0,7.0,13.0,57.0,37.0,1013.6,1011.1,5.0,5.0,12.3,16.9,2012.0
50%,11.9,22.7,0.0,4.8,8.5,39.0,13.0,17.0,70.0,52.0,1017.6,1015.2,6.0,5.0,16.7,21.2,2017.0
75%,16.7,28.1,0.6,4.8,8.5,46.0,19.0,24.0,83.0,65.0,1021.8,1019.4,6.0,6.0,21.4,26.2,2021.0
max,33.9,48.9,3.2,21.8,14.4,154.0,55.0,57.0,100.0,100.0,1042.5,1040.3,8.0,9.0,39.4,48.2,2025.0


We can now see that the outliers in Rainfall, Evaporation, WindSpeed9am and WindSpeed3pm columns are capped.

# Encode categorical variables

In [62]:
categorical

['Location', 'WindGustDir', 'WindDir9am', 'WindDir3pm', 'RainToday']

In [63]:
X_train[categorical].head()

Unnamed: 0,Location,WindGustDir,WindDir9am,WindDir3pm,RainToday
132639,Nhil,ENE,NE,SSE,Yes
163451,GoldCoast,,SE,E,No
60503,SydneyAirport,SSE,SSE,SSW,Yes
101669,Ballarat,NW,S,NW,No
248628,Darwin,NW,ENE,N,No


In [64]:
# NOTE(review): this import sits in the middle of the notebook; consider
# moving it to the import cell at the top so dependencies are visible upfront.
import category_encoders as ce

# Binary-encode the RainToday column only; all other columns are passed through.
# NOTE(review): in the describe() output further down, the means of
# RainToday_0 and RainToday_1 add up to more than 1, which suggests a third
# category (presumably missing values) is being encoded - confirm whether
# RainToday should be imputed before this step.
encoder =  ce.BinaryEncoder(cols=['RainToday'])

# Fit the encoder on the training set only ...
X_train = encoder.fit_transform(X_train)

# ... and reuse the fitted mapping for the test set.
X_test = encoder.transform(X_test)
 

In [65]:
print(f'This is a encode category {X_train[:4]}')

This is a encode category              Location  MinTemp  MaxTemp  Rainfall  Evaporation  Sunshine  \
132639           Nhil      6.7     12.8       3.2          4.8       8.5   
163451      GoldCoast     21.7     22.7       0.0          4.8       8.5   
60503   SydneyAirport     11.6     15.6       3.2          4.8       0.0   
101669       Ballarat      9.0     30.0       0.0          4.8       8.5   

       WindGustDir  WindGustSpeed WindDir9am WindDir3pm  ...  Humidity3pm  \
132639         ENE           54.0         NE        SSE  ...         79.0   
163451         NaN           39.0         SE          E  ...         72.0   
60503          SSE           89.0        SSE        SSW  ...         96.0   
101669          NW           35.0          S         NW  ...         26.0   

        Pressure9am  Pressure3pm  Cloud9am  Cloud3pm  Temp9am  Temp3pm  \
132639        987.5        987.7       6.0       5.0      8.8     11.6   
163451       1013.8       1011.1       6.0       5.0     27

In [66]:
X_train.head()

Unnamed: 0,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,...,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday_0,RainToday_1,Year
132639,Nhil,6.7,12.8,3.2,4.8,8.5,ENE,54.0,NE,SSE,...,79.0,987.5,987.7,6.0,5.0,8.8,11.6,0,1,2022
163451,GoldCoast,21.7,22.7,0.0,4.8,8.5,,39.0,SE,E,...,72.0,1013.8,1011.1,6.0,5.0,27.6,28.3,1,0,2016
60503,SydneyAirport,11.6,15.6,3.2,4.8,0.0,SSE,89.0,SSE,SSW,...,96.0,1001.9,1004.9,8.0,8.0,13.7,13.1,0,1,2014
101669,Ballarat,9.0,30.0,0.0,4.8,8.5,NW,35.0,S,NW,...,26.0,1013.5,1010.3,7.0,8.0,14.7,28.4,1,0,2023
248628,Darwin,23.4,33.3,0.0,6.2,10.9,NW,39.0,ENE,N,...,40.0,1014.6,1009.5,3.0,2.0,28.4,31.6,1,0,2014


We can see that two additional variables RainToday_0 and RainToday_1 are created from RainToday variable.

Now, I will create the X_train training set.

In [67]:


# Assemble the training design matrix: numerical columns, the two binary
# RainToday columns, then one-hot dummies for each remaining categorical.
# NOTE(review): the three wind-direction columns share the same category
# labels, so their dummy columns end up with identical names; passing
# `prefix=` to get_dummies would disambiguate them, but the test-set cell
# would have to change in the same way.
one_hot_columns = ['Location', 'WindGustDir', 'WindDir9am', 'WindDir3pm']

train_parts = [X_train[numerical], X_train[['RainToday_0', 'RainToday_1']]]
train_parts.extend(pd.get_dummies(X_train[column]) for column in one_hot_columns)

X_train = pd.concat(train_parts, axis=1)

In [68]:
X_train.head()

Unnamed: 0,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustSpeed,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,...,NNW,NW,S,SE,SSE,SSW,SW,W,WNW,WSW
132639,6.7,12.8,3.2,4.8,8.5,54.0,13.0,24.0,96.0,79.0,...,False,False,False,False,True,False,False,False,False,False
163451,21.7,22.7,0.0,4.8,8.5,39.0,7.0,24.0,80.0,72.0,...,False,False,False,False,False,False,False,False,False,False
60503,11.6,15.6,3.2,4.8,0.0,89.0,55.0,52.0,99.0,96.0,...,False,False,False,False,False,True,False,False,False,False
101669,9.0,30.0,0.0,4.8,8.5,35.0,9.0,17.0,91.0,26.0,...,False,True,False,False,False,False,False,False,False,False
248628,23.4,33.3,0.0,6.2,10.9,39.0,13.0,28.0,60.0,40.0,...,False,False,False,False,False,False,False,False,False,False




Similarly, I will create the X_test testing set.


In [69]:
# Assemble the test design matrix with the same layout as the training one:
# numerical columns, the two binary RainToday columns, then one-hot dummies.
# NOTE(review): the dummies are derived from the test set on its own, so a
# category absent from the test set would produce fewer columns than in
# training - verify that both frames end up with identical columns.
one_hot_columns = ['Location', 'WindGustDir', 'WindDir9am', 'WindDir3pm']

test_parts = [X_test[numerical], X_test[['RainToday_0', 'RainToday_1']]]
test_parts.extend(pd.get_dummies(X_test[column]) for column in one_hot_columns)

X_test = pd.concat(test_parts, axis=1)



In [70]:
X_test.head()

Unnamed: 0,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustSpeed,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,...,NNW,NW,S,SE,SSE,SSW,SW,W,WNW,WSW
119911,13.8,19.9,0.0,5.6,3.3,46.0,17.0,26.0,62.0,53.0,...,False,False,False,False,True,False,False,False,False,False
10754,17.2,25.1,1.4,4.8,8.5,24.0,11.0,15.0,78.0,75.0,...,False,False,False,False,False,False,False,False,False,False
28704,8.0,21.7,0.0,4.8,8.5,39.0,19.0,17.0,49.0,52.0,...,False,False,False,False,False,False,False,False,False,False
9739,19.4,31.7,0.2,4.8,8.5,39.0,7.0,19.0,54.0,57.0,...,False,False,False,False,False,False,False,False,False,False
247855,16.0,30.2,0.0,5.6,10.7,28.0,15.0,15.0,61.0,32.0,...,True,False,False,False,False,False,False,False,False,False


# Feature Scaling

In [71]:
X_train.describe()

Unnamed: 0,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustSpeed,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,Year,RainToday_0,RainToday_1
count,207926.0,207926.0,207926.0,207926.0,207926.0,207926.0,207926.0,207926.0,207926.0,207926.0,207926.0,207926.0,207926.0,207926.0,207926.0,207926.0,207926.0,207926.0,207926.0
mean,12.111055,23.256256,0.654489,5.074938,8.160694,39.897247,14.004213,18.465646,69.103066,51.486514,1017.701934,1015.275418,5.313126,4.848499,16.950539,21.730975,2016.639949,0.785958,0.24452
std,6.370066,7.058832,1.171529,2.429626,2.396442,12.981801,8.725112,8.481789,19.152669,20.551627,6.678939,6.63599,2.232251,2.006459,6.468363,6.852871,4.828522,0.410157,0.429803
min,-8.7,-4.1,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,978.6,972.0,0.0,0.0,-6.0,-5.1,2007.0,0.0,0.0
25%,7.6,18.1,0.0,4.8,8.5,31.0,7.0,13.0,57.0,37.0,1013.6,1011.1,5.0,5.0,12.3,16.9,2012.0,1.0,0.0
50%,11.9,22.7,0.0,4.8,8.5,39.0,13.0,17.0,70.0,52.0,1017.6,1015.2,6.0,5.0,16.7,21.2,2017.0,1.0,0.0
75%,16.7,28.1,0.6,4.8,8.5,46.0,19.0,24.0,83.0,65.0,1021.8,1019.4,6.0,6.0,21.4,26.2,2021.0,1.0,0.0
max,33.9,48.9,3.2,21.8,14.4,154.0,55.0,57.0,100.0,100.0,1042.5,1040.3,8.0,9.0,39.4,48.2,2025.0,1.0,1.0


In [72]:
cols = X_train.columns
cols

Index(['MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation', 'Sunshine',
       'WindGustSpeed', 'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am',
       'Humidity3pm',
       ...
       'NNW', 'NW', 'S', 'SE', 'SSE', 'SSW', 'SW', 'W', 'WNW', 'WSW'],
      dtype='object', length=116)

In [74]:
# NOTE(review): the execution counter jumps from In [72] to In [74] and the
# output below shows arrays rather than DataFrames, so the cell that scaled
# X_train / X_test is not present in this notebook. Restore it so that
# "Restart & Run All" reproduces this output.
scaled_report = 'This is a scale data train {}\n this is a scale data test{}'.format(X_train, X_test)
print(scaled_report)

This is a scale data train [[0.36150235 0.31886792 1.         ... 0.         0.         0.        ]
 [0.71361502 0.50566038 0.         ... 0.         0.         0.        ]
 [0.47652582 0.37169811 1.         ... 0.         0.         0.        ]
 ...
 [0.6713615  0.53018868 0.         ... 0.         0.         0.        ]
 [0.33568075 0.50566038 0.         ... 0.         0.         0.        ]
 [0.51643192 0.4245283  1.         ... 0.         0.         0.        ]]
 this is a scale data test[[0.52816901 0.45283019 0.         ... 0.         0.         0.        ]
 [0.60798122 0.5509434  0.4375     ... 0.         0.         0.        ]
 [0.39201878 0.48679245 0.         ... 0.         0.         0.        ]
 ...
 [0.40375587 0.33962264 1.         ... 0.         0.         0.        ]
 [0.47887324 0.60943396 0.         ... 0.         0.         0.        ]
 [0.33333333 0.33396226 0.0625     ... 0.         0.         1.        ]]
