In [34]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [35]:
data = pd.read_csv('weatherAUS.csv')
data.head()

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,2008-12-01,Albury,13.4,22.9,0.6,,,W,44.0,W,...,71.0,22.0,1007.7,1007.1,8.0,,16.9,21.8,No,No
1,2008-12-02,Albury,7.4,25.1,0.0,,,WNW,44.0,NNW,...,44.0,25.0,1010.6,1007.8,,,17.2,24.3,No,No
2,2008-12-03,Albury,12.9,25.7,0.0,,,WSW,46.0,W,...,38.0,30.0,1007.6,1008.7,,2.0,21.0,23.2,No,No
3,2008-12-04,Albury,9.2,28.0,0.0,,,NE,24.0,SE,...,45.0,16.0,1017.6,1012.8,,,18.1,26.5,No,No
4,2008-12-05,Albury,17.5,32.3,1.0,,,W,41.0,ENE,...,82.0,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,No,No


In [36]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 145460 entries, 0 to 145459
Data columns (total 23 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   Date           145460 non-null  object 
 1   Location       145460 non-null  object 
 2   MinTemp        143975 non-null  float64
 3   MaxTemp        144199 non-null  float64
 4   Rainfall       142199 non-null  float64
 5   Evaporation    82670 non-null   float64
 6   Sunshine       75625 non-null   float64
 7   WindGustDir    135134 non-null  object 
 8   WindGustSpeed  135197 non-null  float64
 9   WindDir9am     134894 non-null  object 
 10  WindDir3pm     141232 non-null  object 
 11  WindSpeed9am   143693 non-null  float64
 12  WindSpeed3pm   142398 non-null  float64
 13  Humidity9am    142806 non-null  float64
 14  Humidity3pm    140953 non-null  float64
 15  Pressure9am    130395 non-null  float64
 16  Pressure3pm    130432 non-null  float64
 17  Cloud9am       89572 non-null

## RainTomorrow is the target

In [37]:
data.isnull().sum() /len(data)  * 100

Date              0.000000
Location          0.000000
MinTemp           1.020899
MaxTemp           0.866905
Rainfall          2.241853
Evaporation      43.166506
Sunshine         48.009762
WindGustDir       7.098859
WindGustSpeed     7.055548
WindDir9am        7.263853
WindDir3pm        2.906641
WindSpeed9am      1.214767
WindSpeed3pm      2.105046
Humidity9am       1.824557
Humidity3pm       3.098446
Pressure9am      10.356799
Pressure3pm      10.331363
Cloud9am         38.421559
Cloud3pm         40.807095
Temp9am           1.214767
Temp3pm           2.481094
RainToday         2.241853
RainTomorrow      2.245978
dtype: float64

In [38]:
data = data.drop(['Location','Date','Evaporation','Sunshine', 'Cloud9am','Cloud3pm',
                           'WindGustDir','WindGustSpeed','WindDir9am','WindDir3pm','WindSpeed9am',
                           'WindSpeed3pm'], axis=1)
data.isnull().sum() /len(data) * 100

MinTemp          1.020899
MaxTemp          0.866905
Rainfall         2.241853
Humidity9am      1.824557
Humidity3pm      3.098446
Pressure9am     10.356799
Pressure3pm     10.331363
Temp9am          1.214767
Temp3pm          2.481094
RainToday        2.241853
RainTomorrow     2.245978
dtype: float64

In [39]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 145460 entries, 0 to 145459
Data columns (total 11 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   MinTemp       143975 non-null  float64
 1   MaxTemp       144199 non-null  float64
 2   Rainfall      142199 non-null  float64
 3   Humidity9am   142806 non-null  float64
 4   Humidity3pm   140953 non-null  float64
 5   Pressure9am   130395 non-null  float64
 6   Pressure3pm   130432 non-null  float64
 7   Temp9am       143693 non-null  float64
 8   Temp3pm       141851 non-null  float64
 9   RainToday     142199 non-null  object 
 10  RainTomorrow  142193 non-null  object 
dtypes: float64(9), object(2)
memory usage: 12.2+ MB


## Seperate X and y

In [40]:
y = data['RainTomorrow']
X = data.drop(['RainTomorrow'], axis=1)

In [41]:
## Missing Value Treatment

# Switching 'Yes' and 'No' with a boolen value and handling NaN values, in this case replacing it with a zero
X = X.replace({'No':0, 'Yes':1})
X = X.fillna(0)
y = y.replace({'No':0, 'Yes':1})
y = y.fillna(0)

## Feature Scaling

- As we see the data present in the cols is at different scale we try to bring the complete data to a single scale i.e. from 0 to 1 using MinMaxScaler

In [42]:
# Initializing the MinMaxScaler function
from sklearn.preprocessing import MinMaxScaler
mms = MinMaxScaler()

In [43]:
X_scaled = pd.DataFrame(mms.fit_transform(X), columns = X.columns)
X_scaled.head()

Unnamed: 0,MinTemp,MaxTemp,Rainfall,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Temp9am,Temp3pm,RainToday
0,0.516509,0.523629,0.001617,0.71,0.22,0.968012,0.968738,0.508439,0.522073,0.0
1,0.375,0.565217,0.0,0.44,0.25,0.970797,0.969411,0.514768,0.570058,0.0
2,0.504717,0.57656,0.0,0.38,0.3,0.967915,0.970277,0.594937,0.548944,0.0
3,0.417453,0.620038,0.0,0.45,0.16,0.977522,0.974221,0.533755,0.612284,0.0
4,0.613208,0.701323,0.002695,0.82,0.33,0.970989,0.96768,0.527426,0.673704,0.0


In [44]:
# Splitting  up data, seting 75% for train and 25% for test.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.25, random_state=43)

## Feature Extraction / Feature Selection Techniques:

1. Univariate Feature Selection using SelectKbest Method

In [45]:
X_train

Unnamed: 0,MinTemp,MaxTemp,Rainfall,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Temp9am,Temp3pm,RainToday
94159,0.485849,0.586011,0.000000,0.63,0.50,0.979443,0.977107,0.592827,0.581574,0.0
2402,0.301887,0.493384,0.000000,0.71,0.50,0.985014,0.981916,0.436709,0.500960,0.0
20350,0.540094,0.468809,0.007547,0.99,0.87,0.980596,0.976914,0.485232,0.468330,1.0
122261,0.653302,0.748582,0.000000,0.58,0.28,0.975312,0.972489,0.643460,0.750480,0.0
75351,0.363208,0.512287,0.000539,0.63,0.53,0.978770,0.978165,0.508439,0.516315,0.0
...,...,...,...,...,...,...,...,...,...,...
57651,0.455189,0.646503,0.000000,0.72,0.21,0.982805,0.982493,0.466245,0.646833,0.0
129082,0.367925,0.449905,0.000000,0.69,0.51,0.977714,0.974413,0.415612,0.449136,0.0
125205,0.424528,0.699433,0.000000,0.27,0.13,0.000000,0.000000,0.666667,0.692898,0.0
40753,0.469340,0.455577,0.000000,0.94,0.55,0.993564,0.992786,0.445148,0.464491,0.0


In [46]:
# Initialize SelectKBest function
from sklearn.feature_selection import SelectKBest 
from sklearn.feature_selection import chi2
UnivariateFeatureSelection = SelectKBest(chi2, k=5).fit(X_train, y_train)

In [47]:
# Creating a dict to visualize which features were selected with the highest score
diccionario = {key:value for (key, value) in zip(UnivariateFeatureSelection.scores_, X_train.columns)}
sorted(diccionario.items())

[(0.3872193415815613, 'Pressure3pm'),
 (0.41540272061356587, 'Pressure9am'),
 (1.9060887930402912, 'Temp9am'),
 (37.94990744481331, 'MinTemp'),
 (78.29598368880437, 'MaxTemp'),
 (128.20122465797363, 'Temp3pm'),
 (372.3268750685895, 'Humidity9am'),
 (484.2182821838668, 'Rainfall'),
 (1722.8468322330248, 'Humidity3pm'),
 (7864.236399930817, 'RainToday')]

As we can see, the last five elements have the highest score. So the best features are:

1. RainToday
2. Temp3pm
3. Humidity3pm
4. Rainfall
5. Humidity9am   
Now that we have the best features, let's extract them from the original data set and let's measure the performance with the random forest algorithm.

In [23]:
X_train = X_train[['RainToday', 'Temp3pm', 'Humidity3pm', 'Rainfall', 'Humidity9am']]
X_test = X_test[['RainToday', 'Temp3pm', 'Humidity3pm', 'Rainfall', 'Humidity9am']]

In [24]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier()
rfc

In [25]:
rfc.fit(X_train, y_train)

In [26]:
y_pred = rfc.predict(X_test)

In [27]:
from sklearn.metrics import accuracy_score
accuracy_score(y_pred, y_test)

0.8131720060497731

In [28]:
from sklearn.model_selection import StratifiedKFold,cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

LRM = LogisticRegression()
DTC = DecisionTreeClassifier()
RFC = RandomForestClassifier()
KNC = KNeighborsClassifier()
NBC = GaussianNB()
SVC = SVC()

In [29]:
SKF = StratifiedKFold(n_splits = 10, shuffle =True, random_state=10)

In [32]:
print(f'LogisticRegression : {round(cross_val_score(LRM,X,y,cv=SKF,scoring="accuracy").mean()*100,2)}%')
print(f'DecisionTreeClassifier : {round(cross_val_score(DTC,X,y,cv=SKF,scoring="accuracy").mean()*100,2)}%')
print(f'RandomForestClassifier : {round(cross_val_score(RFC,X,y,cv=SKF,scoring="accuracy").mean()*100,2)}%')
print(f'KNeighborsClassifier : {round(cross_val_score(KNC,X,y,cv=SKF,scoring="accuracy").mean()*100,2)}%')
print(f'GaussianNB : {round(cross_val_score(NBC,X,y,cv=SKF,scoring="accuracy").mean()*100,2)}%')
print(f'Support Vector Machine : {round(cross_val_score(SVC,X,y,cv=SKF,scoring="accuracy").mean()*100,2)}%')

LogisticRegression : 82.73%
DecisionTreeClassifier : 75.25%


KeyboardInterrupt: 

In [30]:
X.head()

Unnamed: 0,MinTemp,MaxTemp,Rainfall,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Temp9am,Temp3pm,RainToday
0,13.4,22.9,0.6,71.0,22.0,1007.7,1007.1,16.9,21.8,0.0
1,7.4,25.1,0.0,44.0,25.0,1010.6,1007.8,17.2,24.3,0.0
2,12.9,25.7,0.0,38.0,30.0,1007.6,1008.7,21.0,23.2,0.0
3,9.2,28.0,0.0,45.0,16.0,1017.6,1012.8,18.1,26.5,0.0
4,17.5,32.3,1.0,82.0,33.0,1010.8,1006.0,17.8,29.7,0.0


In [31]:
X = X[['RainToday', 'Temp3pm', 'Humidity3pm', 'Rainfall', 'Humidity9am']]
X.head()

Unnamed: 0,RainToday,Temp3pm,Humidity3pm,Rainfall,Humidity9am
0,0.0,21.8,22.0,0.6,71.0
1,0.0,24.3,25.0,0.0,44.0
2,0.0,23.2,30.0,0.0,38.0
3,0.0,26.5,16.0,0.0,45.0
4,0.0,29.7,33.0,1.0,82.0


In [48]:
# Using the 'UnivariateFeatureSelection' based on 'SelectKBest' function,
# let's extract the best features from the original dataset

X_train_k_best = UnivariateFeatureSelection.transform(X_train)
X_test_k_best = UnivariateFeatureSelection.transform(X_test)

In [49]:
X_train_k_best

array([[0.        , 0.63      , 0.5       , 0.5815739 , 0.        ],
       [0.        , 0.71      , 0.5       , 0.50095969, 0.        ],
       [0.00754717, 0.99      , 0.87      , 0.46833013, 1.        ],
       ...,
       [0.        , 0.27      , 0.13      , 0.69289827, 0.        ],
       [0.        , 0.94      , 0.55      , 0.46449136, 0.        ],
       [0.02210243, 0.68      , 0.24      , 0.82341651, 1.        ]])

In [51]:
print("Shape of original data: ", X_train.shape)
print("Shape of data with best features: ", X_train_k_best.shape)

Shape of original data:  (109095, 10)
Shape of data with best features:  (109095, 5)


In [53]:
# Initializing and fitting data to the random forest classifier
RandForest_K_best = RandomForestClassifier()      
RandForest_K_best = RandForest_K_best.fit(X_train_k_best, y_train)

In [54]:
# Making a prediction and calculting the accuracy
y_pred = RandForest_K_best.predict(X_test_k_best)
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy: ',accuracy)

Accuracy:  0.8132545029561391


## Recursive Feature Elimination:

- The idea of this method is to make use of an estimator (in this case we are using random forest), and test with different sizes of features until find the best set of features.

In [56]:
from sklearn.feature_selection import RFE

In [58]:
# Initializing Random Forest Classifier
RandForest_RFE = RandomForestClassifier() 
# Initializing the RFE object, one of the most important arguments is the estimator, in this case is RandomForest
rfe = RFE(estimator=RandForest_RFE, n_features_to_select=5, step=1)
# Fit the origial dataset
rfe = rfe.fit(X_train, y_train)

KeyboardInterrupt: 

In [None]:
print("Best features chosen by RFE: \n")
for i in x_train.columns[rfe.support_]:
    print(i)