In [3]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif, mutual_info_classif
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter

In [4]:
# Render every column of wide frames instead of eliding the middle ones.
pd.options.display.max_columns = None

# Load the dataset that the rest of the notebook works on.
df = pd.read_csv('final_df.csv')

Determining the number of days when it rained and when it didn't

In [5]:
# Absolute number of rows per class of the target column.
df['Rain Tomorrow'].value_counts()

0    106894
1     30168
Name: Rain Tomorrow, dtype: int64

Percentage of the days when it rained and when it didn't

In [6]:
# Per-class row count expressed as a percentage of all rows in the frame.
df['Rain Tomorrow'].value_counts().div(len(df)).mul(100)

0    77.989523
1    22.010477
Name: Rain Tomorrow, dtype: float64

It can be seen that there are many more instances where it **did not rain**

Determining the number of instances that should be dropped in order to get the same number of days where it rained and days where it did not rain. This way the models won't be biased towards concluding that most of the time it doesn't rain. Also, the objective is to have a model that is effective at predicting when it will rain.

In [7]:
# Separate the label column from the predictors.
y = df['Rain Tomorrow']
X = df.drop(columns='Rain Tomorrow')

# Randomly undersample with a fixed seed so the resampled frame is reproducible.
# NOTE(review): the whole dataset is balanced here; any train/test split made
# afterwards will also be balanced -- confirm this is intended.
rus = RandomUnderSampler(random_state=42)
X_resampled, y_resampled = rus.fit_resample(X, y)

# Stitch predictors and label back together into a single frame.
features_resampled = pd.DataFrame(X_resampled, columns=X.columns)
target_resampled = pd.DataFrame(y_resampled, columns=['Rain Tomorrow'])
df_resampled = pd.concat([features_resampled, target_resampled], axis=1)

# Class counts before resampling ...
print(f'Original dataset shape: {Counter(y)}')
# ... and after.
print("\nResampled class distribution:")
print(df_resampled['Rain Tomorrow'].value_counts())

Original dataset shape: Counter({0: 106894, 1: 30168})

Resampled class distribution:
0    30168
1    30168
Name: Rain Tomorrow, dtype: int64


In [8]:
# Peek at the first rows of the balanced frame; the row labels shown are the
# ones carried over from the frame before resampling.
df_resampled.head(10)

Unnamed: 0,Year,Month,Day,Center,North,South,MinTemp,MaxTemp,Rainfall,Sunshine,WindGustSpeed,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,Rain Today,Rain Tomorrow
96734,2009,12,11,0,0,1,5.9,20.0,5.4,8.388787,39.0,13.0,22.0,52.0,31.0,3.0,2.0,13.8,18.9,1,0
22706,2011,5,28,0,0,1,4.0,20.0,0.2,5.923032,17.0,2.0,11.0,98.0,58.0,6.0,5.0,9.1,17.8,0,0
129391,2013,3,26,1,0,0,20.3,38.4,3.6,6.5,39.0,7.0,20.0,50.0,18.0,7.0,6.0,24.5,37.4,1,0
117691,2013,11,5,0,0,1,19.4,21.6,0.0,8.275429,43.0,20.0,30.0,69.0,63.0,5.0,5.0,21.2,19.9,0,0
109843,2017,4,17,0,0,1,11.4,22.6,0.0,2.5,28.0,19.0,19.0,73.0,57.0,4.0,8.0,18.1,21.7,0,0
49547,2014,1,9,0,0,1,5.1,14.4,0.0,5.257155,41.0,13.0,7.0,98.0,77.0,7.0,7.0,6.4,12.9,0,0
99756,2010,2,8,0,0,1,25.7,39.7,0.0,13.0,50.0,15.0,19.0,26.0,18.0,1.0,4.0,31.2,37.9,0,0
52174,2012,10,29,0,0,1,4.5,23.8,0.2,8.696365,35.0,15.0,17.0,63.0,37.0,4.0,4.0,16.9,22.9,0,0
39175,2017,5,1,0,0,1,8.5,23.8,0.0,6.212512,19.0,11.0,9.0,100.0,54.0,6.0,5.0,13.8,23.4,0,0
55387,2013,7,5,0,0,1,4.7,11.9,5.0,3.98555,54.0,22.0,30.0,80.0,57.0,7.0,8.0,6.3,10.8,1,0


In [9]:
# Replace the row labels carried over from the original frame with a fresh
# 0..n-1 index. drop=True discards the old labels directly instead of first
# materialising them as an 'index' column and dropping it afterwards (which
# would also remove a genuine column named 'index' if one ever existed).
df = df_resampled.reset_index(drop=True)
df

Unnamed: 0,Year,Month,Day,Center,North,South,MinTemp,MaxTemp,Rainfall,Sunshine,WindGustSpeed,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,Rain Today,Rain Tomorrow
0,2009,12,11,0,0,1,5.9,20.0,5.4,8.388787,39.0,13.0,22.0,52.0,31.0,3.0,2.0,13.8,18.9,1,0
1,2011,5,28,0,0,1,4.0,20.0,0.2,5.923032,17.0,2.0,11.0,98.0,58.0,6.0,5.0,9.1,17.8,0,0
2,2013,3,26,1,0,0,20.3,38.4,3.6,6.500000,39.0,7.0,20.0,50.0,18.0,7.0,6.0,24.5,37.4,1,0
3,2013,11,5,0,0,1,19.4,21.6,0.0,8.275429,43.0,20.0,30.0,69.0,63.0,5.0,5.0,21.2,19.9,0,0
4,2017,4,17,0,0,1,11.4,22.6,0.0,2.500000,28.0,19.0,19.0,73.0,57.0,4.0,8.0,18.1,21.7,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60331,2017,2,10,1,0,0,24.8,39.8,0.4,9.703550,65.0,15.0,20.0,38.0,24.0,2.0,3.0,31.5,36.6,0,1
60332,2017,4,17,1,0,0,19.3,24.4,0.0,3.985550,35.0,7.0,19.0,28.0,91.0,8.0,8.0,21.3,18.5,0,1
60333,2017,4,18,1,0,0,15.2,21.5,6.8,3.985550,30.0,19.0,9.0,65.0,65.0,3.0,8.0,19.0,21.2,1,1
60334,2017,4,19,1,0,0,17.7,26.9,12.6,3.985550,35.0,15.0,20.0,93.0,59.0,7.0,8.0,19.0,26.0,1,1


The f-classif is based on the Analysis of Variance (ANOVA) F-test, which assesses whether there are **significant differences between the means of two or more groups**.

It is done by calculating the F-statistic for each feature, which is the **ratio of the variance between the groups (classes) to the variance within the groups**. A higher F-statistic indicates that the feature is more discriminative.


In [10]:
# Pick predictors and target by column name rather than by position: `df` is
# rebound to a narrower frame later in the notebook, so positional slices
# would silently pull the target into X if this cell were re-run afterwards.
X = df.drop(columns='Rain Tomorrow')
y = df['Rain Tomorrow']

# k only controls which columns transform() would keep; the ranking below is
# built from the score of every feature.
bestfeatures = SelectKBest(score_func=f_classif, k=10)
fit = bestfeatures.fit(X, y)

# One row per feature with its ANOVA F-statistic, highest score first.
# The row label is the feature's position in X.
featureScores_f = (
    pd.DataFrame({'Features': X.columns, 'Score': fit.scores_})
    .sort_values(by='Score', ascending=False)
)
featureScores_f

Unnamed: 0,Features,Score
14,Humidity3pm,21301.558666
16,Cloud3pm,16870.845966
9,Sunshine,14148.061651
15,Cloud9am,11142.726356
19,Rain Today,7674.785359
13,Humidity9am,6569.272941
10,WindGustSpeed,3964.880238
8,Rainfall,3218.469527
18,Temp3pm,3177.251309
7,MaxTemp,2096.631945


Mutual information measures the **dependency between features and the target variable**

It quantifies the **amount of information obtained about one variable through another variable**. Higher mutual information indicates a higher dependency.


In [11]:
# mutual_info_classif is a randomised estimator; pin its seed (same value as
# the undersampler) so the scores and ranking are identical on every run.
# NOTE(review): with the default discrete_features='auto', a dense X is treated
# as all-continuous; 0/1 columns such as 'Rain Today' may deserve an explicit
# discrete mask -- confirm.
bestfeatures = SelectKBest(
    score_func=lambda features, target: mutual_info_classif(
        features, target, random_state=42
    ),
    k=10,
)
fit = bestfeatures.fit(X, y)

# One row per feature with its mutual-information score, highest first.
# The row label is the feature's position in X.
featureScores_mi = (
    pd.DataFrame({'Features': X.columns, 'Score': fit.scores_})
    .sort_values(by='Score', ascending=False)
)
featureScores_mi

Unnamed: 0,Features,Score
14,Humidity3pm,0.155498
9,Sunshine,0.126033
16,Cloud3pm,0.122129
15,Cloud9am,0.08763
8,Rainfall,0.074215
19,Rain Today,0.060361
13,Humidity9am,0.055572
10,WindGustSpeed,0.037855
18,Temp3pm,0.026362
7,MaxTemp,0.021967


Both feature selection methods rank the same 10 variables highest: 'MaxTemp', 'Rainfall', 'Sunshine', 'WindGustSpeed', 'Humidity9am', 'Humidity3pm', 'Cloud9am', 'Cloud3pm', 'Temp3pm', 'Rain Today'. 'MinTemp' is kept as an 11th variable: the f-classif method deems it unimportant, but mutual_info gives it a considerable score, so it will be used.

In [12]:
# Features retained after the two rankings above, followed by the target.
selected_columns = [
    'MinTemp', 'MaxTemp', 'Rainfall', 'Sunshine', 'WindGustSpeed',
    'Humidity9am', 'Humidity3pm', 'Cloud9am', 'Cloud3pm', 'Temp3pm',
    'Rain Today',
    'Rain Tomorrow',
]
df = df[selected_columns]
df

Unnamed: 0,MinTemp,MaxTemp,Rainfall,Sunshine,WindGustSpeed,Humidity9am,Humidity3pm,Cloud9am,Cloud3pm,Temp3pm,Rain Today,Rain Tomorrow
0,5.9,20.0,5.4,8.388787,39.0,52.0,31.0,3.0,2.0,18.9,1,0
1,4.0,20.0,0.2,5.923032,17.0,98.0,58.0,6.0,5.0,17.8,0,0
2,20.3,38.4,3.6,6.500000,39.0,50.0,18.0,7.0,6.0,37.4,1,0
3,19.4,21.6,0.0,8.275429,43.0,69.0,63.0,5.0,5.0,19.9,0,0
4,11.4,22.6,0.0,2.500000,28.0,73.0,57.0,4.0,8.0,21.7,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
60331,24.8,39.8,0.4,9.703550,65.0,38.0,24.0,2.0,3.0,36.6,0,1
60332,19.3,24.4,0.0,3.985550,35.0,28.0,91.0,8.0,8.0,18.5,0,1
60333,15.2,21.5,6.8,3.985550,30.0,65.0,65.0,3.0,8.0,21.2,1,1
60334,17.7,26.9,12.6,3.985550,35.0,93.0,59.0,7.0,8.0,26.0,1,1


In [13]:
# Sanity check of the reduced frame: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60336 entries, 0 to 60335
Data columns (total 12 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   MinTemp        60336 non-null  float64
 1   MaxTemp        60336 non-null  float64
 2   Rainfall       60336 non-null  float64
 3   Sunshine       60336 non-null  float64
 4   WindGustSpeed  60336 non-null  float64
 5   Humidity9am    60336 non-null  float64
 6   Humidity3pm    60336 non-null  float64
 7   Cloud9am       60336 non-null  float64
 8   Cloud3pm       60336 non-null  float64
 9   Temp3pm        60336 non-null  float64
 10  Rain Today     60336 non-null  int64  
 11  Rain Tomorrow  60336 non-null  int64  
dtypes: float64(10), int64(2)
memory usage: 5.5 MB


In [14]:
# Write the reduced frame to disk; index=False keeps the row index out of the CSV.
df.to_csv('df_feature_engineering.csv',index=False)