In [1]:
from collections import Counter
from pathlib import Path
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import numpy as np
import os
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

# Set folder paths for outputs

In [2]:
import sys
sys.path.append("../../..")

In [3]:
from utils.auxSamplingStudy import *

In [4]:
from analysis.data.rain.config import DATA, MODELS, REPORTS, idbin, idcat, idnum

# Read the file

In [5]:
df= pd.read_csv(DATA / 'rain.csv')

In [6]:
df.columns

Index(['Date', 'Location', 'MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation',
       'Sunshine', 'WindGustDir', 'WindGustSpeed', 'WindDir9am', 'WindDir3pm',
       'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am', 'Humidity3pm',
       'Pressure9am', 'Pressure3pm', 'Cloud9am', 'Cloud3pm', 'Temp9am',
       'Temp3pm', 'RainToday', 'RISK_MM', 'RainTomorrow'],
      dtype='object')

#### As we can see Sunshine, Evaporation, Cloud3pm and Cloud9am have around 50% data, we can ignore these four columns We need to remove RISK_MM because we want to predict 'RainTomorrow' and RISK_MM can leak some info to our model We can remove location and date column too, because we are going to findwhich weather conditions affect chancess of rain

In [7]:
df.drop(columns=['Sunshine','Evaporation','Cloud3pm','Cloud9am','RISK_MM','Date','Location'],axis=1, inplace = True)

#### In order to feed the data to any machine learning method, it's convenient to change strings to numeric values. So, we are going to change 'No' to 0 and 'Yes' to 1

In [8]:
df['RainToday'].replace({'No': 0, 'Yes': 1},inplace = True)
df['RainTomorrow'].replace({'No': 0, 'Yes': 1},inplace = True)

In [9]:
df.drop_duplicates(inplace=True)

In [10]:
df.isnull().sum()

MinTemp            586
MaxTemp            277
Rainfall          1402
WindGustDir       9279
WindGustSpeed     9219
WindDir9am        9962
WindDir3pm        3727
WindSpeed9am      1297
WindSpeed3pm      2579
Humidity9am       1723
Humidity3pm       3559
Pressure9am      13963
Pressure3pm      13930
Temp9am            853
Temp3pm           2675
RainToday         1402
RainTomorrow         0
dtype: int64

In [11]:
df.shape

(142142, 17)

In [12]:
df.dropna(inplace=True)

In [13]:
df.shape

(112925, 17)

In [14]:
Counter(df.RainTomorrow)

Counter({0: 87906, 1: 25019})

In [15]:
df.columns

Index(['MinTemp', 'MaxTemp', 'Rainfall', 'WindGustDir', 'WindGustSpeed',
       'WindDir9am', 'WindDir3pm', 'WindSpeed9am', 'WindSpeed3pm',
       'Humidity9am', 'Humidity3pm', 'Pressure9am', 'Pressure3pm', 'Temp9am',
       'Temp3pm', 'RainToday', 'RainTomorrow'],
      dtype='object')

# Create helper functions

In [16]:
# df_feature= df.drop('RainTomorrow',1)
df_target= df[['RainTomorrow']]
df_target = df_target.rename(columns={'RainTomorrow': 'target'})

In [17]:
encoded_df = MultiColumnLabelEncoder(columns = ['WindGustDir', 'WindDir9am', 'WindDir3pm']).fit_transform(df)


In [18]:
anovap_value = list()
for cols in ['WindGustDir', 'WindDir9am', 'WindDir3pm']:
    statistic, p = stats.f_oneway(encoded_df[cols], encoded_df['RainTomorrow'])
    anovap_value.append(tuple([cols, p]))
    
print(anovap_value)

[('WindGustDir', 0.0), ('WindDir9am', 0.0), ('WindDir3pm', 0.0)]


In [19]:
from scipy.stats import chi2_contingency
chi2p_value = list()
for cols in ['WindGustDir', 'WindDir9am', 'WindDir3pm']:
    obs = pd.crosstab(encoded_df[cols], encoded_df['RainTomorrow'])
    g, p, dof, expctd = chi2_contingency(obs)
    chi2p_value.append(tuple([cols, p]))
    
print(chi2p_value)

[('WindGustDir', 1.485853343880933e-292), ('WindDir9am', 0.0), ('WindDir3pm', 1.3203990219430103e-251)]


In [20]:
encoded_df= encoded_df.drop('RainTomorrow',1)

In [21]:
1 - 2000 / encoded_df.shape[0]

0.9822891299535089

In [22]:
np.sum(df_target)/df_target.shape[0]

target    0.221554
dtype: float64

In [23]:
#Sampling: Added by me
encoded_df, aux1, df_target, aux2 = train_test_split(encoded_df, df_target, test_size=0.98, random_state=12)

In [24]:
df_target.shape

(2258, 1)

In [25]:
np.sum(df_target)

target    477
dtype: int64

In [26]:
np.sum(df_target)/df_target.shape[0]

target    0.211249
dtype: float64

In [27]:
1/5

0.2

In [28]:
# Split the raw data into train and test set. Split ratio = 75:25

X_train, X_test, y_train, y_test = train_test_split(encoded_df, df_target, test_size=0.25, random_state=12)

In [29]:
col_list = list(X_train.columns)
X_train.index = pd.RangeIndex(len(X_train.index))
y_train.index = pd.RangeIndex(len(y_train.index))
X_test.index = pd.RangeIndex(len(X_test.index))
y_test.index = pd.RangeIndex(len(y_test.index))

In [30]:
X_train.shape

(1693, 16)

In [31]:
## Apply standard scaler on the features , so that euclidean distance calculation in SMOTE is not biased

name_columns = X_train.columns[idnum]

sc = StandardScaler()
X_train[name_columns] = sc.fit_transform(X_train[name_columns])
X_test[name_columns] = sc.transform(X_test[name_columns])

In [32]:
X_train = pd.DataFrame(X_train, index=range(X_train.shape[0]),
                          columns=col_list)
X_test = pd.DataFrame(X_test, index=range(X_test.shape[0]),
                          columns=col_list)

In [33]:
X_train.to_csv(DATA / "rain_X_train.csv", index = False)
y_train.to_csv(DATA / "rain_y_train.csv", index = False)
X_test.to_csv(DATA / "rain_X_test.csv", index = False)
y_test.to_csv(DATA / "rain_y_test.csv", index = False)