# Import data

In [47]:
import pandas as pd 

In [48]:
data = pd.read_csv('weatherAUS.csv')

In [49]:
print(data)

              Date Location  MinTemp  MaxTemp  Rainfall  Evaporation  \
0       2008-12-01   Albury     13.4     22.9       0.6          NaN   
1       2008-12-02   Albury      7.4     25.1       0.0          NaN   
2       2008-12-03   Albury     12.9     25.7       0.0          NaN   
3       2008-12-04   Albury      9.2     28.0       0.0          NaN   
4       2008-12-05   Albury     17.5     32.3       1.0          NaN   
...            ...      ...      ...      ...       ...          ...   
142188  2017-06-20    Uluru      3.5     21.8       0.0          NaN   
142189  2017-06-21    Uluru      2.8     23.4       0.0          NaN   
142190  2017-06-22    Uluru      3.6     25.3       0.0          NaN   
142191  2017-06-23    Uluru      5.4     26.9       0.0          NaN   
142192  2017-06-24    Uluru      7.8     27.0       0.0          NaN   

        Sunshine WindGustDir  WindGustSpeed WindDir9am  ... Humidity3pm  \
0            NaN           W           44.0          W  ... 

In [50]:
# Feature matrix: every column of `data` except the last one (the target).
# Take an explicit copy so the positional assignments and column drops applied
# to X in later cells modify X only, instead of writing into a slice of `data`
# (which triggers pandas' SettingWithCopyWarning and has ambiguous semantics).
X = data.iloc[:, :-1].copy()

In [51]:
print(X)

              Date Location  MinTemp  MaxTemp  Rainfall  Evaporation  \
0       2008-12-01   Albury     13.4     22.9       0.6          NaN   
1       2008-12-02   Albury      7.4     25.1       0.0          NaN   
2       2008-12-03   Albury     12.9     25.7       0.0          NaN   
3       2008-12-04   Albury      9.2     28.0       0.0          NaN   
4       2008-12-05   Albury     17.5     32.3       1.0          NaN   
...            ...      ...      ...      ...       ...          ...   
142188  2017-06-20    Uluru      3.5     21.8       0.0          NaN   
142189  2017-06-21    Uluru      2.8     23.4       0.0          NaN   
142190  2017-06-22    Uluru      3.6     25.3       0.0          NaN   
142191  2017-06-23    Uluru      5.4     26.9       0.0          NaN   
142192  2017-06-24    Uluru      7.8     27.0       0.0          NaN   

        Sunshine WindGustDir  WindGustSpeed WindDir9am  ... Humidity9am  \
0            NaN           W           44.0          W  ... 

In [52]:
# Target vector: the final column of `data`, selected by its label.
Y = data[data.columns[-1]]

In [53]:
print(Y)

0         No
1         No
2         No
3         No
4         No
          ..
142188    No
142189    No
142190    No
142191    No
142192    No
Name: RainTomorrow, Length: 142193, dtype: object


# Handling missing data

In [54]:
import numpy as np
from sklearn.impute import SimpleImputer
imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')

In [55]:
# Positional indices of the columns of X that receive mean imputation:
# 2-6 and 8 (MinTemp, MaxTemp, Rainfall, Evaporation, Sunshine, WindGustSpeed
# in the printed frame) followed by the contiguous run 11 through 20.
numerical_cols = [2, 3, 4, 5, 6, 8, *range(11, 21)]

In [56]:
#print(X)

In [57]:
imp_mean.fit(X.iloc[:,numerical_cols])

SimpleImputer()

In [58]:
X.iloc[:,numerical_cols] = imp_mean.transform(X.iloc[:,numerical_cols])

### Handling missing string data

In [59]:
# Positional indices of the columns of X that are imputed with the
# most-frequent value in the cells below (1, 7, 9, 10 are Location,
# WindGustDir, WindDir9am, WindDir3pm in the printed frame; 21 is presumably
# another text column — TODO confirm against data.columns).
string_cols = [1,7,9,10,21]

In [60]:
# Rebind `imp_mean` to a new imputer that replaces NaN with each column's most
# frequent value. Despite the name, from this cell on the variable no longer
# refers to the mean imputer created earlier.
imp_mean = SimpleImputer(missing_values=np.nan, strategy='most_frequent')

In [61]:
imp_mean.fit(X.iloc[:,string_cols])

SimpleImputer(strategy='most_frequent')

In [62]:
X.iloc[:,string_cols] = imp_mean.transform(X.iloc[:,string_cols])

# Feature selection

In [63]:
# Remove RISK_MM from the features (presumably because it is derived from the
# next day's rainfall and would leak the target — confirm against the dataset
# description). Reassign the result instead of using inplace=True, which
# mutates a frame that was sliced from `data` and can raise
# SettingWithCopyWarning.
X = X.drop(columns=['RISK_MM'])

In [64]:
# Drop the Date column so it is not fed to the encoders/model as a feature.
# Reassign the result instead of using inplace=True, which mutates a frame
# that was sliced from `data` and can raise SettingWithCopyWarning.
X = X.drop(columns=['Date'])

In [65]:
print(X)

       Location  MinTemp  MaxTemp  Rainfall  Evaporation  Sunshine  \
0        Albury     13.4     22.9       0.6     5.469824  7.624853   
1        Albury      7.4     25.1       0.0     5.469824  7.624853   
2        Albury     12.9     25.7       0.0     5.469824  7.624853   
3        Albury      9.2     28.0       0.0     5.469824  7.624853   
4        Albury     17.5     32.3       1.0     5.469824  7.624853   
...         ...      ...      ...       ...          ...       ...   
142188    Uluru      3.5     21.8       0.0     5.469824  7.624853   
142189    Uluru      2.8     23.4       0.0     5.469824  7.624853   
142190    Uluru      3.6     25.3       0.0     5.469824  7.624853   
142191    Uluru      5.4     26.9       0.0     5.469824  7.624853   
142192    Uluru      7.8     27.0       0.0     5.469824  7.624853   

       WindGustDir  WindGustSpeed WindDir9am WindDir3pm  ...  WindSpeed3pm  \
0                W           44.0          W        WNW  ...          24.0   
1  

# Label encoding

In [66]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()

In [67]:
le.fit(Y)

LabelEncoder()

In [68]:
# Replace the string labels in Y with the integer codes learned by `le` in the
# previous cell (the later Counter output shows the two classes as 0 and 1).
# Not idempotent: Y is overwritten, so re-running this cell would pass the
# already-encoded integers back into the encoder. Run it once per session.
Y = le.transform(Y)

# One hot encoding

In [69]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

In [70]:
# One-hot encode the columns at positions 0, 6, 8, 9 and 20 of X and pass every
# other column through unchanged.
columnTransformer = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [0, 6, 8, 9, 20])],
    remainder='passthrough',
)

In [71]:
X = columnTransformer.fit_transform(X)

In [72]:
print(X.shape)

(142193, 115)


# Handling imbalanced data

## Undersampling

In [73]:
#from imblearn.under_sampling import RandomUnderSampler 

In [74]:
#rus = RandomUnderSampler(random_state=42)
#X, Y = rus.fit_sample(X, Y)

In [75]:
#print(Y)

## Oversampling

In [76]:
from sklearn.datasets import make_classification
from imblearn.over_sampling import RandomOverSampler

In [77]:
from collections import Counter

In [78]:
Counter(Y)

Counter({0: 110316, 1: 31877})

In [79]:
# Balance the classes by randomly duplicating minority-class rows until both
# classes have the same number of samples. `fit_resample` is the supported
# imbalanced-learn method; the older `fit_sample` alias was deprecated and has
# been removed, so calling it fails on current releases.
# NOTE(review): this resamples the whole dataset before the train/test split in
# a later cell, so copies of the same row can end up in both train and test.
# Consider oversampling the training portion only.
rus = RandomOverSampler(random_state=42)
X, Y = rus.fit_resample(X, Y)

In [80]:
Counter(Y)

Counter({0: 110316, 1: 110316})

# Train test split

In [81]:
import numpy as np
from sklearn.model_selection import train_test_split

In [82]:
# Hold out 20% of the rows as the test set; the fixed random_state makes the
# shuffle, and therefore the split, reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.20,
    random_state=1,
)

In [83]:
print(X_train.shape)

(176505, 115)


# Feature scaling

### Standardization: (X - mean(X)) / std(X)

### Normalization: (X - min(X)) / (max(X) - min(X))

In [84]:
from sklearn import preprocessing

In [87]:
# Scale each feature to unit variance without centring. The matrix produced by
# the one-hot ColumnTransformer is expected to be sparse (confirm with
# scipy.sparse.issparse(X_train)), and StandardScaler refuses to centre sparse
# input, so with_mean=False is required. This also matches the recorded repr of
# the fitted scaler, `StandardScaler(with_mean=False)`.
sc = preprocessing.StandardScaler(with_mean=False)

In [88]:
sc.fit(X_train)

StandardScaler(with_mean=False)

In [89]:
X_train = sc.transform(X_train)

In [90]:
print(X_train.shape)

(176505, 115)


In [91]:
X_test = sc.transform(X_test)

In [92]:
print(X_test.shape)

(44127, 115)


#### The data is ready!!