# Data: https://www.kaggle.com/jsphyg/weather-dataset-rattle-package?select=weatherAUS.csv

# Import data

In [241]:
import pandas as pd 
import seaborn as sns

In [242]:
data = pd.read_csv('weatherAUS - tiny.csv')

In [243]:
data.tail()

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RISK_MM,RainTomorrow
9994,04/01/2012,CoffsHarbour,19.6,28.6,0.0,7.4,10.0,NE,56.0,NNW,...,57.0,1015.9,1011.6,1.0,1.0,24.9,26.5,No,0.6,No
9995,05/01/2012,CoffsHarbour,21.3,26.5,0.6,7.6,6.4,NNE,31.0,S,...,70.0,1016.1,1014.2,6.0,7.0,24.6,24.8,No,0.0,No
9996,06/01/2012,CoffsHarbour,18.4,27.6,0.0,5.0,10.6,SSW,56.0,N,...,67.0,1011.5,1012.4,1.0,6.0,25.3,25.9,No,0.0,No
9997,07/01/2012,CoffsHarbour,18.3,26.1,0.0,7.6,9.0,SW,28.0,SW,...,63.0,1015.6,1013.1,3.0,7.0,22.9,24.7,No,0.0,No
9998,08/01/2012,CoffsHarbour,21.4,29.2,0.0,5.8,12.8,NNE,61.0,N,...,64.0,1010.8,1006.6,1.0,4.0,26.0,27.8,No,2.0,Yes


In [244]:
X = data.iloc[:,:-1]

In [245]:
X.head()

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RISK_MM
0,01/12/2008,Albury,13.4,22.9,0.6,,,W,44.0,W,...,71.0,22.0,1007.7,1007.1,8.0,,16.9,21.8,No,0.0
1,02/12/2008,Albury,7.4,25.1,0.0,,,WNW,44.0,NNW,...,44.0,25.0,1010.6,1007.8,,,17.2,24.3,No,0.0
2,03/12/2008,Albury,12.9,25.7,0.0,,,WSW,46.0,W,...,38.0,30.0,1007.6,1008.7,,2.0,21.0,23.2,No,0.0
3,04/12/2008,Albury,9.2,28.0,0.0,,,NE,24.0,SE,...,45.0,16.0,1017.6,1012.8,,,18.1,26.5,No,1.0
4,05/12/2008,Albury,17.5,32.3,1.0,,,W,41.0,ENE,...,82.0,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,No,0.2


# Only for this dataset, delete otherwise (Weather AUS)

X.drop('RISK_MM', axis=1, inplace=True)

In [246]:
X.drop('RISK_MM', axis=1, inplace=True)

In [247]:
y = data.iloc[:,-1]

In [248]:
y.head()

0    No
1    No
2    No
3    No
4    No
Name: RainTomorrow, dtype: object

# Handling missing data - Numeric type

In [249]:
import numpy as np
from sklearn.impute import SimpleImputer
imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')

In [250]:
#X.dtypes

In [251]:
numerical_cols = list(np.where((X.dtypes == np.int64) | (X.dtypes == np.float64))[0])

In [252]:
imp_mean.fit(X.iloc[:,numerical_cols])

SimpleImputer()

In [253]:
X.iloc[:,numerical_cols] = imp_mean.transform(X.iloc[:,numerical_cols])

### Handling missing string data

In [254]:
string_cols = list(np.where((X.dtypes == np.object))[0])

In [255]:
imp_mean = SimpleImputer(missing_values=np.nan, strategy='most_frequent')

In [256]:
imp_mean.fit(X.iloc[:,string_cols])

SimpleImputer(strategy='most_frequent')

In [257]:
X.iloc[:,string_cols] = imp_mean.transform(X.iloc[:,string_cols])

# One Hot encoder method

In [258]:
def OneHotEncoderMethod(indices, data):
    from sklearn.compose import ColumnTransformer
    from sklearn.preprocessing import OneHotEncoder
    columnTransformer = ColumnTransformer([('encoder', OneHotEncoder(),indices )], remainder='passthrough')
    return columnTransformer.fit_transform(data)

# Label encoding method

In [259]:
def LabelEncoderMethod(series):
    from sklearn import preprocessing
    le = preprocessing.LabelEncoder()
    le.fit(series)
    #print('Actual labels',le.classes_)
    #print('Encoding values',le.transform(pd.unique(series)))
    return le.transform(series) 

# Label encoding target feature

In [260]:
y = LabelEncoderMethod(y)

# Encoding selection for X

In [261]:
def EncodingSelection(X, threshold=10):
    # Step 01 : Select the string col
    string_cols = list(np.where((X.dtypes == np.object))[0])
    one_hot_encoding_indices = []
    
    # Step 02: The number of categoty is 2 and more than threshold, label encode
    for col in string_cols:
        lenght = len(pd.unique(X[X.columns[col]]))
        if lenght == 2 or lenght > threshold:
            X[X.columns[col]] = LabelEncoderMethod(X[X.columns[col]])
        else:
            one_hot_encoding_indices.append(col)
            
    # Step 03: One hot encode otherwise 
    X = OneHotEncoderMethod(one_hot_encoding_indices, X)
    return X

In [262]:
X = EncodingSelection(X)

In [263]:
X.shape

(9999, 25)

# Feature selection

In [264]:
from sklearn.feature_selection import SelectKBest, chi2

In [265]:
kbest = SelectKBest(score_func=chi2, k=10)

In [266]:
from sklearn import preprocessing
MMS = preprocessing.MinMaxScaler()

In [267]:
K_features = 10

In [268]:
x_temp = MMS.fit_transform(X)

In [269]:
x_temp = kbest.fit(x_temp,y)

In [270]:
best_features = np.argsort(x_temp.scores_)[-K_features:]

In [271]:
best_features

array([11, 23, 16,  7,  2, 21,  3, 20, 17, 24], dtype=int64)

In [272]:
features_to_delete = best_features = np.argsort(x_temp.scores_)[:-K_features]

In [273]:
X = np.delete(X, features_to_delete, axis=1)

In [274]:
X.shape

(9999, 10)

In [275]:
del x_temp

# Train test split

In [276]:
import numpy as np
from sklearn.model_selection import train_test_split

In [277]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=1)

In [278]:
print(X_train.shape)

(7999, 10)


# Feature scaling

### Standardization: (X - mean(X)) / std(X)

### Normalization = (x - min(x)) / (max(x) - min(x))

In [279]:
from sklearn import preprocessing

In [280]:
sc = preprocessing.StandardScaler(with_mean=False)

In [281]:
sc.fit(X_train)

StandardScaler(with_mean=False)

In [282]:
X_train = sc.transform(X_train)

In [283]:
print(X_train.shape)

(7999, 10)


In [284]:
X_test = sc.transform(X_test)

In [285]:
print(X_test.shape)

(2000, 10)


#### The data is ready!!

In [286]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

# Building Logistic regression model

In [287]:
from sklearn.linear_model import LogisticRegression
LRM = LogisticRegression(random_state=0,max_iter=500)
LRM.fit(X_train,y_train)
y_pred = LRM.predict(X_test)
accuracy_score(y_pred,y_test)


0.872

### Optimal threshold value for Logistic regression

In [288]:
from sklearn.metrics import roc_curve, auc
predicted_probabilities = LRM.predict_proba(X_test)

In [289]:
predicted_probabilities

array([[0.98390812, 0.01609188],
       [0.55107343, 0.44892657],
       [0.97944452, 0.02055548],
       ...,
       [0.88761113, 0.11238887],
       [0.98289634, 0.01710366],
       [0.52026167, 0.47973833]])

In [290]:
fpr, tpr, thresholds = roc_curve(y_test,predicted_probabilities[:,1])

In [291]:
auc(fpr, tpr)

0.8841075175457638

In [300]:
thresholds.shape

(372,)

In [293]:
accuracies = []
for thresh in thresholds:
    _predictions = [1 if i >= thresh else 0 for i in predicted_probabilities[:, -1]]
    accuracies.append(accuracy_score(y_test, _predictions, normalize=True))

In [294]:
accuracies = pd.concat([pd.Series(thresholds), pd.Series(accuracies)],
                        axis=1)
accuracies.columns = ['threshold', 'accuracy']
accuracies.sort_values(by='accuracy', ascending=False, inplace=True)
accuracies.head()

Unnamed: 0,threshold,accuracy
78,0.547545,0.876
76,0.560424,0.8755
114,0.428764,0.8755
112,0.432886,0.8755
110,0.433176,0.8755


In [295]:
optimal_proba_cutoff = accuracies['threshold'].iloc[0]

In [296]:
roc_predictions = [1 if i >= optimal_proba_cutoff else 0 for i in predicted_probabilities[:, -1]]

In [297]:
print(classification_report(roc_predictions, y_test))

              precision    recall  f1-score   support

           0       0.97      0.89      0.93      1770
           1       0.48      0.77      0.59       230

    accuracy                           0.88      2000
   macro avg       0.72      0.83      0.76      2000
weighted avg       0.91      0.88      0.89      2000



In [298]:
print(classification_report(y_pred, y_test))

              precision    recall  f1-score   support

           0       0.96      0.89      0.92      1740
           1       0.51      0.73      0.60       260

    accuracy                           0.87      2000
   macro avg       0.73      0.81      0.76      2000
weighted avg       0.90      0.87      0.88      2000

