In [10]:
import pandas as pd
import matplotlib as plt
import scipy.stats as stats
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.dummy import DummyClassifier
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.impute import KNNImputer, SimpleImputer

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV

from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, classification_report, plot_confusion_matrix, plot_roc_curve


from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.tree import DecisionTreeClassifier 
from sklearn import tree

In [2]:
# Read data/weatherAUS.csv into `df` and preview its first rows.
weather_csv = 'data/weatherAUS.csv'
df = pd.read_csv(weather_csv)
df.head()

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,2008-12-01,Albury,13.4,22.9,0.6,,,W,44.0,W,...,71.0,22.0,1007.7,1007.1,8.0,,16.9,21.8,No,No
1,2008-12-02,Albury,7.4,25.1,0.0,,,WNW,44.0,NNW,...,44.0,25.0,1010.6,1007.8,,,17.2,24.3,No,No
2,2008-12-03,Albury,12.9,25.7,0.0,,,WSW,46.0,W,...,38.0,30.0,1007.6,1008.7,,2.0,21.0,23.2,No,No
3,2008-12-04,Albury,9.2,28.0,0.0,,,NE,24.0,SE,...,45.0,16.0,1017.6,1012.8,,,18.1,26.5,No,No
4,2008-12-05,Albury,17.5,32.3,1.0,,,W,41.0,ENE,...,82.0,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,No,No


In [3]:
# Parse the Date column once, then derive the calendar month from the parsed values.
parsed_dates = pd.to_datetime(df["Date"])
df["Date"] = parsed_dates
df["Month"] = parsed_dates.dt.month

In [4]:
# Rows without a RainTomorrow label cannot be used as training targets, so discard them.
df = df.dropna(subset=['RainTomorrow'])

In [5]:
# Remove the columns that are not carried forward as features.
unused_cols = ['Evaporation', 'Date', 'Sunshine']
df = df.drop(columns=unused_cols)

In [6]:
# Show the first rows of df.
df.head()

Unnamed: 0,Location,MinTemp,MaxTemp,Rainfall,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,WindSpeed9am,WindSpeed3pm,...,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow,Month
0,Albury,13.4,22.9,0.6,W,44.0,W,WNW,20.0,24.0,...,22.0,1007.7,1007.1,8.0,,16.9,21.8,No,No,12
1,Albury,7.4,25.1,0.0,WNW,44.0,NNW,WSW,4.0,22.0,...,25.0,1010.6,1007.8,,,17.2,24.3,No,No,12
2,Albury,12.9,25.7,0.0,WSW,46.0,W,WSW,19.0,26.0,...,30.0,1007.6,1008.7,,2.0,21.0,23.2,No,No,12
3,Albury,9.2,28.0,0.0,NE,24.0,SE,E,11.0,9.0,...,16.0,1017.6,1012.8,,,18.1,26.5,No,No,12
4,Albury,17.5,32.3,1.0,W,41.0,ENE,NW,7.0,20.0,...,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,No,No,12


In [7]:
# Print df's column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 142193 entries, 0 to 145458
Data columns (total 21 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   Location       142193 non-null  object 
 1   MinTemp        141556 non-null  float64
 2   MaxTemp        141871 non-null  float64
 3   Rainfall       140787 non-null  float64
 4   WindGustDir    132863 non-null  object 
 5   WindGustSpeed  132923 non-null  float64
 6   WindDir9am     132180 non-null  object 
 7   WindDir3pm     138415 non-null  object 
 8   WindSpeed9am   140845 non-null  float64
 9   WindSpeed3pm   139563 non-null  float64
 10  Humidity9am    140419 non-null  float64
 11  Humidity3pm    138583 non-null  float64
 12  Pressure9am    128179 non-null  float64
 13  Pressure3pm    128212 non-null  float64
 14  Cloud9am       88536 non-null   float64
 15  Cloud3pm       85099 non-null   float64
 16  Temp9am        141289 non-null  float64
 17  Temp3pm        139467 non-nul

In [8]:
# Work on an independent copy so later filtering leaves `df` untouched.
df2 = df.copy(deep=True)

In [9]:
# Trim extreme high values: for every listed feature keep only the rows whose value is
# missing or strictly below that feature's upper quantile. The thresholds are applied
# one feature after another, so each quantile is taken over the rows that survived
# the previous features.
#
# The filter always restarts from `df`, which makes the cell idempotent: re-running it
# yields the same `df2` instead of shaving off a further 1% per feature on every run.
UPPER_QUANTILE = 0.99
features = ['MinTemp', 'MaxTemp', 'Rainfall', 'WindGustSpeed', 'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am',
            'Humidity3pm', 'Pressure9am', 'Pressure3pm', 'Temp9am', 'Temp3pm']
df2 = df.copy()
for feature in features:
    qupper = df2[feature].quantile(UPPER_QUANTILE)
    # NaNs are kept here on purpose; they are imputed later in the notebook.
    keep_rows = df2[feature].isna() | (df2[feature] < qupper)
    df2 = df2.loc[keep_rows]

KeyboardInterrupt: 

In [None]:
# Print df2's column names, non-null counts and dtypes.
df2.info()

In [None]:
# Total number of missing values across every column of df2.
df2.isna().sum().sum()

In [None]:
# Separate the label column from the predictor columns.
target_col = 'RainTomorrow'
y = df2[target_col]
X = df2.drop(columns=target_col)

In [None]:
# Split X and y into train and test partitions; random_state=42 fixes the shuffle so
# the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [None]:
# Fill gaps in the categorical columns with each column's most frequent value.
# The imputer is fitted on the training rows only and then applied to both splits.
cols_simp = ['WindGustDir', 'WindDir9am', 'WindDir3pm', 'RainToday']
simputer = SimpleImputer(strategy='most_frequent').fit(X_train[cols_simp])
simputed = simputer.transform(X_train[cols_simp])
simptest = simputer.transform(X_test[cols_simp])
# Re-wrap the imputed array so it lines up with X_train by index and column name.
df_simp_tranformed = pd.DataFrame(data=simputed, columns=cols_simp, index=X_train.index)
df_simp_tranformed


In [None]:
# Drop the original (un-imputed) categorical columns; the imputed versions replace them.
X_train = X_train.drop(columns=cols_simp)

In [None]:
# Attach the imputed categorical columns to the remaining training features (aligned on index).
X_train_imp = pd.concat([X_train, df_simp_tranformed], axis='columns')

In [None]:
# Display the training features with the imputed categorical columns attached.
X_train_imp

In [None]:
# One-hot encode the categorical columns (Month is treated as a category too).
# handle_unknown='ignore' makes categories unseen during fit encode as all zeros.
cols_encode = ['Location', 'WindGustDir', 'WindDir9am', 'WindDir3pm', 'RainToday', 'Month']
ohe = OneHotEncoder(sparse=False, handle_unknown='ignore')
ohe_fit = ohe.fit_transform(X_train_imp[cols_encode])
encoded_names = ohe.get_feature_names(cols_encode)
ohe_df = pd.DataFrame(data=ohe_fit, index=X_train_imp.index, columns=encoded_names)
ohe_df

In [None]:
# Keep only the columns that were not one-hot encoded.
X_train_int = X_train_imp.drop(columns=cols_encode)

In [None]:
# Combine the untouched columns with their one-hot encoded counterparts (aligned on index).
X_train_ohe = pd.concat([X_train_int, ohe_df], axis='columns')
X_train_ohe

In [None]:
#imputer = KNNImputer(n_neighbors=5, weights="uniform")
#imputed = imputer.fit_transform(X_train_ohe)
#X_train_KNNI = pd.DataFrame(imputed, index = X_train_ohe.index, columns = X_train_ohe.columns)


In [13]:
# Load the cached feature matrices from disk.
# NOTE(review): presumably these files hold the output of the commented-out KNNImputer
# step — confirm how they were written.
# Both files are read with index_col=0 so the saved row labels become the index in
# both frames; without it the test frame would gain an extra "Unnamed: 0" feature
# column that the training frame does not have.
X_train_KNNI = pd.read_csv("data/X_train_KNNI.csv", index_col=0)
X_test_KNNI = pd.read_csv("data/X_test_KNNI.csv", index_col=0)

In [14]:
# Display the training feature matrix loaded from data/X_train_KNNI.csv.
X_train_KNNI

Unnamed: 0,MinTemp,MaxTemp,Rainfall,Sunshine,WindGustSpeed,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,...,Month_3,Month_4,Month_5,Month_6,Month_7,Month_8,Month_9,Month_10,Month_11,Month_12
42305,12.8,23.5,0.0,7.600000,44.0,13.0,33.0,50.0,64.0,1014.7,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
20387,13.8,18.5,0.0,8.402434,54.0,11.0,19.0,75.0,84.0,1022.9,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
32951,15.5,24.9,0.0,7.600000,39.0,7.0,20.0,56.0,52.0,1022.6,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
91385,21.1,29.9,0.0,8.800000,35.0,20.0,30.0,66.0,74.0,1017.3,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
115228,11.1,17.4,9.8,5.100000,48.0,30.0,19.0,65.0,42.0,1021.1,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
141951,21.1,32.5,0.0,10.500000,43.0,22.0,13.0,42.0,20.0,1015.3,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
123095,14.2,19.4,33.0,2.900000,61.0,11.0,24.0,87.0,70.0,1006.8,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
940,-1.4,14.7,0.0,4.600000,17.0,2.0,9.0,92.0,52.0,1028.9,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
18334,8.8,14.7,0.2,9.250038,43.0,15.0,31.0,53.0,66.0,1023.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [16]:
# Display the training labels. y_train is created by the train_test_split cell, so that
# cell has to be executed in the current kernel session before this one.
y_train

NameError: name 'y_train' is not defined

In [None]:
# Standardise the continuous measurement columns with scikit-learn's StandardScaler.
scale_cols = ['MinTemp', 'MaxTemp', 'Rainfall', 'WindGustSpeed', 'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am',
            'Humidity3pm', 'Pressure9am', 'Pressure3pm', 'Temp9am', 'Temp3pm','Cloud9am','Cloud3pm']
scale_feats = X_train_KNNI[scale_cols]
# Keep the *fitted* scaler (not just its output) so the same training statistics can
# later be applied to the test set with scaler.transform(...).
scaler = StandardScaler().fit(scale_feats.values)
# Write the scaled values into a copy: X_train_KNNI stays exactly as loaded, so this
# cell can be re-run safely and earlier displays of X_train_KNNI remain accurate.
X_train_scaled = X_train_KNNI.copy()
X_train_scaled[scale_cols] = scaler.transform(scale_feats.values)

In [None]:
# Print X_train_scaled's column names, non-null counts and dtypes.
X_train_scaled.info()

# Modeling

## Baseline

In [None]:
# Fit a DummyClassifier as the reference point the real models are compared against.
baseline = DummyClassifier(random_state=42).fit(X_train_KNNI, y_train)

# Confusion matrix of the baseline on the training data.
plot_confusion_matrix(baseline, X_train_KNNI, y_train, cmap='rocket');

In [None]:
# ROC curve plus summary metrics for the baseline on the training data.
plot_roc_curve(baseline, X_train_KNNI, y_train);

# Predict once and reuse the result for every metric; "Yes" is the positive class.
baseline_preds = baseline.predict(X_train_KNNI)
positive = {"pos_label": "Yes"}
baseline_accuracy = accuracy_score(y_train, baseline_preds)
baseline_recall = recall_score(y_train, baseline_preds, **positive)
baseline_precision = precision_score(y_train, baseline_preds, **positive)
baseline_f1 = f1_score(y_train, baseline_preds, **positive)
baselinecrossval = cross_val_score(baseline, X_train_KNNI, y_train)
print(f"""
Accuracy
Baseline: {baseline_accuracy:1.3f} 
Recall
Baseline: {baseline_recall:1.3f} 
Precision
Baseline: {baseline_precision:1.3f} 
F1 Score
Baseline: {baseline_f1:1.3f}
Cross Val Scores
Baseline:{baselinecrossval.round(3)}
""")



## Simple Logistic Regression

In [None]:
# Plain logistic regression on the scaled training features, followed by its
# training-set confusion matrix.
logreg_model = LogisticRegression(random_state=42, max_iter=1000).fit(X_train_scaled, y_train)
plot_confusion_matrix(logreg_model, X_train_scaled, y_train, cmap='rocket');

In [None]:
# ROC curve plus summary metrics for the logistic regression on the training data.
plot_roc_curve(logreg_model, X_train_scaled, y_train);

# Predict once and reuse the result for every metric; "Yes" is the positive class.
logreg_preds = logreg_model.predict(X_train_scaled)
positive = {"pos_label": "Yes"}
logreg_model_accuracy = accuracy_score(y_train, logreg_preds)
logreg_model_recall = recall_score(y_train, logreg_preds, **positive)
logreg_model_precision = precision_score(y_train, logreg_preds, **positive)
logreg_model_f1 = f1_score(y_train, logreg_preds, **positive)
logreg_model_crossval = cross_val_score(logreg_model, X_train_scaled, y_train)

print(f"""
Accuracy
Simple Logistic Regression: {logreg_model_accuracy:1.3f} 
Recall
Simple Logistic Regression: {logreg_model_recall:1.3f} 
Precision
Simple Logistic Regression: {logreg_model_precision:1.3f} 
F1 Score
Simple Logistic Regression: {logreg_model_f1:1.3f}
Cross Val Scores
Simple Logistic:{logreg_model_crossval.round(3)}
""")

## Using SMOTE to Balance

In [None]:
# Report how imbalanced the training labels are, as absolute counts and as proportions.
class_counts = y_train.value_counts()
class_shares = y_train.value_counts(normalize=True)
print(
    'Raw counts: \n',
    class_counts,
    '-----------------------------------',
    'Normalized counts: \n',
    class_shares,
    sep='\n',
)

In [None]:
# Oversample the minority class with SMOTE so both classes are equally represented.
# random_state pins the synthetic samples so every re-run produces the same data.
# fit_resample is the supported method name; the older fit_sample alias was removed
# from imbalanced-learn.
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_scaled, y_train)

In [None]:
# Compare the label distribution before and after SMOTE resampling.
original_counts = y_train.value_counts()
resampled_counts = pd.Series(y_train_resampled).value_counts()
print(
    'Original class distribution: \n',
    original_counts,
    '-----------------------------------------',
    'Synthetic sample class distribution: \n',
    resampled_counts,
    sep='\n',
)

In [None]:
# Logistic regression trained on the SMOTE-balanced data, followed by its confusion
# matrix on that same resampled data.
SMOTE_model = LogisticRegression(random_state=42, max_iter=1000).fit(X_train_resampled, y_train_resampled)
plot_confusion_matrix(SMOTE_model, X_train_resampled, y_train_resampled, cmap='rocket');

In [None]:
# ROC curve plus summary metrics for the SMOTE-trained model on the resampled data.
plot_roc_curve(SMOTE_model, X_train_resampled, y_train_resampled);

# Score SMOTE_model itself. The metrics were previously computed from
# logreg_model.predict(...), i.e. they described the un-balanced model rather than the
# one reported under the "SMOTE Model" label. "Yes" is the positive class.
SMOTE_model_preds = SMOTE_model.predict(X_train_resampled)
SMOTE_model_accuracy = accuracy_score(y_train_resampled, SMOTE_model_preds)
SMOTE_model_recall = recall_score(y_train_resampled, SMOTE_model_preds, pos_label="Yes")
SMOTE_model_precision = precision_score(y_train_resampled, SMOTE_model_preds, pos_label="Yes")
SMOTE_model_f1 = f1_score(y_train_resampled, SMOTE_model_preds, pos_label="Yes")
SMOTE_model_crossval = cross_val_score(SMOTE_model, X_train_resampled, y_train_resampled)

print(f"""
Accuracy
SMOTE Model: {SMOTE_model_accuracy:1.3f} 
Recall
SMOTE Model: {SMOTE_model_recall:1.3f} 
Precision
SMOTE Model: {SMOTE_model_precision:1.3f} 
F1 Score
SMOTE Model: {SMOTE_model_f1:1.3f} 
Cross Val Scores
SMOTE Model:{SMOTE_model_crossval.round(3)}
""")

## Gettin' Griddy Wit It

In [None]:
# Hyper-parameter grid: candidate solvers, each given the same iteration budget.
grid = dict(
    solver=['lbfgs', 'sag', 'saga'],
    max_iter=[1000],
)

In [None]:
#gs = GridSearchCV(estimator=SMOTE_model, param_grid=grid, cv=5)
#gs.fit(X_train_resampled, y_train_resampled)
#gs.best_params_

In [None]:
#gs.cv_results_

In [None]:
# Display the SMOTE-resampled training features.
X_train_resampled