In [1]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
weather_data = pd.read_csv('weather.csv')


In [3]:
w = weather_data[['Location', 'MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation', 'Sunshine', 'WindGustDir', 
                       'WindGustSpeed', 'WindDir9am', 'WindDir3pm', 'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am', 
                       'Humidity3pm', 'Pressure9am', 'Pressure3pm', 'Cloud9am', 'Cloud3pm', 'Temp9am', 'Temp3pm', 
                       'RainToday']]

In [4]:
w.RainToday.unique()

array(['No', 'Yes', nan], dtype=object)

In [5]:
weather_data.WindGustDir.unique()

array(['W', 'WNW', 'WSW', 'NE', 'NNW', 'N', 'NNE', 'SW', 'ENE', 'SSE',
       'S', 'NW', 'SE', 'ESE', nan, 'E', 'SSW'], dtype=object)

In [6]:
weather_data.WindGustDir.head()

0      W
1    WNW
2    WSW
3     NE
4      W
Name: WindGustDir, dtype: object

In [7]:
weather_data['RainToday'].replace({'No': 0, 'Yes': 1},inplace = True)
weather_data['RainTomorrow'].replace({'No': 0, 'Yes': 1},inplace = True)



In [8]:
from sklearn.utils import resample

no = weather_data[weather_data.RainTomorrow == 0]
yes = weather_data[weather_data.RainTomorrow == 1]
yes_oversampled = resample(yes, replace=True, n_samples=len(no), random_state=123)
w_oversampled = pd.concat([no, yes_oversampled])



In [9]:
total = w_oversampled.isnull().sum().sort_values(ascending=False)
percent = (w_oversampled.isnull().sum()/w_oversampled.isnull().count()).sort_values(ascending=False)
missing = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])


In [10]:
w_oversampled.select_dtypes(include=['object']).columns

Index(['Date', 'Location', 'WindGustDir', 'WindDir9am', 'WindDir3pm'], dtype='object')

In [11]:
w_oversampled['Date'] = w_oversampled['Date'].fillna(w_oversampled['Date'].mode()[0])
w_oversampled['Location'] = w_oversampled['Location'].fillna(w_oversampled['Location'].mode()[0])
w_oversampled['WindGustDir'] = w_oversampled['WindGustDir'].fillna(w_oversampled['WindGustDir'].mode()[0])
w_oversampled['WindDir9am'] = w_oversampled['WindDir9am'].fillna(w_oversampled['WindDir9am'].mode()[0])
w_oversampled['WindDir3pm'] = w_oversampled['WindDir3pm'].fillna(w_oversampled['WindDir3pm'].mode()[0])

In [12]:
from sklearn.preprocessing import LabelEncoder
lencoders = {}
for col in w_oversampled.select_dtypes(include=['object']).columns:
    lencoders[col] = LabelEncoder()
    w_oversampled[col] = lencoders[col].fit_transform(w_oversampled[col])

In [13]:
import warnings
warnings.filterwarnings("ignore")


# Multiple Imputation by Chained Equations
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
MiceImputed = w_oversampled.copy(deep=True) 
mice_imputer = IterativeImputer()
MiceImputed.iloc[:, :] = mice_imputer.fit_transform(w_oversampled)

In [14]:
# Detecting outliers with IQR
Q1 = MiceImputed.quantile(0.25)
Q3 = MiceImputed.quantile(0.75)
IQR = Q3 - Q1
print(IQR)

Date             1712.000000
Location           25.000000
MinTemp             9.300000
MaxTemp            10.200000
Rainfall            2.400000
Evaporation         4.068832
Sunshine            5.962861
WindGustDir         9.000000
WindGustSpeed      19.000000
WindDir9am          8.000000
WindDir3pm          8.000000
WindSpeed9am       13.000000
WindSpeed3pm       11.000000
Humidity9am        26.000000
Humidity3pm        30.000000
Pressure9am         8.800000
Pressure3pm         8.800000
Cloud9am            4.000000
Cloud3pm            3.687747
Temp9am             9.300000
Temp3pm             9.800000
RainToday           1.000000
RISK_MM             5.200000
RainTomorrow        1.000000
dtype: float64


In [15]:
# Removing outliers from the dataset
MiceImputed = MiceImputed[~((MiceImputed < (Q1 - 1.5 * IQR)) |(MiceImputed > (Q3 + 1.5 * IQR))).any(axis=1)]
MiceImputed.shape

(156703, 24)

In [16]:
features = MiceImputed[['Location', 'MinTemp', 'MaxTemp', 'Sunshine', 'WindGustDir', 
                       'WindGustSpeed', 'WindDir9am', 'WindDir3pm', 'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am', 
                       'Humidity3pm', 'Cloud3pm', 'Temp9am', 'Temp3pm']]


# features2 = MiceImputed[["Sunshine", "Humidity9am", "Cloud3pm"]]

target = MiceImputed['RainTomorrow']

# Split into test and train
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.25, random_state=12345)


# X_train2, X_test2, y_train2, y_test2 = train_test_split(features2, target, test_size=0.25, random_state=12345)



# Normalize Features
# from sklearn.preprocessing import StandardScaler
# scaler = StandardScaler()



# X_train = scaler.fit_transform(X_train)
# X_test = scaler.fit_transform(X_test)

# X_train2 = scaler.fit_transform(X_train2)
# X_test2 = scaler.fit_transform(X_test2)



In [17]:
X_train.iloc[19, :]

Location         30.000000
MinTemp           7.100000
MaxTemp          18.200000
Sunshine          5.671103
WindGustDir       5.000000
WindGustSpeed    17.000000
WindDir9am        3.000000
WindDir3pm       13.000000
WindSpeed9am      0.000000
WindSpeed3pm      0.000000
Humidity9am      97.000000
Humidity3pm      61.000000
Cloud3pm          4.767781
Temp9am          11.400000
Temp3pm          18.100000
Name: 24072, dtype: float64

In [18]:
# import time
# from sklearn.metrics import accuracy_score, roc_auc_score, cohen_kappa_score, plot_confusion_matrix, roc_curve, classification_report
# def run_model(model, X_train, y_train, X_test, y_test, verbose=True):
#     t0=time.time()
#     if verbose == False:
#         model.fit(X_train,y_train, verbose=0)
#     else:
#         model.fit(X_train,y_train)
#     y_pred = model.predict(X_test)
#     accuracy = accuracy_score(y_test, y_pred)
#     roc_auc = roc_auc_score(y_test, y_pred) 
#     coh_kap = cohen_kappa_score(y_test, y_pred)
#     time_taken = time.time()-t0
#     print("Accuracy = {}".format(accuracy))
#     print("ROC Area under Curve = {}".format(roc_auc))
#     print("Cohen's Kappa = {}".format(coh_kap))
#     print("Time taken = {}".format(time_taken))
#     print(classification_report(y_test,y_pred,digits=5))
    
# #     probs = model.predict_proba(X_test)  
# #     probs = probs[:, 1]  
# #     fper, tper, thresholds = roc_curve(y_test, probs) 
# #     plot_roc_cur(fper, tper)
    
#     plot_confusion_matrix(model, X_test, y_test,cmap=plt.cm.Blues, normalize = 'all')
    
#     return model, accuracy, roc_auc, coh_kap, time_taken

In [19]:
# XGBoost
import xgboost as xgb

model_x = xgb.XGBClassifier(n_estimators = 500, max_depth = 16)

model_x.fit(X_train, y_train)

pred = model_x.predict(X_test)

model_x.score(X_test, y_test)
    


# model_xgb, accuracy_xgb, roc_auc_xgb, coh_kap_xgb, tt_xgb = run_model(model_xgb, X_train1, y_train1, X_test1, y_test1)



0.9589544619154584

In [20]:
import pickle

pickle.dump(model_x, open("model.pkl", "wb"))

model = pickle.load(open("model.pkl", "rb"))

