# Data: https://www.kaggle.com/jsphyg/weather-dataset-rattle-package?select=weatherAUS.csv

# Import data

In [1]:
import pandas as pd 
import seaborn as sns

In [2]:
# Load the Weather AUS sample into a DataFrame (path is relative to the working directory).
# NOTE(review): the CSV carries a saved index column ('Unnamed: 0', visible in the output
# below) that stays in the frame as a candidate feature - consider read_csv(..., index_col=0).
data = pd.read_csv('weatherAUS - tiny.csv')

In [3]:
data.tail()

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RISK_MM,RainTomorrow
9994,04/01/2012,CoffsHarbour,19.6,28.6,0.0,7.4,10.0,NE,56.0,NNW,...,57.0,1015.9,1011.6,1.0,1.0,24.9,26.5,No,0.6,No
9995,05/01/2012,CoffsHarbour,21.3,26.5,0.6,7.6,6.4,NNE,31.0,S,...,70.0,1016.1,1014.2,6.0,7.0,24.6,24.8,No,0.0,No
9996,06/01/2012,CoffsHarbour,18.4,27.6,0.0,5.0,10.6,SSW,56.0,N,...,67.0,1011.5,1012.4,1.0,6.0,25.3,25.9,No,0.0,No
9997,07/01/2012,CoffsHarbour,18.3,26.1,0.0,7.6,9.0,SW,28.0,SW,...,63.0,1015.6,1013.1,3.0,7.0,22.9,24.7,No,0.0,No
9998,08/01/2012,CoffsHarbour,21.4,29.2,0.0,5.8,12.8,NNE,61.0,N,...,64.0,1010.8,1006.6,1.0,4.0,26.0,27.8,No,2.0,Yes


In [4]:
# Features: every column except the last one (the target).
# .copy() gives X its own data, so later writes into X (e.g. the imputation
# write-back) never act on, or warn about, a slice of `data`.
X = data.iloc[:, :-1].copy()

In [5]:
X.head()

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RISK_MM
0,01/12/2008,Albury,13.4,22.9,0.6,,,W,44.0,W,...,71.0,22.0,1007.7,1007.1,8.0,,16.9,21.8,No,0.0
1,02/12/2008,Albury,7.4,25.1,0.0,,,WNW,44.0,NNW,...,44.0,25.0,1010.6,1007.8,,,17.2,24.3,No,0.0
2,03/12/2008,Albury,12.9,25.7,0.0,,,WSW,46.0,W,...,38.0,30.0,1007.6,1008.7,,2.0,21.0,23.2,No,0.0
3,04/12/2008,Albury,9.2,28.0,0.0,,,NE,24.0,SE,...,45.0,16.0,1017.6,1012.8,,,18.1,26.5,No,1.0
4,05/12/2008,Albury,17.5,32.3,1.0,,,W,41.0,ENE,...,82.0,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,No,0.2


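# Only for this dataset (Weather AUS) — delete this step otherwise

`RISK_MM` is the amount of rain recorded for the following day, i.e. the quantity the target `RainTomorrow` is derived from. Keeping it as a feature would leak the answer into the model, so the next cell drops the `RISK_MM` column from `X`.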

In [6]:
# Remove the RISK_MM column from the features.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps this cell
# idempotent: re-running it, or running it on data without the column, no longer
# raises KeyError.
X = X.drop(columns='RISK_MM', errors='ignore')

In [7]:
# Target: the last column of the raw frame (RainTomorrow, per the output below).
y = data.iloc[:,-1]

In [8]:
y.head()

0    No
1    No
2    No
3    No
4    No
Name: RainTomorrow, dtype: object

# Handling missing data - Numeric type

In [9]:
import numpy as np
from sklearn.impute import SimpleImputer
# Imputer that replaces NaN entries with the mean of their column (used for numeric columns).
imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')

In [10]:
#X.dtypes

In [11]:
# Positional indices of the int64 / float64 columns, i.e. the ones to mean-impute.
is_numeric = (X.dtypes == np.int64) | (X.dtypes == np.float64)
numerical_cols = list(np.flatnonzero(is_numeric.to_numpy()))

In [12]:
# Learn the column means of the numeric columns.
# NOTE(review): this is fitted on the full dataset, before the train/test split further
# down, so test-set statistics leak into the imputation.
imp_mean.fit(X.iloc[:,numerical_cols])

SimpleImputer()

In [13]:
# Write the imputed values back into the numeric columns of X.
X.iloc[:,numerical_cols] = imp_mean.transform(X.iloc[:,numerical_cols])

### Handling missing string data

In [14]:
# Positional indices of the object-dtype (string) columns.
# `np.object` was only an alias of the builtin `object` and has been removed from NumPy
# (AttributeError on 1.24+), so compare against `object` directly.
string_cols = list(np.where(X.dtypes == object)[0])

In [15]:
# Imputer for the string columns: NaN is replaced by the most frequent value of the column.
# NOTE(review): the name `imp_mean` is reused from the numeric imputer although this one
# does not compute a mean - a dedicated name would avoid confusion.
imp_mean = SimpleImputer(missing_values=np.nan, strategy='most_frequent')

In [16]:
# Learn the most frequent value of each string column.
# NOTE(review): fitted on the full dataset, i.e. before the train/test split further down.
imp_mean.fit(X.iloc[:,string_cols])

SimpleImputer(strategy='most_frequent')

In [17]:
# Write the imputed values back into the string columns of X.
X.iloc[:,string_cols] = imp_mean.transform(X.iloc[:,string_cols])

# One Hot encoder method

In [18]:
def OneHotEncoderMethod(indices, data):
    """One-hot encode the columns at ``indices`` and pass all other columns through.

    Parameters
    ----------
    indices : list of int
        Positional indices of the columns to one-hot encode.
    data : DataFrame or array-like
        Table to transform.

    Returns
    -------
    The result of ``ColumnTransformer.fit_transform(data)`` for a transformer made of
    a single ``OneHotEncoder`` step (named ``'encoder'``) with
    ``remainder='passthrough'``.
    """
    from sklearn.compose import ColumnTransformer
    from sklearn.preprocessing import OneHotEncoder

    encoder_step = ('encoder', OneHotEncoder(), indices)
    transformer = ColumnTransformer(transformers=[encoder_step], remainder='passthrough')
    return transformer.fit_transform(data)

# Label encoding method

In [19]:
def LabelEncoderMethod(series):
    """Encode the labels in ``series`` as integer codes.

    A fresh ``LabelEncoder`` is fitted on ``series`` and immediately applied to it.
    The fitted encoder is not returned, so the label <-> code mapping cannot be
    inspected or reused on other data afterwards.

    Returns the encoded values, one per element of ``series``.
    """
    from sklearn.preprocessing import LabelEncoder

    # fit() returns the encoder itself, so fitting and encoding chain directly.
    return LabelEncoder().fit(series).transform(series)

# Label encoding target feature

In [20]:
# Replace the string target (e.g. 'No' in the output above) with integer class codes.
# NOTE(review): re-running this cell re-encodes the already encoded y.
y = LabelEncoderMethod(y)

# Encoding selection for X

In [21]:
def EncodingSelection(X, threshold=10):
    """Encode every string (object-dtype) column of ``X`` and return the result.

    The number of distinct values in a string column decides its encoding:

    * exactly 2 distinct values, or more than ``threshold`` -> label encoding
      (``LabelEncoderMethod``), written back into the column;
    * anything else -> one-hot encoding (``OneHotEncoderMethod``).

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table. It is copied first, so the caller's frame is left untouched.
    threshold : int, default 10
        Largest number of categories that is still one-hot encoded.

    Returns
    -------
    Whatever ``OneHotEncoderMethod`` returns for the label-encoded copy of ``X``.
    """
    # Work on a copy: writing the label-encoded columns straight into the caller's
    # DataFrame would be a hidden side effect.
    X = X.copy()

    # Step 01: positions of the string columns. Compare against the builtin
    # `object`; the `np.object` alias has been removed from NumPy.
    string_cols = list(np.where(X.dtypes == object)[0])
    one_hot_encoding_indices = []

    # Step 02: label encode columns with exactly 2 categories or more than `threshold`.
    for col in string_cols:
        name = X.columns[col]
        n_categories = len(pd.unique(X[name]))
        if n_categories == 2 or n_categories > threshold:
            X[name] = LabelEncoderMethod(X[name])
        else:
            one_hot_encoding_indices.append(col)

    # Step 03: one hot encode the remaining string columns.
    return OneHotEncoderMethod(one_hot_encoding_indices, X)

In [22]:
X = EncodingSelection(X)

In [23]:
X.shape

(9999, 25)

# Feature selection

In [24]:
from sklearn.feature_selection import SelectKBest, chi2

In [25]:
# Only the per-feature chi2 scores (`scores_`) of this selector are used below; the
# actual cut-off is K_features. k='all' avoids a hard-coded k=10 that is unrelated to
# that cut-off and that can fail when X has fewer than 10 columns.
kbest = SelectKBest(score_func=chi2, k='all')

In [26]:
from sklearn import preprocessing
# Min-max scaler used only for feature scoring: the chi2 score function expects
# non-negative feature values, which min-max scaling guarantees.
MMS = preprocessing.MinMaxScaler()

In [27]:
# Number of top-scoring features to keep after feature selection.
K_features = 5

In [28]:
# Scaled copy of X used only for scoring; X itself is not modified here.
# NOTE(review): scaler and scores are computed on the full dataset, before the train/test split.
x_temp = MMS.fit_transform(X)

In [29]:
# Score every feature against the target.
# NOTE: from here on `x_temp` no longer holds the scaled data - it is rebound to the
# value returned by kbest.fit, whose `scores_` attribute is read in the next cells.
x_temp = kbest.fit(x_temp,y)

In [30]:
# Column indices of the K_features highest scores (np.argsort sorts ascending, so the
# best ones are the last K_features entries).
# NOTE(review): np.argsort places NaN last, so a NaN score would be treated as "best" -
# confirm that no score is NaN.
best_features = np.argsort(x_temp.scores_)[-K_features:]

In [31]:
#best_features

In [32]:
# Column indices of everything except the K_features best-scoring features.
# Assigned on its own: chaining `= best_features =` here would overwrite
# `best_features` with the indices of the *worst* features.
features_to_delete = np.argsort(x_temp.scores_)[:-K_features]

In [33]:
# Drop the low-scoring columns; the remaining columns keep their original order.
# NOTE(review): X is overwritten, so this cell is not idempotent - re-running it on the
# already reduced X will not reproduce the result.
X = np.delete(X, features_to_delete, axis=1)

In [34]:
X.shape

(9999, 5)

In [35]:
del x_temp

# Train test split

In [36]:
import numpy as np
from sklearn.model_selection import train_test_split

In [37]:
# Hold out 20% of the rows for testing; random_state=1 makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=1)

In [38]:
print(X_train.shape)

(7999, 5)


# Feature scaling

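### Standardization: $z = (x - \mathrm{mean}(x)) / \mathrm{std}(x)$

### Normalization: $x' = (x - \min(x)) / (\max(x) - \min(x))$

Note: the scaler below is created with `with_mean=False`, so it only divides by the standard deviation and does not subtract the mean.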

In [39]:
from sklearn import preprocessing

In [40]:
# with_mean=False: the data is not centred, each feature is only divided by its
# standard deviation.
sc = preprocessing.StandardScaler(with_mean=False)

In [41]:
# Fit the scaler on the training split only; the same fitted scaler is then applied
# to both X_train and X_test.
sc.fit(X_train)

StandardScaler(with_mean=False)

In [42]:
X_train = sc.transform(X_train)

In [43]:
print(X_train.shape)

(7999, 5)


In [44]:
X_test = sc.transform(X_test)

In [45]:
print(X_test.shape)

(2000, 5)


#### The data is ready!!

In [46]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

# Building KNN model

In [47]:
from sklearn.neighbors import KNeighborsClassifier
# 3-nearest-neighbours classifier, evaluated on the held-out split.
knnClassifier = KNeighborsClassifier(n_neighbors=3)
knnClassifier.fit(X_train, y_train)
y_pred = knnClassifier.predict(X_test)
# accuracy_score expects (y_true, y_pred): ground truth first.
accuracy_score(y_test, y_pred)

0.8455

# Building Logistic regression model

In [48]:
from sklearn.linear_model import LogisticRegression
# Logistic regression (max_iter raised to 200), evaluated on the held-out split.
LRM = LogisticRegression(random_state=0, max_iter=200)
LRM.fit(X_train, y_train)
y_pred = LRM.predict(X_test)
# accuracy_score expects (y_true, y_pred): ground truth first.
accuracy_score(y_test, y_pred)

0.869

# Building GaussianNB model

In [49]:
from sklearn.naive_bayes import GaussianNB
# Gaussian naive Bayes, evaluated on the held-out split.
model_GNB = GaussianNB()
model_GNB.fit(X_train, y_train)
y_pred = model_GNB.predict(X_test)
# accuracy_score expects (y_true, y_pred): ground truth first.
accuracy_score(y_test, y_pred)

0.822

# Building SVM (SVC) model

In [50]:
from sklearn.svm import SVC
# Support vector classifier with default settings, evaluated on the held-out split.
model_SVC = SVC()
model_SVC.fit(X_train, y_train)
y_pred = model_SVC.predict(X_test)
# accuracy_score expects (y_true, y_pred): ground truth first.
accuracy_score(y_test, y_pred)

0.87

# Building Decision tree model

In [51]:
from sklearn.tree import DecisionTreeClassifier
# Decision tree, evaluated on the held-out split.
# random_state is fixed so that the fitted tree (and the reported score) is
# reproducible on Restart & Run All.
model_DTC = DecisionTreeClassifier(random_state=0)
model_DTC.fit(X_train, y_train)
y_pred = model_DTC.predict(X_test)
# accuracy_score expects (y_true, y_pred): ground truth first.
accuracy_score(y_test, y_pred)

0.8315

# Building Random Forest model

In [52]:
from sklearn.ensemble import RandomForestClassifier
# Random forest (500 trees, depth limited to 5), evaluated on the held-out split.
# random_state is fixed so that the forest (and the reported score) is reproducible
# on Restart & Run All.
model_RFC = RandomForestClassifier(n_estimators=500, max_depth=5, random_state=0)
model_RFC.fit(X_train, y_train)
y_pred = model_RFC.predict(X_test)
# accuracy_score expects (y_true, y_pred): ground truth first.
accuracy_score(y_test, y_pred)

0.8735

# Building ADABoost model

In [53]:
from sklearn.ensemble import AdaBoostClassifier
# AdaBoost with default settings, evaluated on the held-out split.
# random_state is fixed so that the reported score is reproducible on Restart & Run All.
model_ABC = AdaBoostClassifier(random_state=0)
model_ABC.fit(X_train, y_train)
y_pred = model_ABC.predict(X_test)
# accuracy_score expects (y_true, y_pred): ground truth first.
accuracy_score(y_test, y_pred)

0.8715

# Building XGBoost model

In [54]:
import xgboost as xgb
# Gradient-boosted trees (XGBoost), evaluated on the held-out split.
# NOTE(review): `use_label_encoder` is reported as deprecated by recent xgboost
# releases - confirm against the installed version.
model_xgb = xgb.XGBClassifier(use_label_encoder=False)
model_xgb.fit(X_train, y_train)
y_pred = model_xgb.predict(X_test)
# accuracy_score expects (y_true, y_pred): ground truth first.
accuracy_score(y_test, y_pred)



0.865

In [55]:
# Classifier object names

    # knnClassifier
    # LRM
    # model_GNB
    # model_SVC
    # model_DTC
    # model_RFC 
    # model_ABC
    # model_xgb


In [56]:
import pickle

In [57]:
# file_name = 'model_xgb'
# pickle.dump(model_xgb, open(file_name, 'wb'))

In [58]:
# Load the previously pickled XGBoost model.
# The `with` block closes the file handle, which a bare open(...) inside the call
# never does.
# NOTE: unpickling can execute arbitrary code - only load files you created yourself.
# NOTE(review): the cell that writes 'model_xgb' is commented out, so this load relies
# on the file already existing on disk.
with open('model_xgb', 'rb') as model_file:
    saved_model = pickle.load(model_file)

In [59]:
# Sanity check: the reloaded model should score the same as the in-memory one.
y_pred = saved_model.predict(X_test)
# accuracy_score expects (y_true, y_pred): ground truth first.
accuracy_score(y_test, y_pred)

0.865

In [63]:
X_test[99]

array([0.        , 2.9304896 , 1.95100361, 2.12694147, 0.        ])