#  Rain tomorrow prediction

## Učitavanje biblioteka

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn import metrics
import tensorflow as tf
import time

## Funkcija za računanje metrika

In [None]:
# Helper that prints evaluation metrics for a list of trained models
def metrike(modeli, testData, y_test, time):
    """Print binary-classification metrics for each trained model.

    Args:
        modeli: Iterable of trained models exposing ``optimizer`` and
            ``predict`` (the Keras models built in this notebook).
        testData: Feature matrix passed to ``model.predict``.
        y_test: True 0/1 labels aligned with ``testData``.
        time: Sequence of training times in seconds, one per model, in the
            same order as ``modeli``. The name shadows the ``time`` module
            inside this function; it is kept so existing callers keep working.
    """
    for ind, model in enumerate(modeli):
        print(F"Algoritam: {type(model.optimizer).__name__}")
        # The models in this notebook end in a single sigmoid unit, so predict
        # yields one probability per row; flatten it to a 1-D score vector.
        y_score = np.ravel(model.predict(testData))
        y_pred = (y_score > 0.5).astype(int)
        print("Accuracy: ", metrics.accuracy_score(y_test, y_pred))
        # ROC AUC is a ranking metric: it must be computed from the predicted
        # probabilities, not from the already-thresholded labels.
        print('ROC AUC:', metrics.roc_auc_score(y_test, y_score))
        print("Confusion matrix")
        # Fixing the label order guarantees a 2x2 matrix even when one class
        # is missing from both the truth and the predictions.
        CM = metrics.confusion_matrix(y_test, y_pred, labels=[0, 1])
        TN = CM[0, 0]
        TP = CM[1, 1]
        FP = CM[0, 1]
        FN = CM[1, 0]
        print("    P0           P1")
        print(f"S0  {TN}        {FP}")
        print(f"S1  {FN}        {TP}")
        print("Recall: ",metrics.recall_score(y_test,y_pred))
        print("Precision: ",  metrics.precision_score(y_test, y_pred))
        print("F1 score: ", metrics.f1_score(y_test, y_pred))
        print(f"Training time (sec): {time[ind]}" )
        print("-----------------------------------------------")
def _plot_history_curves(history, metric, ylabel, title):
    """Draw the training and validation curves of ``metric`` on a new figure."""
    # The figure has to exist before plotting so that figsize/dpi apply to the
    # curves; creating it afterwards only opens a second, empty figure.
    plt.figure(figsize=(6,6), dpi=500)
    plt.plot(history.history[metric], label='train')
    plt.plot(history.history[f'val_{metric}'], label='validation')
    plt.xlabel('Epochs')
    plt.ylabel(ylabel)
    plt.title(title)
    plt.legend()
    plt.show()


def metrike_table(modeli, testData, y_test, tr_time, histories):
    """Build a comparison table of metrics and plot each model's history.

    Args:
        modeli: Sequence of trained models exposing ``optimizer`` and ``predict``.
        testData: Feature matrix passed to ``model.predict``.
        y_test: True 0/1 labels aligned with ``testData``.
        tr_time: Training times in seconds, one per model, same order.
        histories: Objects returned by ``fit`` (one per model, same order);
            their ``history`` dict must contain ``accuracy``, ``val_accuracy``,
            ``loss`` and ``val_loss``.

    Returns:
        A DataFrame with one row per model: optimizer name, accuracy, ROC AUC,
        recall, precision, F1, training time, prediction time and epoch count.
    """
    data = []
    for ind, model in enumerate(modeli):
        optimizer_name = type(model.optimizer).__name__
        history = histories[ind]

        # Time prediction plus thresholding, as the reported "test time".
        start = time.time()
        y_score = np.ravel(model.predict(testData))
        y_pred = (y_score > 0.5)
        test_time = time.time()-start

        data.append([
            optimizer_name,
            metrics.accuracy_score(y_test, y_pred),
            # ROC AUC needs the probabilities, not the thresholded labels.
            metrics.roc_auc_score(y_test, y_score),
            metrics.recall_score(y_test,y_pred),
            metrics.precision_score(y_test, y_pred),
            metrics.f1_score(y_test, y_pred),
            tr_time[ind],
            test_time,
            len(history.history['loss'])  # one loss entry per trained epoch
            ])

        _plot_history_curves(history, 'accuracy', 'Accuracy', f'{optimizer_name} accuracy')
        _plot_history_curves(history, 'loss', 'Loss', f'{optimizer_name} loss')
    df = pd.DataFrame(data, columns = ['Algoritam', "Accuracy",'ROC AUC',"Recall","Precision","F1 score", "Training time (sec)","Test time (sec)",'Epochs'])
    return df

## Učitavanje skupa podataka

In [None]:
# Load the dataset from CSV (path is relative to the working directory)
# and preview the first rows.
dataset = pd.read_csv('../datasets/rain_australia.csv')
dataset.head()

## Analiza skupa podataka

In [None]:
# Inspect the dtype of every column.
dataset.dtypes

In [None]:
# Descriptive statistics of the dataset's columns.
dataset.describe()

In [None]:
# Count how often each RainTomorrow value occurs (class balance of the target).
dataset['RainTomorrow'].value_counts()

In [None]:
# Look the class counts up by label. At this point the target still holds the
# strings 'No'/'Yes', so integer keys would rely on deprecated positional
# fallback and on value_counts' frequency ordering.
class_counts = dataset['RainTomorrow'].value_counts()
no_rain = class_counts['No']
rain = class_counts['Yes']

In [None]:
# Bar chart comparing the number of rainy and non-rainy days.
bar_labels = ['Rain','No rain']
bar_heights = [rain,no_rain]
plt.figure(figsize=(5,5))
plt.bar(bar_labels, bar_heights)
plt.title("Rain/no rain")
plt.ylabel("Number of days")
plt.show()

In [None]:
# Percentage of all rows labelled 'Yes'. The count is looked up by label
# rather than by position, which is deprecated on a string-labelled index.
dataset['RainTomorrow'].value_counts()['Yes'] / len(dataset) * 100

In [None]:
# Encode the Yes/No flags as 0/1. The explicit float cast keeps the columns
# numeric (missing values stay NaN) instead of relying on the implicit
# downcast that `replace` performs on object columns, which is deprecated.
yes_no_codes = {'No': 0, 'Yes': 1}
for flag_column in ('RainTomorrow', 'RainToday'):
    dataset[flag_column] = dataset[flag_column].replace(yes_no_codes).astype('float64')

In [None]:
# Split the Date column into numeric Day/Month/Year features, then drop it.
dates = pd.to_datetime(dataset['Date'])
for part_name in ('Day', 'Month', 'Year'):
    dataset[part_name] = getattr(dates.dt, part_name.lower())
dataset = dataset.drop(columns=['Date'])

In [None]:
# Correlation heatmap of the numeric columns. The string columns are excluded
# explicitly: since pandas 2.0 `DataFrame.corr()` no longer drops them
# silently and raises on non-numeric data.
korelacijska_matrica = dataset.select_dtypes(include='number').corr()
plt.subplots(figsize=(10,10))
sns.heatmap(korelacijska_matrica, vmax=0.9, square=True)

## Čišćenje

In [None]:
# Number of missing values per column.
dataset.isna().sum()

In [None]:
# Missing-value counts after discarding columns that are entirely empty.
non_empty_columns = dataset.dropna(axis=1, how='all')
non_empty_columns.isna().sum()

In [None]:
# Keep only the rows where RainToday is known, then re-check missing counts.
dataset = dataset.dropna(subset=['RainToday'])
dataset.isna().sum()

In [None]:
# Keep only the rows where the target RainTomorrow is known, then re-check
# missing counts.
dataset = dataset.dropna(subset=['RainTomorrow'])
dataset.isna().sum()

In [None]:
# Column dtypes after removing rows without a RainToday/RainTomorrow value.
dataset.dtypes

In [None]:
# Names of the string-valued columns; they are imputed and encoded separately
# from the numeric columns.
object_elements = ['Location','WindGustDir','WindDir9am','WindDir3pm']
object_elements

In [None]:
from sklearn.impute import SimpleImputer
# Fill missing values in the string columns with the placeholder 'Unknown',
# handling all of them in one pass.
imputer = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value='Unknown')
dataset[object_elements] = imputer.fit_transform(dataset[object_elements])

In [None]:
# Remaining missing values per column after filling the string columns.
dataset.isna().sum()

In [None]:
# Preview the first rows after filling the string columns.
dataset.head()

In [None]:
# Numeric feature columns: everything except the string columns and the two
# rain flags, in the dataset's column order.
non_float_columns = set(object_elements) | {'RainToday', 'RainTomorrow'}
float_elements = [element for element in dataset.columns if element not in non_float_columns]
print(float_elements)

In [None]:
from sklearn.impute import SimpleImputer
# Replace missing numeric values with the mean of their column.
# NOTE(review): the means are computed on the full dataset, before the
# train/test split further down, so test rows influence the imputed values.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
dataset[float_elements] = imputer.fit_transform(dataset[float_elements])

In [None]:
# Preview the first rows after mean imputation of the numeric columns.
dataset.head()

In [None]:
# Missing values per column after both imputation steps.
dataset.isna().sum()

In [None]:
# Correlation heatmap of the cleaned dataset. The string columns have not been
# encoded yet, so they are excluded explicitly: since pandas 2.0
# `DataFrame.corr()` raises on non-numeric data instead of dropping it.
korelacijska_matrica = dataset.select_dtypes(include='number').corr()
plt.subplots(figsize=(10,10))
sns.heatmap(korelacijska_matrica, vmax=0.9, square=True)

In [None]:
# Correlation of every column with the target, sorted ascending; the target's
# own entry is dropped.
print(korelacijska_matrica['RainTomorrow'].drop(['RainTomorrow']).sort_values())

In [None]:
# Separate the target column (Y) from the feature columns (X).
Y = dataset['RainTomorrow']
X = dataset.drop(columns=['RainTomorrow'])

In [None]:
# Preview the feature matrix.
X.head()

In [None]:
# String-valued columns that still have to be encoded as numbers.
print(object_elements)

## Label Encoding

In [None]:
from sklearn.preprocessing import LabelEncoder
# Replace each string column with integer codes, using a fresh encoder per
# column.
for el in object_elements:
    X[el] = LabelEncoder().fit_transform(X[el])
# Encode the target as well; `le` stays available as the fitted target encoder.
le = LabelEncoder()
le.fit(Y)
Y = le.transform(Y)

In [None]:
# Preview the features after label encoding.
X.head()

## Podjela na trening i test skupove

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 25% of the rows as the test set; the fixed random_state makes the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.25, random_state=42)

## Standardizacija

In [None]:
from sklearn.preprocessing import StandardScaler
# Fit the scaler on the training features only, then apply the same scaling
# to both splits.
sc = StandardScaler()
sc.fit(X_train)
X_train, X_test = sc.transform(X_train), sc.transform(X_test)

## ANN - ADAM

In [None]:
# Feed-forward network: two hidden ReLU layers (16 and 32 units) followed by
# a single sigmoid output unit.
ann_adam = tf.keras.models.Sequential([
    tf.keras.layers.Dense(units=16, activation='relu'),
    tf.keras.layers.Dense(units=32, activation='relu'),
    tf.keras.layers.Dense(units=1, activation='sigmoid'),
])
# Compile the network with the Adam optimizer.
ann_adam.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
# Train the Adam model and record the wall-clock training time in seconds.
# NOTE(review): the test split is passed as validation data, so the validation
# curves are computed on the same rows used for the final metrics.
start = time.time()
adam_history = ann_adam.fit(X_train, y_train, batch_size = 16, epochs = 19, validation_data=(X_test,y_test))
adam_time = time.time()-start

In [None]:
# Print the evaluation metrics of the Adam model on the test split.
metrike([ann_adam], X_test, y_test, [adam_time])

In [None]:
# Training vs. validation accuracy per epoch. The figure is created before
# plotting so figsize/dpi apply to the curves; creating it afterwards only
# opened a second, empty figure.
plt.figure(figsize=(6,6), dpi=500)
plt.plot(pd.DataFrame(adam_history.history["accuracy"]))
plt.plot(pd.DataFrame(adam_history.history["val_accuracy"]))
plt.show()

# Training vs. validation loss per epoch.
plt.figure(figsize=(6,6), dpi=500)
plt.plot(pd.DataFrame(adam_history.history["loss"]))
plt.plot(pd.DataFrame(adam_history.history["val_loss"]))
plt.show()

## ANN - ADAMAX

In [None]:

# Same architecture as the other models: two hidden ReLU layers (16 and 32
# units) and a single sigmoid output unit, compiled with the Adamax optimizer.
ann_adamax = tf.keras.models.Sequential()
# The hidden layers must be added to ann_adamax. Adding them to ann_adam left
# this model with only the output layer and appended extra layers to the
# already-trained Adam model.
ann_adamax.add(tf.keras.layers.Dense(units=16, activation='relu'))
ann_adamax.add(tf.keras.layers.Dense(units=32, activation='relu'))
ann_adamax.add(tf.keras.layers.Dense(units=1, activation='sigmoid'))
ann_adamax.compile(optimizer = 'adamax', loss = 'binary_crossentropy', metrics = ['accuracy'])

In [None]:
# Train the Adamax model and record the wall-clock training time in seconds.
# NOTE(review): the test split is passed as validation data, so the validation
# curves are computed on the same rows used for the final metrics.
start = time.time()
adamax_history = ann_adamax.fit(X_train, y_train, batch_size = 16, epochs = 35, validation_data=(X_test,y_test))
adamax_time = time.time()-start

In [None]:
# Print the evaluation metrics of the Adamax model on the test split.
metrike([ann_adamax], X_test, y_test, [adamax_time])

In [None]:
# Training vs. validation accuracy per epoch. The figure is created before
# plotting so figsize/dpi apply to the curves; creating it afterwards only
# opened a second, empty figure.
plt.figure(figsize=(6,6), dpi=500)
plt.plot(pd.DataFrame(adamax_history.history["accuracy"]))
plt.plot(pd.DataFrame(adamax_history.history["val_accuracy"]))
plt.show()

# Training vs. validation loss per epoch.
plt.figure(figsize=(6,6), dpi=500)
plt.plot(pd.DataFrame(adamax_history.history["loss"]))
plt.plot(pd.DataFrame(adamax_history.history["val_loss"]))
plt.show()

## ANN - ADAGRAD

In [None]:

# Two hidden ReLU layers (16 and 32 units) and a single sigmoid output unit,
# compiled with the Adagrad optimizer.
ann_adagrad = tf.keras.models.Sequential([
    tf.keras.layers.Dense(units=16, activation='relu'),
    tf.keras.layers.Dense(units=32, activation='relu'),
    tf.keras.layers.Dense(units=1, activation='sigmoid'),
])
ann_adagrad.compile(loss='binary_crossentropy', optimizer='adagrad', metrics=['accuracy'])

In [None]:
# Train the Adagrad model and record the wall-clock training time in seconds.
# NOTE(review): the test split is passed as validation data, so the validation
# curves are computed on the same rows used for the final metrics.
start = time.time()
adagrad_history = ann_adagrad.fit(X_train, y_train, batch_size = 16, epochs = 150, validation_data=(X_test,y_test))
adagrad_time = time.time()-start

In [None]:
# Print the evaluation metrics of the Adagrad model on the test split.
metrike([ann_adagrad], X_test, y_test, [adagrad_time])

In [None]:
# Training vs. validation accuracy per epoch. The figure is created before
# plotting so figsize/dpi apply to the curves; creating it afterwards only
# opened a second, empty figure.
plt.figure(figsize=(6,6), dpi=500)
plt.plot(pd.DataFrame(adagrad_history.history["accuracy"]))
plt.plot(pd.DataFrame(adagrad_history.history["val_accuracy"]))
plt.show()

# Training vs. validation loss per epoch.
plt.figure(figsize=(6,6), dpi=500)
plt.plot(pd.DataFrame(adagrad_history.history["loss"]))
plt.plot(pd.DataFrame(adagrad_history.history["val_loss"]))
plt.show()

## ANN - NADAM

In [None]:
# Two hidden ReLU layers (16 and 32 units) and a single sigmoid output unit,
# compiled with the Nadam optimizer.
ann_nadam = tf.keras.models.Sequential([
    tf.keras.layers.Dense(units=16, activation='relu'),
    tf.keras.layers.Dense(units=32, activation='relu'),
    tf.keras.layers.Dense(units=1, activation='sigmoid'),
])
ann_nadam.compile(loss='binary_crossentropy', optimizer='nadam', metrics=['accuracy'])

In [None]:
# Train the Nadam model and record the wall-clock training time in seconds.
# NOTE(review): the test split is passed as validation data, so the validation
# curves are computed on the same rows used for the final metrics.
start = time.time()
nadam_history = ann_nadam.fit(X_train, y_train, batch_size = 16, epochs = 18, validation_data=(X_test,y_test))
nadam_time = time.time()-start

In [None]:
# Print the evaluation metrics of the Nadam model on the test split.
metrike([ann_nadam], X_test, y_test, [nadam_time])

In [None]:
# Training vs. validation accuracy per epoch. The figure is created before
# plotting so figsize/dpi apply to the curves; creating it afterwards only
# opened a second, empty figure.
plt.figure(figsize=(6,6), dpi=500)
plt.plot(pd.DataFrame(nadam_history.history["accuracy"]))
plt.plot(pd.DataFrame(nadam_history.history["val_accuracy"]))
plt.show()

# Training vs. validation loss per epoch.
plt.figure(figsize=(6,6), dpi=500)
plt.plot(pd.DataFrame(nadam_history.history["loss"]))
plt.plot(pd.DataFrame(nadam_history.history["val_loss"]))
plt.show()

## ANN - SGD

In [None]:
# Two hidden ReLU layers (16 and 32 units) and a single sigmoid output unit,
# compiled with the SGD optimizer.
ann_sgd = tf.keras.models.Sequential([
    tf.keras.layers.Dense(units=16, activation='relu'),
    tf.keras.layers.Dense(units=32, activation='relu'),
    tf.keras.layers.Dense(units=1, activation='sigmoid'),
])
ann_sgd.compile(loss='binary_crossentropy', optimizer='SGD', metrics=['accuracy'])

In [None]:
# Train the SGD model and record the wall-clock training time in seconds.
# NOTE(review): the test split is passed as validation data, so the validation
# curves are computed on the same rows used for the final metrics.
start = time.time()
sgd_history = ann_sgd.fit(X_train, y_train, batch_size = 16, epochs = 55, validation_data=(X_test,y_test))
sgd_time = time.time()-start

In [None]:
# Print the evaluation metrics of the SGD model on the test split.
metrike([ann_sgd], X_test, y_test, [sgd_time])

In [None]:
# Training vs. validation accuracy per epoch. The figure is created before
# plotting so figsize/dpi apply to the curves; creating it afterwards only
# opened a second, empty figure.
plt.figure(figsize=(6,6), dpi=500)
plt.plot(pd.DataFrame(sgd_history.history["accuracy"]))
plt.plot(pd.DataFrame(sgd_history.history["val_accuracy"]))
plt.show()

# Training vs. validation loss per epoch.
plt.figure(figsize=(6,6), dpi=500)
plt.plot(pd.DataFrame(sgd_history.history["loss"]))
plt.plot(pd.DataFrame(sgd_history.history["val_loss"]))
plt.show()

## ANN - RMSprop

In [None]:
# Two hidden ReLU layers (16 and 32 units) and a single sigmoid output unit,
# compiled with the RMSprop optimizer.
ann_rms = tf.keras.models.Sequential([
    tf.keras.layers.Dense(units=16, activation='relu'),
    tf.keras.layers.Dense(units=32, activation='relu'),
    tf.keras.layers.Dense(units=1, activation='sigmoid'),
])
ann_rms.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])

In [None]:
# Train the RMSprop model and record the wall-clock training time in seconds.
# NOTE(review): the test split is passed as validation data, so the validation
# curves are computed on the same rows used for the final metrics.
start = time.time()
rms_history = ann_rms.fit(X_train, y_train, batch_size = 16, epochs = 23, validation_data=(X_test,y_test))
rms_time = time.time()-start

In [None]:
# Print the evaluation metrics of the RMSprop model on the test split.
metrike([ann_rms], X_test, y_test, [rms_time])

In [None]:
# Training vs. validation accuracy per epoch. The figure is created before
# plotting so figsize/dpi apply to the curves; creating it afterwards only
# opened a second, empty figure.
plt.figure(figsize=(6,6), dpi=500)
plt.plot(pd.DataFrame(rms_history.history["accuracy"]))
plt.plot(pd.DataFrame(rms_history.history["val_accuracy"]))
plt.show()

# Training vs. validation loss per epoch.
plt.figure(figsize=(6,6), dpi=500)
plt.plot(pd.DataFrame(rms_history.history["loss"]))
plt.plot(pd.DataFrame(rms_history.history["val_loss"]))
plt.show()

In [None]:
# Gather the six trained models with their training times and fit histories.
# The three lists share the same order because metrike_table pairs them by
# index. Then build the comparison table and per-model history plots.
models = [ann_adam,ann_adamax,ann_adagrad,ann_nadam,ann_sgd,ann_rms]
times = [adam_time,adamax_time,adagrad_time, nadam_time, sgd_time,rms_time]
histories = [adam_history,adamax_history,adagrad_history,nadam_history,sgd_history,rms_history]
metrike_table(models, X_test, y_test, times, histories)

In [None]:
# Persist every trained model under a path named after its optimizer class.
for model in models:
    optimizer_name = type(model.optimizer).__name__
    model.save(f'../saved_models/rain_tommorow/{optimizer_name}')