In [1]:
import numpy as np
import scipy as sp
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.simplefilter('ignore')

In [35]:
%matplotlib inline

In [2]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [3]:
df = pd.read_excel('WeatherStationData_StasiunSoekarnoHatta.xlsx')
df.head()

Unnamed: 0,Day,Month,Year,minTemperature,maxTemperature,Temperature,Humidity,Rainfall,WindSpeed,WindDirection
0,1,1,1988,24.2,30.8,26.9,85.0,9.0,4,360.0
1,2,1,1988,24.6,30.4,26.7,85.0,0.0,4,315.0
2,3,1,1988,23.0,32.4,27.4,81.0,0.0,9,315.0
3,4,1,1988,24.3,32.2,27.6,81.0,2.0,6,360.0
4,5,1,1988,24.3,31.2,26.7,86.0,6.0,6,315.0


In [8]:
df.dtypes

Day                 int64
Month               int64
Year                int64
minTemperature    float64
maxTemperature    float64
Temperature       float64
Humidity          float64
Rainfall          float64
WindSpeed           int64
WindDirection     float64
dtype: object

In [19]:
df['Rainfall'] = df['Rainfall'].replace(8888, np.nan)    # 8888 means the missing values

In [30]:
df = df.interpolate(method='akima')     # Interpolate the missing values

In [42]:
df[df['Rainfall'].isnull()]     # Checking the Rainfall feature for the missing values

Unnamed: 0,Day,Month,Year,minTemperature,maxTemperature,Temperature,Humidity,Rainfall,WindSpeed,WindDirection


In [45]:
rainClasses_arr = np.array([])
for i in range(0,df['Rainfall'].values.size):
    if (df['Rainfall'].values[i] > 5):
        rainClasses_arr = np.append(rainClasses_arr, 'rain')
    else:
        rainClasses_arr = np.append(rainClasses_arr, 'no rain')

In [46]:
rainClasses_arr

array(['rain', 'no rain', 'no rain', ..., 'no rain', 'no rain', 'no rain'],
      dtype='<U32')

In [49]:
Weather_new_data = np.concatenate((df.values, rainClasses_arr.reshape(df.shape[0],1)), axis=1) # add features label array
Weather_new_data.shape

(4749, 11)

In [50]:
Weather_new_features = np.append(np.asarray(df.columns, dtype='str'), 'RainClass')
Weather_new_features

array(['Day', 'Month', 'Year', 'minTemperature', 'maxTemperature',
       'Temperature', 'Humidity', 'Rainfall', 'WindSpeed',
       'WindDirection', 'RainClass'], dtype='<U64')

In [52]:
new_Weather_df = pd.DataFrame(Weather_new_data, columns=Weather_new_features)
new_Weather_df = new_Weather_df.drop(columns=['Rainfall'])
new_Weather_df.head()

Unnamed: 0,Day,Month,Year,minTemperature,maxTemperature,Temperature,Humidity,WindSpeed,WindDirection,RainClass
0,1.0,1.0,1988.0,24.2,30.8,26.9,85.0,4.0,360.0,rain
1,2.0,1.0,1988.0,24.6,30.4,26.7,85.0,4.0,315.0,no rain
2,3.0,1.0,1988.0,23.0,32.4,27.4,81.0,9.0,315.0,no rain
3,4.0,1.0,1988.0,24.3,32.2,27.6,81.0,6.0,360.0,no rain
4,5.0,1.0,1988.0,24.3,31.2,26.7,86.0,6.0,315.0,rain


In [56]:
new_Weather_df[new_Weather_df.columns[0:9]] = new_Weather_df[new_Weather_df.columns[0:9]].astype('float64')
new_Weather_df.dtypes

Day               float64
Month             float64
Year              float64
minTemperature    float64
maxTemperature    float64
Temperature       float64
Humidity          float64
WindSpeed         float64
WindDirection     float64
RainClass          object
dtype: object

In [59]:
new_Weather_df['RainClass'].value_counts()

no rain    3854
rain        895
Name: RainClass, dtype: int64

In [60]:
# encode the labels, converting them from strings to integers
le = LabelEncoder()
labels = le.fit_transform(new_Weather_df['RainClass'].values)
labels

array([1, 0, 0, ..., 0, 0, 0])

In [66]:
# perform a training testing split, using 75% of the data for
# training and 25% for evaluation
(trainX, testX, trainY, testY) = train_test_split(new_Weather_df[new_Weather_df.columns[0:9]].values, labels, random_state=3, test_size=0.25)

In [68]:
# define the dictionary of models our script can use
# the key to the dictionary is the name of the model
# (supplied via command line argument) and the value is the model itself
models = {
    "knn": KNeighborsClassifier(n_neighbors=1),
    "naive_bayes": GaussianNB(),
    "logit": LogisticRegression(solver="lbfgs", multi_class="auto"),
    "svm": SVC(kernel="linear", gamma="auto"),
    "decision_tree": DecisionTreeClassifier(),
    "random_forest": RandomForestClassifier(n_estimators=100),
    'mlp': MLPClassifier()
}

In [71]:
# train the K-Neighbors model
print("[INFO] using '{}' model".format("knn"))
model = models["knn"]
model.fit(trainX, trainY)
# make predictions on our data and show a accuracy score
print("[INFO] evaluating...")
predictions = model.predict(testX)
KNN_accur = accuracy_score(testY, predictions)
print('\nSkor akurasi kemampuan model K-Neighbors dalam mengklasifikasi Hujan adalah:', KNN_accur)

[INFO] using 'knn' model
[INFO] evaluating...

Skor akurasi kemampuan model K-Neighbors dalam mengklasifikasi Hujan adalah: 0.7752525252525253


In [72]:
# train the Naive Bayes model
print("[INFO] using '{}' model".format("naive_bayes"))
model = models["naive_bayes"]
model.fit(trainX, trainY)
# make predictions on our data and show a accuracy score
print("[INFO] evaluating...")
predictions = model.predict(testX)
NB_accur = accuracy_score(testY, predictions)
print('\nSkor akurasi kemampuan model Naive Bayes dalam mengklasifikasi Hujan adalah:', NB_accur)

[INFO] using 'naive_bayes' model
[INFO] evaluating...

Skor akurasi kemampuan model Naive Bayes dalam mengklasifikasi Hujan adalah: 0.7920875420875421


In [76]:
# train the Logistic Regression model
print("[INFO] using '{}' model".format("logit"))
model = models["logit"]
model.fit(trainX, trainY)
# make predictions on our data and show a accuracy score
print("[INFO] evaluating...")
predictions = model.predict(testX)
Logit_accur = accuracy_score(testY, predictions)
print('\nSkor akurasi kemampuan model Logistic Regression dalam mengklasifikasi Hujan adalah:', Logit_accur)

[INFO] using 'logit' model
[INFO] evaluating...

Skor akurasi kemampuan model Logistic Regression dalam mengklasifikasi Hujan adalah: 0.8240740740740741


In [78]:
# train the Support Vector Machine model
print("[INFO] using '{}' model".format("svm"))
model = models["svm"]
model.fit(trainX, trainY)
# make predictions on our data and show a accuracy score
print("[INFO] evaluating...")
predictions = model.predict(testX)
SVM_accur = accuracy_score(testY, predictions)
print('\nSkor akurasi kemampuan model Support Vector Machine dalam mengklasifikasi Hujan adalah:', SVM_accur)

[INFO] using 'svm' model
[INFO] evaluating...

Skor akurasi kemampuan model Support Vector Machine dalam mengklasifikasi Hujan adalah: 0.8341750841750841


In [79]:
# train the Decision Tree model
print("[INFO] using '{}' model".format("decision_tree"))
model = models["decision_tree"]
model.fit(trainX, trainY)
# make predictions on our data and show a accuracy score
print("[INFO] evaluating...")
predictions = model.predict(testX)
DT_accur = accuracy_score(testY, predictions)
print('\nSkor akurasi kemampuan model Decision Tree dalam mengklasifikasi Hujan adalah:', DT_accur)

[INFO] using 'decision_tree' model
[INFO] evaluating...

Skor akurasi kemampuan model Decision Tree dalam mengklasifikasi Hujan adalah: 0.7651515151515151


In [91]:
# train the Random Forest model
print("[INFO] using '{}' model".format("random_forest"))
model = models["random_forest"]
model.fit(trainX, trainY)
# make predictions on our data and show a accuracy score
print("[INFO] evaluating...")
predictions = model.predict(testX)
RF_accur = accuracy_score(testY, predictions)
print('\nSkor akurasi kemampuan model Random Forest dalam mengklasifikasi Hujan adalah:', RF_accur)

[INFO] using 'random_forest' model
[INFO] evaluating...

Skor akurasi kemampuan model Random Forest dalam mengklasifikasi Hujan adalah: 0.8383838383838383


In [101]:
# train the MLP model
print("[INFO] using '{}' model".format("mlp"))
model = models["mlp"]
model.fit(trainX, trainY)
# make predictions on our data and show a accuracy score
print("[INFO] evaluating...")
predictions = model.predict(testX)
MLP_accur = accuracy_score(testY, predictions)
print('\nSkor akurasi kemampuan model MLP dalam mengklasifikasi Hujan adalah:', MLP_accur)

[INFO] using 'mlp' model
[INFO] evaluating...

Skor akurasi kemampuan model MLP dalam mengklasifikasi Hujan adalah: 0.8316498316498316


In [102]:
score_obj = dict(zip(['Accuracy Score'], [[KNN_accur,NB_accur,Logit_accur,SVM_accur,DT_accur,RF_accur,MLP_accur]]))

score_df = pd.DataFrame(score_obj, index=['K-Neighbors','Naive Bayes','Logistic Regression','SVM','Decision Tree', 'Random Forest', 'MLP'])
score_df

Unnamed: 0,Accuracy Score
K-Neighbors,0.775253
Naive Bayes,0.792088
Logistic Regression,0.824074
SVM,0.834175
Decision Tree,0.765152
Random Forest,0.838384
MLP,0.83165


In [124]:
# 31th July 2019's Rain Predict

le.classes_[model.predict(np.array([31., 7., 2019., 17., 26., 23.4, 73., 5., 300.]).reshape(1,-1))][0]

'no rain'