In [None]:
# Read the raw weather observations from disk into a DataFrame
import pandas as pd

CSV_PATH = "weatherAUS.csv"
ds = pd.read_csv(CSV_PATH)

In [None]:
# Handling Missing Values
# Numeric gaps are imputed with the column mean. numeric_only=True keeps the
# mean restricted to numeric columns; without it pandas >= 2.0 raises a
# TypeError as soon as it meets a text column.
ds = ds.fillna(ds.mean(numeric_only=True))
# Rows that still contain a missing value after imputation are discarded.
ds = ds.dropna()
# Cell output: total number of missing values left in the frame.
ds.isnull().sum().sum()

In [None]:
# Preview the first 10 rows of the dataset
ds.head(10)

In [None]:
# Descriptive statistics for every column (include='all' keeps the
# non-numeric columns in the summary as well)
summary_stats = ds.describe(include='all')
summary_stats

In [None]:
# Spread of the MinTemp column: variance first, then standard deviation
min_temp = ds["MinTemp"]
min_temp_var, min_temp_std = min_temp.var(), min_temp.std()
# One value per output line
print(min_temp_var, min_temp_std, sep="\n")

In [None]:
# Profiling Dataset
from pandas_profiling import ProfileReport

# The title has to be passed by keyword: the second positional parameter of
# ProfileReport is `minimal`, so a positional string would turn on minimal
# mode and leave the report title unset.
profile = ProfileReport(ds, title="Profile for Weather Dataset", explorative=True)
# Cell output: the rendered report.
profile

In [None]:
# Derive 'Month' (formatted as YYYY-MM) and 'Year' (YYYY) from the 'Date' strings
import datetime


def _reformat_date(date_text, out_format):
    """Parse a 'YYYY-MM-DD' string and render it again using out_format."""
    parsed = datetime.datetime.strptime(date_text, "%Y-%m-%d")
    return parsed.strftime(out_format)


ds['Month'] = ds['Date'].map(lambda date_text: _reformat_date(date_text, '%Y-%m'))
ds['Year'] = ds['Date'].map(lambda date_text: _reformat_date(date_text, '%Y'))

In [None]:
# Line chart of the per-year sum of MinTemp
import matplotlib.pyplot as plt

fig, ax = plt.subplots(figsize=(13, 8))

yearly_min_temp = ds.groupby(['Year'])['MinTemp'].sum()
yearly_min_temp.plot(marker='.', ax=ax)

ax.set_title("Date and Minimun Temperature in Australia", fontsize=15)
ax.set_xlabel("Date", fontsize=10)
ax.set_ylabel("Minimum Temperature", fontsize=10)
ax.set_ylim(top=200000)

# Keep the tick positions matplotlib chose, but label them in thousands.
tick_positions = ax.get_yticks()
ax.set_yticks(tick_positions)
ax.set_yticklabels((tick_positions / 1000).astype(int))

# Red arrow pointing at the highlighted spot of the curve.
arrow_style = dict(arrowstyle='fancy', connectionstyle="arc3", color='red')
ax.annotate('Temperature minimum tertinggi', xy=(9, 196000), xytext=(5, 125000),
            weight='bold', color='red', arrowprops=arrow_style)

ax.grid(color='darkgray')
plt.show()

In [None]:
# Replace the listed categorical columns with integer codes
from sklearn.preprocessing import LabelEncoder
LE = LabelEncoder()

# The single encoder is re-fitted for each column in turn, so after the loop
# it only remembers the mapping of the last column.
for categorical_column in ('WindGustDir', 'RainToday'):
    ds[categorical_column] = LE.fit_transform(ds[categorical_column])

In [None]:
# Min-max rescale the continuous feature columns
from sklearn.preprocessing import MinMaxScaler

scaler_column = ['MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation', 'Sunshine', 'WindGustSpeed']
scaler = MinMaxScaler()

# NOTE(review): the scaler is fitted on the whole dataset, before the
# train/test split further down, so test rows influence the scaling.
# Consider fitting on the training portion only.
scaled_values = scaler.fit_transform(ds[scaler_column])
ds[scaler_column] = scaled_values

In [None]:
# Split features and target into training (75%) and testing (25%) parts
from sklearn.model_selection import train_test_split

feature_columns = ['MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation',
                   'Sunshine', 'WindGustSpeed', 'RainToday']
X = ds[feature_columns]
y = ds['RainTomorrow']

# random_state is fixed so the same rows land in each part on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=0
)

In [None]:
# Visualize Features Dataset Using Rank Features
import matplotlib.pyplot as plt
from yellowbrick.features import rank1d, rank2d

# The figure is bound to `fig`, not `y`: `y` holds the target column that the
# following cells still pass to their visualizers.
fig, axes = plt.subplots(ncols=2, figsize=(15, 8))

# show=False defers rendering so both panels appear in one figure.
rank1d(X, ax=axes[0], show=False)
rank2d(X, ax=axes[1], show=False)
plt.show()

In [None]:
# Visualize Features Using Parallel Coordinates
from yellowbrick.features import ParallelCoordinates

features = ['MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation', 'Sunshine', 'WindGustSpeed', 'RainToday']
# yellowbrick pairs `classes` with the target values in sorted order, so the
# names must be listed in that order ("No" sorts before "Yes"); the reverse
# order would swap the legend labels.
# NOTE(review): assumes the target holds the strings "No"/"Yes" - confirm.
classes = ["No", "Yes"]

# Only a shuffled 10% sample of the rows is drawn.
vis = ParallelCoordinates(classes=classes, features=features, sample=0.1, shuffle=True)

vis.fit_transform(X, y)
vis.show()

In [None]:
# PCA biplot of the feature matrix; the figure is written to PCA.png
from sklearn.preprocessing import LabelEncoder
from yellowbrick.features import PCA

# The visualizer is given integer-encoded targets instead of the raw labels.
target_encoder = LabelEncoder()
y_encode = target_encoder.fit_transform(y)

vis = PCA(scale=True, proj_features=True)
vis.fit_transform(X, y_encode)
vis.show(outpath="PCA.png")

In [None]:
# Visualize Class Balance
from yellowbrick.target import ClassBalance

# Labels must follow the lexicographic order of the target values ("No"
# before "Yes"); listing them the other way round mislabels the bars.
# NOTE(review): assumes the target holds the strings "No"/"Yes" - confirm.
vis = ClassBalance(labels=["No", "Yes"])

# Passing both splits draws the training and testing counts side by side.
vis.fit(y_train, y_test)
vis.show()

In [None]:
# Model 1 of 5: logistic regression with default hyper-parameters
from sklearn.linear_model import LogisticRegression

# fit() hands back the estimator, so construction and training chain together.
lg = LogisticRegression().fit(X_train, y_train)

# Predicted labels for the held-out rows.
y_pred_lg = lg.predict(X_test)

In [None]:
# Model 2 of 5: k-nearest-neighbours classifier with default settings
from sklearn.neighbors import KNeighborsClassifier

# fit() hands back the estimator, so construction and training chain together.
KNC = KNeighborsClassifier().fit(X_train, y_train)

# Predicted labels for the held-out rows.
y_pred_KNC = KNC.predict(X_test)

In [None]:
# 3. Decision Tree Model
from sklearn.tree import DecisionTreeClassifier

# random_state pins the tree's internal randomness so repeated runs build the
# same tree; 0 matches the seed used for the train/test split.
dtc = DecisionTreeClassifier(random_state=0)
dtc = dtc.fit(X_train, y_train)

# Predicted labels for the held-out rows.
y_pred_dtc = dtc.predict(X_test)

In [None]:
# 4. Random Forest
from sklearn.ensemble import RandomForestClassifier

# random_state makes the bootstrap samples and feature draws repeatable, so
# the scores and plots derived from this forest do not change between runs;
# 0 matches the seed used for the train/test split.
rfc = RandomForestClassifier(n_jobs = 2, random_state = 0)
rfc = rfc.fit(X_train, y_train)

# Predicted labels for the held-out rows.
y_pred_rfc = rfc.predict(X_test)

In [None]:
# Model 5 of 5: support vector classifier with a linear kernel
from sklearn.svm import SVC

# fit() hands back the estimator, so construction and training chain together.
svc = SVC(kernel="linear", C=1).fit(X_train, y_train)

# Predicted labels for the held-out rows.
y_pred_svc = svc.predict(X_test)

In [None]:
# Print the score() of every fitted model on the training and testing splits
fitted_models = [
    ("Logistic Regression", lg),
    ("K Nearest Neighbors", KNC),
    ("Decision Tree", dtc),
    ("Random Forest", rfc),
    ("Supprot Vector Machine (Clasification)", svc),
]

for position, (model_name, model) in enumerate(fitted_models):
    if position > 0:
        # Blank line separating one model's report from the next.
        print("")
    print(model_name + " model score for Training dataset:", model.score(X_train, y_train))
    print(model_name + " model score for Testing dataset:", model.score(X_test, y_test))

In [None]:
# Visualize Confusion Matrix
from yellowbrick.classifier import ConfusionMatrix

# `classes` is matched against the sorted target values, so the names must be
# given in that order ("No" before "Yes"); the reverse order would swap the
# labels on both axes of the matrix.
# NOTE(review): assumes the target holds the strings "No"/"Yes" - confirm.
cm = ConfusionMatrix(rfc, classes = ["No", "Yes"])
cm.fit(X_train, y_train)
# score() evaluates on the test split and fills in the matrix.
cm.score(X_test, y_test)
cm.show()

In [None]:
# Visualize Classification Report
from yellowbrick.classifier import ClassificationReport

# `classes` is matched against the sorted target values, so the names must be
# given in that order ("No" before "Yes"); the reverse order would attach each
# row of metrics to the wrong class.
# NOTE(review): assumes the target holds the strings "No"/"Yes" - confirm.
cr = ClassificationReport(rfc, classes = ["No", "Yes"], support = True)
cr.fit(X_train, y_train)
# score() evaluates on the test split and fills in the report.
cr.score(X_test, y_test)
cr.show()

In [None]:
# Class prediction error chart for the random forest
from yellowbrick.classifier import ClassPredictionError

# fit() returns the visualizer itself, so it can be chained on construction.
vis = ClassPredictionError(rfc).fit(X_train, y_train)

# Score on the held-out split, then render the chart.
vis.score(X_test, y_test)
vis.show()

In [None]:
# Visualize The Random Forest Tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import plot_tree
import matplotlib.pyplot as plt

# A separate shallow forest (max_depth=3) is trained only for plotting, so the
# drawn tree stays readable. random_state makes the plotted tree the same on
# every run; 0 matches the seed used for the train/test split.
rfc_vis = RandomForestClassifier(n_jobs=2, max_depth=3, random_state=0)
rfc_vis = rfc_vis.fit(X_train, y_train)

plt.rcParams['figure.figsize'] = (15, 13)
# feature_names is handed over as a plain list: some scikit-learn releases
# reject a pandas Index for this parameter.
# Assigning to `_` keeps the list of drawn annotations out of the cell output.
_ = plot_tree(rfc_vis.estimators_[0], feature_names=list(X.columns), filled=True)

In [None]:
# Predict New Dataset Using The Best Model (Random Forest)
# Each row follows the column order of X:
# MinTemp, MaxTemp, Rainfall, Evaporation, Sunshine, WindGustSpeed, RainToday
# The first six values are on the min-max scaled range and RainToday is the
# label-encoded flag, matching how the training features were prepared.
new_dataset = [[0.4, 0.6, 0.9, 0.73, 0.45, 0.54, 0],
                [0.5, 0.56, 0.3, 0.5, 0.8, 0.55, 1]]

# The forest was fitted on a DataFrame, so the new rows are wrapped in one
# with the same column names; this avoids scikit-learn's feature-name warning
# and ties every value to its intended feature.
new_features = pd.DataFrame(new_dataset, columns=X.columns)

rfc.predict(new_features)

In [None]:
# Feature importance chart for the random forest, saved as a PNG file
from yellowbrick.model_selection import FeatureImportances

vis = FeatureImportances(rfc)
vis.fit(X, y)

# The output path is named explicitly by keyword.
vis.show(outpath="Feature-Importances.png")