# Imports

In [1]:
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import accuracy_score, balanced_accuracy_score, confusion_matrix, roc_auc_score
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

RANDOM_STATE = 4200

## Read the dataset

In [6]:
df = pd.read_csv('../HKO-Weather-Data-Interpolated.csv')

In [7]:
df.describe()

Unnamed: 0,Year,Month,Day,MeanCloudAmount,MeanPressure,TotalEvaporation,TotalRainfall,MeanHumidity,MinTemperature,MeanTemperature,MaxTemperature,MeanUVIndex,TotalSunshine,MeanWindSpeed,IsExtreme
count,8766.0,8766.0,8766.0,8766.0,8766.0,8766.0,8766.0,8766.0,8766.0,8766.0,8766.0,8766.0,8766.0,8766.0,8766.0
mean,2011.498973,6.52293,15.729637,68.280972,1012.820135,3.349139,5.552932,78.080995,19.558191,22.884645,27.226586,2.880966,5.126717,7.180259,0.058293
std,6.922631,3.4489,8.800594,23.59828,6.618001,1.654657,18.054127,10.673137,5.879555,5.48277,5.682727,1.528562,3.881934,3.248124,0.234311
min,2000.0,1.0,1.0,0.0,985.6,0.0,0.0,21.0,-0.9,3.6,5.3,0.1,0.0,0.7,0.0
25%,2005.25,4.0,8.0,54.0,1007.8,2.2,0.0,73.0,15.5,18.8,23.4,2.0,1.0,4.8,0.0
50%,2011.5,7.0,16.0,78.0,1012.7,3.2,0.0,79.0,21.1,24.2,28.2,3.0,5.2,6.5,0.0
75%,2017.0,10.0,23.0,87.0,1017.9,4.5,1.5,85.0,24.6,27.6,31.9,4.0,8.9,9.0,0.0
max,2023.0,12.0,31.0,100.0,1035.8,11.8,340.5,99.0,28.9,32.3,38.5,7.0,12.4,31.7,1.0


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8735 entries, 0 to 8734
Data columns (total 16 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Year              8735 non-null   int64  
 1   Month             8735 non-null   int64  
 2   Day               8735 non-null   int64  
 3   Date              8735 non-null   object 
 4   MeanCloudAmount   8735 non-null   int64  
 5   MeanPressure      8708 non-null   float64
 6   TotalEvaporation  8665 non-null   float64
 7   TotalRainfall     8552 non-null   float64
 8   MeanHumidity      8695 non-null   float64
 9   MinTemperature    8719 non-null   float64
 10  MeanTemperature   8718 non-null   float64
 11  MaxTemperature    8719 non-null   float64
 12  MeanUVIndex       8729 non-null   float64
 13  TotalSunshine     8735 non-null   float64
 14  MeanWindSpeed     8731 non-null   float64
 15  IsExtreme         8735 non-null   int64  
dtypes: float64(10), int64(5), object(1)
memory

## Drop Null Values

In [5]:
df.dropna(inplace=True)

## Assign X and y

In [6]:
X = df[['MeanCloudAmount', 'MeanPressure',
        'TotalEvaporation', 'TotalRainfall', 'MeanHumidity', 
        'MinTemperature', 'MeanTemperature', 'MaxTemperature', 
        'MeanUVIndex', 'TotalSunshine', 'MeanWindSpeed']]
y = df['IsExtreme']

## Train Test Split

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_STATE)

## Run PCA

In [8]:
pca = PCA(n_components=len(X.columns))
pca.fit(X)
pca.explained_variance_ratio_

array([5.67492663e-01, 2.41564700e-01, 1.23714759e-01, 4.67045397e-02,
       7.69715996e-03, 6.54930877e-03, 3.80562831e-03, 1.25431915e-03,
       7.29864736e-04, 3.65385586e-04, 1.21671733e-04])

## Examine the distribution of the output class

In [9]:
dummy = DummyClassifier(strategy='most_frequent')
dummy.fit(X, y)
dummy.score(X, y)

0.9439971584181861

## Run a model to examine the results

In [10]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

In [11]:
# model 1
# model = LogisticRegressionCV(scoring='recall', class_weight='balanced', 
#                                 n_jobs=-1, max_iter=1000, random_state=RANDOM_STATE)
# model 2
# rf = RandomForestClassifier(n_jobs=-1, random_state=RANDOM_STATE)
# model = RandomizedSearchCV(rf, {'max_depth': [64, 128, None],
#                                 'class_weight': [{0: 1, 1: 10}, 'balanced']}, 
#                                 scoring='recall')
# model 3
gb = GradientBoostingClassifier(random_state=RANDOM_STATE)
model = RandomizedSearchCV(gb, {'learning_rate': [1e-4, 1e-3, 1e-2, 1e-1], 
                                'max_depth': [3, 10, None],
                                'tol': [1e-5, 5e-5, 1e-4]}, 
                                scoring='recall')

In [12]:
model.fit(X_train_scaled, y_train)

In [13]:
# recall
X_test_scaled = scaler.transform(X_test)
model.score(X_test_scaled, y_test)

0.47115384615384615

In [14]:
# accuracy, balanced accuracy and roc/auc score
y_pred = model.predict(X_test_scaled)
print(accuracy_score(y_test, y_pred))
print(balanced_accuracy_score(y_test, y_pred))
print(roc_auc_score(y_test, y_pred))

0.9568047337278106
0.7299022698612863
0.7299022698612863


In [15]:
# confusion matrix
print(confusion_matrix(y_test, y_pred))

[[1568   18]
 [  55   49]]
