In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive so that the
# notebook can reach files stored under that path.
from google.colab import drive
drive.mount('/content/drive')

Import Statements

In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
import matplotlib.pyplot as plt
from sklearn.metrics import ConfusionMatrixDisplay,confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import Perceptron
from sklearn.neural_network import MLPClassifier
from sklearn.feature_selection import SelectKBest
from sklearn.utils import resample
from imblearn.over_sampling import SMOTE
from collections import Counter

In [None]:
# Change the working directory (IPython `cd` magic) to the project folder on the mounted Drive; the CSV files loaded later are presumably resolved relative to it.
cd /content/drive/MyDrive/Colab Notebooks/Data Analytics python

Load Datasets

In [None]:
# Read both CSV files from the current working directory: first the weather
# data used for training, then the "unknown" data set.
data, unknown_data_cleaned = (
    pd.read_csv(csv_name)
    for csv_name in ('Assignment3-WeatherData.csv', 'Assignment3-UnknownData.csv')
)

Data Preprocessing

In [None]:
# Summarise missing values in the raw training frame before any cleaning.
null_mask = data.isnull()

# Per-column count of missing cells; reused by the bar chart in the next cell.
null_count = null_mask.sum()

print("Number of Rows Initially: " + str(data.shape[0]))
# Count rows that contain at least one null. Summing the flattened mask would
# count every missing cell, so a row with three nulls would be counted three times.
print("Number of Rows with Null Values: " + str(null_mask.any(axis=1).sum()))
# mean() of the boolean mask is the fraction of nulls per column; * 100 gives a percentage.
print("Null values in each column (%):\n" + str(null_mask.mean() * 100))

In [None]:
# Bar chart of the per-column null counts held in `null_count`.
fig, ax = plt.subplots(figsize=(10, 6))
null_count.plot(kind='bar', ax=ax)
ax.set_title('Number of Null Values per Column')
ax.set_xlabel('Columns')
ax.set_ylabel('Count of Null Values')
plt.show()

Cleaning Data (Imputing Nulls)

In [None]:
# Feature groups: numeric columns are imputed with a median, categorical
# columns with a mode.
numerical_columns = [
    'MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation', 'Sunshine',
    'WindGustSpeed', 'WindSpeed9am', 'WindSpeed3pm',
    'Humidity9am', 'Humidity3pm', 'Pressure9am', 'Pressure3pm',
    'Cloud9am', 'Cloud3pm', 'Temp9am', 'Temp3pm'
]

categorical_columns = ['Location', 'WindGustDir', 'WindDir9am', 'WindDir3pm', 'RainToday']

# Learn the fill values once, from the labelled data only. The unlabelled frame
# is filled with these same values so both frames go through identical
# preprocessing; imputing it with its own median/mode would make the features
# mean something slightly different from what the models were trained on.
numeric_fill_values = data[numerical_columns].median()
# DataFrame.mode() returns one row per tied mode; row 0 is the first mode of each column.
categorical_fill_values = data[categorical_columns].mode().iloc[0]
fill_values = {**numeric_fill_values.to_dict(), **categorical_fill_values.to_dict()}

# fillna returns a new frame, so the raw `data` frame is left untouched.
# Columns that are not keys of `fill_values` are not filled.
data_cleaned = data.fillna(value=fill_values)
print(data_cleaned.isnull().sum())

unknown_data_cleaned = unknown_data_cleaned.fillna(value=fill_values)
print(unknown_data_cleaned.isnull().sum())

# Drop the identifier column from the unlabelled frame. errors='ignore' keeps
# this cell re-runnable: the column is already gone on a second execution.
# NOTE(review): 'row ID' is only removed from the unlabelled frame here - confirm
# that the training CSV has no such column, otherwise the two frames differ.
unknown_data_cleaned = unknown_data_cleaned.drop(columns='row ID', errors='ignore')

In [None]:
# Display the cleaned training frame as a quick visual sanity check.
data_cleaned

In [None]:
# Standardise the numeric features (zero mean, unit variance per column).
#
# The scaler is fitted exactly once, on the labelled data, and the learned
# per-column mean/std are then applied to the unlabelled data with transform().
# Re-fitting on the unlabelled frame would standardise it with different
# parameters from the ones the models are trained on, and would leave `scaler`
# holding statistics of the wrong frame.
# StandardScaler works column by column, so fitting all numeric columns in one
# call yields the same training values as fitting each column separately.
# NOTE(review): the scaler still sees every labelled row, including the rows
# that a later cell holds out as the test split.
scaler = StandardScaler()

data_cleaned[numerical_columns] = scaler.fit_transform(data_cleaned[numerical_columns])
unknown_data_cleaned[numerical_columns] = scaler.transform(unknown_data_cleaned[numerical_columns])

In [None]:
def _one_hot_frame(encoder, frame, column, fit=False):
    """Return the one-hot expansion of ``frame[column]`` as a DataFrame.

    Parameters
    ----------
    encoder : OneHotEncoder
        Encoder producing dense output. It is fitted in place when ``fit`` is true.
    frame : pd.DataFrame
        Frame that contains ``column``.
    column : str
        Name of the categorical column to expand.
    fit : bool, default False
        When true, fit the encoder on ``frame[column]`` before transforming;
        otherwise reuse the categories the encoder has already learned.

    Returns
    -------
    pd.DataFrame
        One column per category, named by ``encoder.get_feature_names_out``.
        The result carries ``frame``'s index so that ``pd.concat(..., axis=1)``
        lines rows up by label instead of relying on a default RangeIndex.
    """
    values = frame[[column]]
    encoded = encoder.fit_transform(values) if fit else encoder.transform(values)
    return pd.DataFrame(
        encoded,
        columns=encoder.get_feature_names_out([column]),
        index=frame.index,
    )


# Initialize encoders
le_rain_today = LabelEncoder()
encoder_gust_dir = OneHotEncoder(sparse_output=False)
encoder_wind_9am = OneHotEncoder(sparse_output=False)
encoder_wind_3pm = OneHotEncoder(sparse_output=False)
le_location = LabelEncoder()

# Integer-encode RainToday in data_cleaned
data_cleaned['RainToday'] = le_rain_today.fit_transform(data_cleaned['RainToday'])

# Integer-encode Location (label encoding, not frequency encoding)
data_cleaned['Location'] = le_location.fit_transform(data_cleaned['Location'])

# One-hot encode the three wind-direction columns in data_cleaned. Each source
# column is dropped and its indicator columns are appended at the end.
wind_gust_df = _one_hot_frame(encoder_gust_dir, data_cleaned, 'WindGustDir', fit=True)
data_cleaned = pd.concat([data_cleaned.drop(columns='WindGustDir'), wind_gust_df], axis=1)

wind_9am_df = _one_hot_frame(encoder_wind_9am, data_cleaned, 'WindDir9am', fit=True)
data_cleaned = pd.concat([data_cleaned.drop(columns='WindDir9am'), wind_9am_df], axis=1)

wind_3pm_df = _one_hot_frame(encoder_wind_3pm, data_cleaned, 'WindDir3pm', fit=True)
data_cleaned = pd.concat([data_cleaned.drop(columns='WindDir3pm'), wind_3pm_df], axis=1)

# Apply the already-fitted encoders to unknown_data_cleaned (transform only, no
# re-fitting) so that both frames share the same codes and indicator columns.
unknown_data_cleaned['Location'] = le_location.transform(unknown_data_cleaned['Location'])
unknown_data_cleaned['RainToday'] = le_rain_today.transform(unknown_data_cleaned['RainToday'])

wind_gust_unknown_df = _one_hot_frame(encoder_gust_dir, unknown_data_cleaned, 'WindGustDir')
unknown_data_cleaned = pd.concat(
    [unknown_data_cleaned.drop(columns='WindGustDir'), wind_gust_unknown_df], axis=1
)

wind_9am_unknown_df = _one_hot_frame(encoder_wind_9am, unknown_data_cleaned, 'WindDir9am')
unknown_data_cleaned = pd.concat(
    [unknown_data_cleaned.drop(columns='WindDir9am'), wind_9am_unknown_df], axis=1
)

wind_3pm_unknown_df = _one_hot_frame(encoder_wind_3pm, unknown_data_cleaned, 'WindDir3pm')
unknown_data_cleaned = pd.concat(
    [unknown_data_cleaned.drop(columns='WindDir3pm'), wind_3pm_unknown_df], axis=1
)

# Check for nulls and confirm encoding: a misaligned concat would show up here
# as NaNs in the appended indicator columns.
print(data_cleaned.isnull().sum())
print(unknown_data_cleaned.isnull().sum())

In [None]:
# Separate the RainTomorrow target from the remaining feature columns.
y = data_cleaned['RainTomorrow']
X = data_cleaned.drop(columns='RainTomorrow')

In [None]:
# Class distribution of the target before any resampling.
class_counts = y.value_counts()
print(class_counts)

# Pie chart of the class counts, drawn on an explicitly created axes.
fig, ax = plt.subplots(figsize=(8, 6))
class_counts.plot(kind='pie', ax=ax)
ax.set_title('Counts of 0s and 1s in RainTomorrow')
ax.set_xlabel('RainTomorrow (0 = No, 1 = Yes)')
ax.set_ylabel('Count')
plt.xticks(rotation=0)
plt.show()

In [None]:
# Method 1: under-sample the majority class, then let SMOTE synthesise minority
# rows until both classes have the same size. Only the training split is
# resampled; X_test / y_test keep the original class distribution.
X = data_cleaned.drop('RainTomorrow', axis=1)
y = data_cleaned['RainTomorrow']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Separate majority (0) and minority (1) rows of the training features
majority_class = X_train[y_train == 0]
minority_class = X_train[y_train == 1]

# Keep 1.5x as many majority rows as there are minority rows. The count is
# capped at the number of available majority rows because sampling without
# replacement cannot return more rows than it is given.
n_majority_kept = min(len(majority_class), int(len(minority_class) * 1.5))
majority_downsampled = resample(
    majority_class,
    replace=False,
    n_samples=n_majority_kept,
    random_state=42
)

# Combine the downsampled majority rows with all minority rows
X_combined = pd.concat([majority_downsampled, minority_class])
# Fetch the labels by row index so every label belongs to the feature row it is
# paired with. Taking the first N majority labels positionally would leave the
# label index out of step with X_combined.
# Assumes the row index is unique, as it is for a freshly read CSV - TODO confirm.
y_combined = y_train.loc[X_combined.index]

# Use SMOTE to balance the classes equally
smote = SMOTE(sampling_strategy='auto', random_state=42)
X_balanced, Y_balanced = smote.fit_resample(X_combined, y_combined)


# #Method 2
# minority_class = data_cleaned[data_cleaned['RainTomorrow'] == 1]
# majority_class = data_cleaned[data_cleaned['RainTomorrow'] == 0]

# # Downsample the majority class
# majority_downsampled = resample(majority_class, replace=False, n_samples=len(minority_class), random_state=42)

# # Combine the downsampled majority class with the minority class
# balanced_data = pd.concat([minority_class, majority_downsampled])

# X_balanced = balanced_data.drop('RainTomorrow', axis = 1)
# Y_balanced = balanced_data['RainTomorrow']

# # Check class distribution
# counter = Counter(Y_balanced)
# print("Class distribution:", counter)

# #Method 3
# from imblearn.under_sampling import RandomUnderSampler

# X = data_cleaned.drop('RainTomorrow', axis=1)
# y = data_cleaned['RainTomorrow']

# # Split data into train and test sets
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# # Check initial class distribution
# counter = Counter(y_train)
# print("Original training set class distribution:", counter)

# # Step 1: Downsample majority class to 2 times the minority class size
# majority_class_size = int(counter[1] * 2)
# under_sampler = RandomUnderSampler(sampling_strategy={0: majority_class_size, 1: counter[1]}, random_state=42)
# X_train_resampled, y_train_resampled = under_sampler.fit_resample(X_train, y_train)

# # Check distribution after downsampling
# print("Class distribution after downsampling:", Counter(y_train_resampled))

# # Step 2: Apply SMOTE to further balance classes
# # smote = SMOTE(sampling_strategy=0.51, random_state=42)  # Oversamples the minority class until it is 51% the size of the majority class
# X_balanced, y_balanced = smote.fit_resample(X_train_resampled, y_train_resampled)

# # Check final class distribution
# print("Final balanced class distribution:", Counter(y_balanced))

In [None]:
# Class distribution of the resampled training labels.
balanced_counts = Y_balanced.value_counts()
print(balanced_counts)

# Pie chart of the resampled class counts, drawn on an explicitly created axes.
fig, ax = plt.subplots(figsize=(8, 6))
balanced_counts.plot(kind='pie', ax=ax)
ax.set_title('Counts of 0s and 1s in RainTomorrow')
ax.set_xlabel('RainTomorrow (0 = No, 1 = Yes)')
ax.set_ylabel('Count')
plt.xticks(rotation=0)
plt.show()

In [None]:
# Depth-limited decision tree: fitted on the resampled training data and
# evaluated on the held-out test split.
dt = DecisionTreeClassifier(
    criterion='gini',
    max_depth=5,
    min_samples_split=2,
    min_samples_leaf=2,
    random_state=42,
).fit(X_balanced, Y_balanced)

y_preds_dt = dt.predict(X_test)

# Per-class precision, recall and F1 on the test split.
print(classification_report(y_test, y_preds_dt))

# Confusion matrix of the same predictions.
cm = confusion_matrix(y_test, y_preds_dt)
cmp = ConfusionMatrixDisplay(cm, display_labels=["Negative", "Positive"])
fig, ax = plt.subplots(figsize=(5, 4), dpi=100)
cmp.plot(ax=ax)

In [None]:
# k-nearest-neighbours classifier: 25 neighbours, votes weighted by inverse
# distance, Manhattan metric. Fitted on the resampled training data.
knn = KNeighborsClassifier(
    n_neighbors=25, weights='distance', metric='manhattan', p=1
).fit(X_balanced, Y_balanced)

y_pred_knn = knn.predict(X_test)

# Text report on the held-out test split.
print(classification_report(y_test, y_pred_knn))

# Confusion matrix of the same predictions.
cm = confusion_matrix(y_test, y_pred_knn)
cmp = ConfusionMatrixDisplay(cm, display_labels=["Negative", "Positive"])
fig, ax = plt.subplots(figsize=(5, 4), dpi=100)
cmp.plot(ax=ax)

In [None]:
# Random forest of 400 entropy-criterion trees, fitted on the resampled
# training data and evaluated on the held-out test split.
rff = RandomForestClassifier(n_estimators=400, criterion='entropy', random_state=42)
rff = rff.fit(X_balanced, Y_balanced)

y_preds_rff = rff.predict(X_test)
print(classification_report(y_test, y_preds_rff))

# Confusion matrix of the same predictions.
cm = confusion_matrix(y_test, y_preds_rff)
cmp = ConfusionMatrixDisplay(cm, display_labels=["Negative", "Positive"])
fig, ax = plt.subplots(figsize=(5, 4), dpi=100)
cmp.plot(ax=ax)

In [None]:
# RBF-kernel support vector classifier (C=10, gamma='scale'), fitted on the
# resampled training data and evaluated on the held-out test split.
svc = SVC(
    C=10,
    kernel='rbf',
    gamma='scale',
    random_state=42,
).fit(X_balanced, Y_balanced)

y_preds_svc = svc.predict(X_test)

print(classification_report(y_test, y_preds_svc))

# Confusion matrix of the same predictions.
cm = confusion_matrix(y_test, y_preds_svc)
cmp = ConfusionMatrixDisplay(cm, display_labels=["Negative", "Positive"])
fig, ax = plt.subplots(figsize=(5, 4), dpi=100)
cmp.plot(ax=ax)

In [None]:
# Multi-layer perceptron with four hidden layers (200, 100, 50 and 25 units),
# trained with SGD, an adaptive learning rate and early stopping.
pt = MLPClassifier(
    hidden_layer_sizes=(200, 100, 50, 25),
    activation='relu',
    solver="sgd",
    alpha=0.00001,
    batch_size='auto',
    learning_rate='adaptive',
    learning_rate_init=0.0001,
    max_iter=10000,
    shuffle=True,
    random_state=42,
    early_stopping=True,
).fit(X_balanced, Y_balanced)

y_pred_pt = pt.predict(X_test)

# Text report on the held-out test split.
print(classification_report(y_test, y_pred_pt))

# Confusion matrix of the same predictions.
cm = confusion_matrix(y_test, y_pred_pt)
cmp = ConfusionMatrixDisplay(cm, display_labels=["Negative", "Positive"])
fig, ax = plt.subplots(figsize=(5, 4), dpi=100)
cmp.plot(ax=ax)