In [78]:
# Necessary Imports
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier

In [79]:
# Load the training data
# The CSV path is relative, so the file must sit in the notebook's working directory.
train_data = pd.read_csv("Weather Training Data.csv")
# Summary statistics for the numeric columns; a `count` below the row total
# means that column has missing values.
train_data.describe()

Unnamed: 0,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustSpeed,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainTomorrow
count,99073.0,99286.0,98537.0,56985.0,52199.0,93036.0,98581.0,97681.0,98283.0,97010.0,89768.0,89780.0,61944.0,59514.0,98902.0,97612.0,99516.0
mean,12.176266,23.218513,2.353024,5.46132,7.61509,39.976966,14.004849,18.650464,68.866376,51.433296,1017.684638,1015.286204,4.447985,4.519122,16.970041,21.68134,0.224677
std,6.390882,7.115072,8.487866,4.16249,3.783008,13.581524,8.902323,8.801827,19.074951,20.777616,7.110166,7.045189,2.88658,2.716618,6.488961,6.931681,0.417372
min,-8.5,-4.1,0.0,0.0,0.0,6.0,0.0,0.0,0.0,0.0,980.5,978.2,0.0,0.0,-7.0,-5.1,0.0
25%,7.6,17.9,0.0,2.6,4.8,31.0,7.0,13.0,57.0,37.0,1013.0,1010.5,1.0,2.0,12.3,16.6,0.0
50%,12.0,22.6,0.0,4.8,8.4,39.0,13.0,19.0,70.0,52.0,1017.7,1015.3,5.0,5.0,16.7,21.1,0.0
75%,16.8,28.2,0.8,7.4,10.6,48.0,19.0,24.0,83.0,65.0,1022.4,1020.0,7.0,7.0,21.5,26.4,0.0
max,33.9,48.1,371.0,86.2,14.5,135.0,130.0,87.0,100.0,100.0,1041.0,1039.6,9.0,9.0,40.2,46.7,1.0


In [80]:
# Preview the first 10 rows to see the raw column names and value formats.
train_data.head(10)

Unnamed: 0,row ID,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,Row0,Albury,13.4,22.9,0.6,,,W,44.0,W,...,71.0,22.0,1007.7,1007.1,8.0,,16.9,21.8,No,0
1,Row1,Albury,7.4,25.1,0.0,,,WNW,44.0,NNW,...,44.0,25.0,1010.6,1007.8,,,17.2,24.3,No,0
2,Row2,Albury,17.5,32.3,1.0,,,W,41.0,ENE,...,82.0,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,No,0
3,Row3,Albury,14.6,29.7,0.2,,,WNW,56.0,W,...,55.0,23.0,1009.2,1005.4,,,20.6,28.9,No,0
4,Row4,Albury,7.7,26.7,0.0,,,W,35.0,SSE,...,48.0,19.0,1013.4,1010.1,,,16.3,25.5,No,0
5,Row5,Albury,13.1,30.1,1.4,,,W,28.0,S,...,58.0,27.0,1007.0,1005.7,,,20.1,28.2,Yes,0
6,Row6,Albury,13.4,30.4,0.0,,,N,30.0,SSE,...,48.0,22.0,1011.8,1008.7,,,20.4,28.8,No,1
7,Row7,Albury,15.9,21.7,2.2,,,NNE,31.0,NE,...,89.0,91.0,1010.5,1004.2,8.0,8.0,15.9,17.0,Yes,1
8,Row8,Albury,12.6,21.0,3.6,,,SW,44.0,W,...,65.0,43.0,1001.2,1001.8,,7.0,15.8,19.8,Yes,0
9,Row9,Albury,9.8,27.7,,,,WNW,50.0,,...,50.0,28.0,1013.4,1010.3,0.0,,17.3,26.2,,0


In [81]:
# Count missing values per column to decide which features need imputation.
train_data.isnull().sum()

row ID               0
Location             0
MinTemp            443
MaxTemp            230
Rainfall           979
Evaporation      42531
Sunshine         47317
WindGustDir       6521
WindGustSpeed     6480
WindDir9am        7006
WindDir3pm        2648
WindSpeed9am       935
WindSpeed3pm      1835
Humidity9am       1233
Humidity3pm       2506
Pressure9am       9748
Pressure3pm       9736
Cloud9am         37572
Cloud3pm         40002
Temp9am            614
Temp3pm           1904
RainToday          979
RainTomorrow         0
dtype: int64

In [82]:
# Split the frame: every column except the label becomes a feature,
# and the label column ("RainTomorrow") is the prediction target.
X = train_data.drop("RainTomorrow", axis=1)
y = train_data["RainTomorrow"]

In [83]:
# Identify categorical and numerical columns
# Text columns are kept only when they have fewer than MAX_CARDINALITY distinct
# values (cheap to one-hot encode); higher-cardinality text columns are dropped.
MAX_CARDINALITY = 10

# Test the dtype with is_string_dtype rather than comparing it to the literal
# "object", so columns stored with pandas' dedicated string dtype are also
# recognised instead of being silently excluded.
low_cardinality_cols = [
    col
    for col in X.columns
    if pd.api.types.is_string_dtype(X[col].dtype)
    and X[col].nunique() < MAX_CARDINALITY
]

# select_dtypes("number") keeps every integer/float width (not only int64 and
# float64) and preserves the original column order.
numerical_cols = X.select_dtypes(include="number").columns.tolist()

# Restrict the feature matrix to the selected columns; .copy() detaches it from
# the frame it was sliced from.
my_cols = low_cardinality_cols + numerical_cols
X = X[my_cols].copy()

In [84]:
# Show which text columns passed the cardinality filter and will be one-hot encoded.
low_cardinality_cols

['RainToday']

In [85]:
# Preprocessing: one sub-pipeline per column type.

# Numeric columns: fill gaps with the column median, then standardise.
numerical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode (handle_unknown="ignore" so unseen categories do not raise).
categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

# Route each column group to its matching sub-pipeline.
preprocessor = ColumnTransformer([
    ("num", numerical_transformer, numerical_cols),
    ("cat", categorical_transformer, low_cardinality_cols),
])

In [86]:
# Gradient-boosted classifier chained behind the preprocessing step, so the
# imputers/scaler/encoder are re-fitted inside every cross-validation fold.
my_model = XGBClassifier(
    n_estimators=100,
    learning_rate=0.08,
    random_state=42,
    n_jobs=5,
)
my_pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("model", my_model),
])

# 20-fold cross-validation, scored by accuracy; report the mean across folds.
cv_scores = cross_val_score(my_pipeline, X, y, scoring="accuracy", cv=20)
mean_accuracy = cv_scores.mean()

print(f"Cross-validated Accuracy: {mean_accuracy:.4f} ({mean_accuracy * 100:.2f}%)")

Cross-validated Accuracy: 0.8427 (84.27%)


In [87]:
# Read the unlabelled test set and keep the same feature columns used in training.
test_data = pd.read_csv("Weather Test Data.csv")
X_test = test_data.loc[:, my_cols].copy()

# Refit the pipeline on the full training set, then predict the test rows.
preds = my_pipeline.fit(X, y).predict(X_test)

# Pair each test row ID with its predicted label and write the submission file.
submission = test_data[["row ID"]].assign(RainTomorrow=preds)
submission.to_csv("submission.csv", index=False)
submission

Unnamed: 0,row ID,RainTomorrow
0,Row0,0
1,Row1,0
2,Row2,0
3,Row3,0
4,Row4,1
...,...,...
42672,Row43633,0
42673,Row43634,0
42674,Row43635,0
42675,Row43636,0
