Simple Linear Regression (Humidity9am → Rainfall)
MSE: 1.5253551537449668
R² Score: 0.09107438688803127
Multiple Linear Regression (Top Features → Rainfall)
MSE: 9.409522142573474e-29
R² Score: 1.0
Logistic Regression
Accuracy: 0.9999258627719909
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00     16588
         1.0       1.00      1.00      1.00     10389

    accuracy                           1.00     26977
   macro avg       1.00      1.00      1.00     26977
weighted avg       1.00      1.00      1.00     26977
Random Forest
Accuracy: 1.0
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00     16588
         1.0       1.00      1.00      1.00     10389

    accuracy                           1.00     26977
   macro avg       1.00      1.00      1.00     26977
weighted avg       1.00      1.00      1.00     26977
SVM
Accuracy: 0.9946621195833488
              precision    recall  f1-score   support

         0.0       1.00      0.99      1.00     16588
         1.0       0.99      1.00      0.99     10389

    accuracy                           0.99     26977
   macro avg       0.99      1.00      0.99     26977
weighted avg       0.99      0.99      0.99     26977
Decision Tree
Accuracy: 1.0
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00     16588
         1.0       1.00      1.00      1.00     10389

    accuracy                           1.00     26977
   macro avg       1.00      1.00      1.00     26977
weighted avg       1.00      1.00      1.00     26977
Neural Network
Accuracy: 1.0
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00     16588
         1.0       1.00      1.00      1.00     10389

    accuracy                           1.00     26977
   macro avg       1.00      1.00      1.00     26977
weighted avg       1.00      1.00      1.00     26977


In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# 1. Read the CSV file into a DataFrame
dataset = pd.read_csv('Extracted-dataset.csv')
dataset.head()
# 2. Convert the two Yes/No rain flags to 1/0 with Series.map.
#    Any value that is not a key of the mapping (missing entries included)
#    becomes NaN.
yes_no_codes = {'No': 0, 'Yes': 1}
for flag_column in ('RainToday', 'RainTomorrow'):
    dataset[flag_column] = dataset[flag_column].map(yes_no_codes)
dataset.head()
# Imports used by the encoding / imputation steps that follow. They sit at
# top level (not inside the loop below) so they run exactly once and are
# available even when the frame has no object columns.
from sklearn.preprocessing import LabelEncoder
# Must be imported before IterativeImputer: it enables the experimental API.
from sklearn.experimental import enable_iterative_imputer  # noqa: F401
from sklearn.impute import IterativeImputer

# 3. Impute object/categorical columns with mode
#    (the first mode is used when several values tie).
for col in dataset.select_dtypes(include=['object']).columns:
    dataset[col] = dataset[col].fillna(dataset[col].mode()[0])

# 4. Integer-encode every remaining object column. One fitted LabelEncoder
#    per column is kept in `lencoders`, keyed by column name.
lencoders = {}
for col in dataset.select_dtypes(include=['object']).columns:
    encoder = lencoders[col] = LabelEncoder()
    dataset[col] = encoder.fit_transform(dataset[col])

# 5. Fill the remaining missing values with IterativeImputer (default
#    settings) and wrap the returned array in a DataFrame that reuses the
#    original column names.
# NOTE(review): the imputer is fitted on every column of `dataset`, which
# includes RainTomorrow - confirm that imputing with the target is intended.
mice_imputer = IterativeImputer()
imputed_values = mice_imputer.fit_transform(dataset)
MiceImputed = pd.DataFrame(imputed_values, columns=dataset.columns)
dataset.head()
# 6. Balance the classes (this happens BEFORE outlier removal).
from sklearn.utils import resample


def _plot_class_share(frame, title):
    """Show a bar chart of the normalized RainTomorrow value counts of *frame*."""
    figure = plt.figure(figsize=(8, 5))
    class_share = frame['RainTomorrow'].value_counts(normalize=True)
    class_share.plot(kind='bar', color=['skyblue', 'navy'], alpha=0.9, rot=0)
    plt.title(title)
    plt.show()
    return figure


# Rows whose RainTomorrow is exactly 0 / exactly 1; rows with any other
# value end up in neither subset.
target = MiceImputed['RainTomorrow']
no = MiceImputed[target == 0]
yes = MiceImputed[target == 1]
print("Before Oversampling:", target.value_counts())

fig = _plot_class_share(
    MiceImputed,
    'RainTomorrow Indicator No(0) and Yes(1) in the Imbalanced Dataset',
)

# Draw from the positive class with replacement until it has as many rows as
# the negative class; with no positive rows the imputed frame is used as-is.
if len(yes) == 0:
    print("⚠️ No samples with RainTomorrow == 1 — oversampling not possible.")
    oversampled = MiceImputed.copy()
else:
    yes_oversampled = resample(yes, replace=True, n_samples=len(no), random_state=123)
    oversampled = pd.concat([no, yes_oversampled])
    print("After Oversampling:", oversampled['RainTomorrow'].value_counts())

fig = _plot_class_share(
    oversampled,
    'RainTomorrow Indicator No(0) and Yes(1) after Oversampling (Balanced Dataset)',
)
# 7. IQR-based outlier filter, applied after balancing.
#    A row is dropped when at least one of its columns lies more than
#    1.5 * IQR below that column's Q1 or above its Q3.
# NOTE(review): the fences are computed for every column of `oversampled`,
# so RainTomorrow and the integer-encoded columns are tested too - confirm
# that this is intended.
Q1 = oversampled.quantile(0.25)
Q3 = oversampled.quantile(0.75)
IQR = Q3 - Q1
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR
has_outlier = ((oversampled < lower_fence) | (oversampled > upper_fence)).any(axis=1)
cleaned = oversampled[~has_outlier]

print("Final shape after outlier removal:", cleaned.shape)
print("Final class distribution:\n", cleaned['RainTomorrow'].value_counts(normalize=True))
# 8. Bar chart of the RainTomorrow class proportions left in `cleaned`
plt.figure(figsize=(8, 5))
cleaned_class_share = cleaned['RainTomorrow'].value_counts(normalize=True)
cleaned_class_share.plot.bar(color=['skyblue', 'navy'])
plt.title('Balanced Dataset After Cleaning & Outlier Removal')
plt.xlabel('RainTomorrow (0 = No, 1 = Yes)')
plt.ylabel('Proportion')
plt.show()
# 9. Optional sanity check: heatmap of the null mask of `cleaned`
#    (expected to show no missing cells).
missing_mask = cleaned.isnull()
sns.heatmap(missing_mask, cbar=False, cmap='viridis')
plt.title('Missing Values Heatmap (Should Be Clean)')
plt.show()
# 10. Correlation of each float64/int64 column with RainTomorrow
#     (DataFrame.corr with its default method), sorted from highest to lowest.
numeric_df = cleaned.select_dtypes(include=['float64', 'int64'])
correlation_matrix = numeric_df.corr()
correlation_with_target = correlation_matrix['RainTomorrow'].sort_values(ascending=False)
print(correlation_with_target)
# 11. Bar plot of those correlations: the target's own entry is removed, the
#     coefficients go on x and the feature names (the Series index) on y.
feature_correlations = correlation_with_target.drop('RainTomorrow')
sns.barplot(x=feature_correlations.values, y=feature_correlations.index)


In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import (
    accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay,
    roc_curve, auc, mean_squared_error, r2_score
)
import joblib

# === 1. Feature Selection ===
# Keep every column of `cleaned` whose absolute correlation with RainTomorrow
# is above 0.1, ordered by signed correlation (highest first); the target's
# own entry is removed from the list.
# NOTE(review): the selection is made on the full frame before any
# train/test split, and near-perfect downstream scores would be worth
# checking against the selected columns for target leakage - confirm.
correlation_with_target = cleaned.corr()['RainTomorrow'].sort_values(ascending=False)
strongly_correlated = correlation_with_target[correlation_with_target.abs() > 0.1]
top_features = strongly_correlated.drop('RainTomorrow').index.tolist()
print("Selected Top Features:", top_features)

# === 2. Simple Linear Regression: Humidity9am → Rainfall ===
print("\n🔹 Simple Linear Regression (Humidity9am → Rainfall)")
# Single predictor (kept as a one-column DataFrame) and the regression target.
X_lin = cleaned[['Humidity9am']]
y_lin = cleaned['Rainfall']

# 80/20 hold-out split with a fixed seed.
X_train_lin, X_test_lin, y_train_lin, y_test_lin = train_test_split(
    X_lin, y_lin, test_size=0.2, random_state=42
)

lin_model = LinearRegression()
lin_model.fit(X_train_lin, y_train_lin)
y_pred_lin = lin_model.predict(X_test_lin)

# Hold-out metrics.
lin_mse = mean_squared_error(y_test_lin, y_pred_lin)
lin_r2 = r2_score(y_test_lin, y_pred_lin)
print("MSE:", lin_mse)
print("R² Score:", lin_r2)

# Scatter of the held-out points with the fitted line drawn over them.
lin_fig, lin_ax = plt.subplots(figsize=(8, 5))
lin_ax.scatter(X_test_lin, y_test_lin, color='blue', label='Actual')
lin_ax.plot(X_test_lin, y_pred_lin, color='red', linewidth=2, label='Regression Line')
lin_ax.set_xlabel("Humidity9am")
lin_ax.set_ylabel("Rainfall")
lin_ax.set_title("Simple Linear Regression: Humidity9am vs Rainfall")
lin_ax.legend()
lin_ax.grid(True)
lin_fig.tight_layout()
plt.show()

# === 3. Multiple Linear Regression: top_features → Rainfall ===
print("\n🔹 Multiple Linear Regression (Top Features → Rainfall)")
# `top_features` is chosen by correlation with RainTomorrow and can contain
# 'Rainfall' itself. Rainfall is the target of this regression, so leaving it
# among the predictors lets the model copy the answer (MSE ~ 0, R² = 1.0).
# It is therefore removed from the predictor list.
# NOTE(review): if other selected columns are derived from Rainfall (RainToday
# possibly is), they leak the target as well - confirm and exclude as needed.
mlr_features = [feature for feature in top_features if feature != 'Rainfall']
if not mlr_features:
    raise ValueError("No predictor columns left for the Rainfall regression.")

X_mlr = cleaned[mlr_features]
y_mlr = cleaned['Rainfall']

# 80/20 hold-out split with a fixed seed.
X_train_mlr, X_test_mlr, y_train_mlr, y_test_mlr = train_test_split(X_mlr, y_mlr, test_size=0.2, random_state=42)

mlr_model = LinearRegression()
mlr_model.fit(X_train_mlr, y_train_mlr)
y_pred_mlr = mlr_model.predict(X_test_mlr)

print("MSE:", mean_squared_error(y_test_mlr, y_pred_mlr))
print("R² Score:", r2_score(y_test_mlr, y_pred_mlr))

# Plot predicted vs actual; the dashed red diagonal marks perfect predictions.
plt.figure(figsize=(8, 5))
plt.scatter(y_test_mlr, y_pred_mlr, alpha=0.6, color='green')
plt.plot([y_test_mlr.min(), y_test_mlr.max()], [y_test_mlr.min(), y_test_mlr.max()], 'r--')
plt.xlabel("Actual Rainfall")
plt.ylabel("Predicted Rainfall")
plt.title("Multiple Linear Regression: Predicted vs Actual Rainfall")
plt.grid(True)
plt.tight_layout()
plt.show()

# === 4. Classification Models ===
from sklearn.model_selection import GroupShuffleSplit

X = cleaned[top_features]
y = cleaned['RainTomorrow']

# `cleaned` is derived from a frame in which the positive class was resampled
# with replacement, so it contains exact copies of the same observation. A
# plain random split would put copies of one row into both the training and
# the test set and inflate every score. Rows are therefore grouped by a hash
# of their full content and each group is assigned to one side only.
row_groups = pd.util.hash_pandas_object(cleaned, index=False).to_numpy()
group_splitter = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_positions, test_positions = next(group_splitter.split(X, y, groups=row_groups))
X_train, X_test = X.iloc[train_positions], X.iloc[test_positions]
y_train, y_test = y.iloc[train_positions], y.iloc[test_positions]

# Standardize using statistics learned from the training rows only.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Candidate classifiers, keyed by the display name used in reports and plots.
models = {
    "Logistic Regression": LogisticRegression(),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "SVM": SVC(probability=True),  # probability=True enables predict_proba
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Neural Network": MLPClassifier(hidden_layer_sizes=(50, 30), max_iter=500, random_state=42)
}

# Running record of the most accurate model seen so far
# (the first model wins when accuracies tie).
best_model_name = None
best_model_score = 0
best_model_object = None

# Models that are trained on the unscaled feature frame.
tree_model_names = ('Random Forest', 'Decision Tree')

for name, model in models.items():
    print(f"\n📊 {name}")

    # Tree models get the raw features; every other model gets the
    # standardized matrices.
    is_tree_model = name in tree_model_names
    if is_tree_model:
        train_inputs, test_inputs = X_train, X_test
    else:
        train_inputs, test_inputs = X_train_scaled, X_test_scaled

    model.fit(train_inputs, y_train)
    y_pred = model.predict(test_inputs)
    # Score used for the ROC curve: second predict_proba column when the
    # model provides it, otherwise the decision-function value.
    if is_tree_model or hasattr(model, "predict_proba"):
        y_proba = model.predict_proba(test_inputs)[:, 1]
    else:
        y_proba = model.decision_function(test_inputs)

    acc = accuracy_score(y_test, y_pred)
    print("Accuracy:", acc)
    print(classification_report(y_test, y_pred))

    if acc > best_model_score:
        best_model_name, best_model_score, best_model_object = name, acc, model

    # Confusion matrix plot
    cm = confusion_matrix(y_test, y_pred)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=[0, 1])
    disp.plot(cmap='Blues')
    plt.title(f"Confusion Matrix - {name}")
    plt.show()

    # ROC curve with its area, plus the diagonal reference line
    fpr, tpr, _ = roc_curve(y_test, y_proba)
    roc_auc = auc(fpr, tpr)

    plt.figure(figsize=(6, 4))
    plt.plot(fpr, tpr, label=f"AUC = {roc_auc:.2f}")
    plt.plot([0, 1], [0, 1], "k--")
    plt.title(f"ROC Curve - {name}")
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.show()

    # Only the tree models get a feature-importance report.
    if not is_tree_model:
        continue

    # Order the features from most to least important.
    importance = model.feature_importances_
    sorted_idx = np.argsort(importance)[::-1]
    sorted_features = np.array(top_features)[sorted_idx]
    sorted_importance = importance[sorted_idx]

    plt.figure(figsize=(8, 5))
    ax = sns.barplot(x=sorted_importance, y=sorted_features, color='teal')
    plt.title(f"Feature Importance - {name}")
    plt.xlabel("Importance Score")
    plt.ylabel("Feature")

    # Write each score just to the right of its bar.
    for i, score in enumerate(sorted_importance):
        ax.text(score + 0.001, i, f"{score:.4f}", va="center", fontsize=8)

    plt.tight_layout()
    plt.show()

    print(f"🔍 {name} Feature Importances:")
    for feat, score in zip(sorted_features, sorted_importance):
        print(f"{feat}: {score:.4f}")

# === 5. Save Best Model ===
# Explicit None check: whether a model was selected should not depend on the
# truthiness of the estimator object.
if best_model_object is not None:
    filename = f"{best_model_name.replace(' ', '_')}_best_model.pkl"
    joblib.dump(best_model_object, filename)
    print(f"\n💾 Best Model: {best_model_name} with accuracy = {best_model_score:.4f}")
    print(f"Saved as: {filename}")

    # Every model except the two tree models was trained on standardized
    # inputs, so the pickled estimator is only usable together with the
    # fitted scaler. Persist the scaler next to it in that case.
    if best_model_name not in ('Random Forest', 'Decision Tree'):
        scaler_filename = f"{best_model_name.replace(' ', '_')}_scaler.pkl"
        joblib.dump(scaler, scaler_filename)
        print(f"Scaler saved as: {scaler_filename}")
