In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.utils import resample

# 1. Load dataset
dataset = pd.read_csv('Extracted-dataset.csv')

# 2. Encode binary categorical labels (use map to avoid FutureWarning).
#    Any value that is not exactly 'No' or 'Yes' (including a missing one)
#    comes out of .map() as NaN.
binary_codes = {'No': 0, 'Yes': 1}
dataset['RainToday'] = dataset['RainToday'].map(binary_codes)
dataset['RainTomorrow'] = dataset['RainTomorrow'].map(binary_codes)

# Drop rows whose target is unknown before anything is imputed. If they stayed,
# the iterative imputer further down would fill RainTomorrow with continuous
# estimates; such rows match neither `== 0` nor `== 1` in the class split and
# would be discarded silently after having influenced the imputation of every
# other column.
dataset = dataset.dropna(subset=['RainTomorrow']).reset_index(drop=True)

# 3 & 4. Prepare the object-dtype (categorical) columns in a single pass:
#   - fill gaps with the column's most frequent value (first mode),
#   - replace the strings with integer codes via a per-column LabelEncoder.
# The fitted encoders are kept in `lencoders`, keyed by column name, so the
# integer codes can be mapped back to the original labels later.
lencoders = {}
categorical_columns = dataset.select_dtypes(include=['object']).columns
for column in categorical_columns:
    most_frequent = dataset[column].mode()[0]
    filled = dataset[column].fillna(most_frequent)

    encoder = LabelEncoder()
    dataset[column] = encoder.fit_transform(filled)
    lencoders[column] = encoder

# 5. Fill the remaining gaps with scikit-learn's IterativeImputer (MICE-style).
# The imputer is fitted on the whole frame, i.e. on every column that is still
# present at this point, not only on the originally numeric ones.
mice_imputer = IterativeImputer()
imputed_values = mice_imputer.fit_transform(dataset)

# Re-wrap the imputed values in a DataFrame carrying the original column
# labels; no index is passed, so the new frame gets a default index.
MiceImputed = pd.DataFrame(imputed_values, columns=dataset.columns)

# 6. Split classes for oversampling BEFORE removing outliers
no = MiceImputed[MiceImputed['RainTomorrow'] == 0]
yes = MiceImputed[MiceImputed['RainTomorrow'] == 1]

# Oversample the minority class.
# The smaller class is detected instead of assuming it is always "yes", and an
# empty class on either side falls back to the unbalanced data (resampling
# towards an empty class would otherwise produce an empty frame).
# NOTE(review): the balancing happens before any train/test split, so copies of
# the same row can end up on both sides of a later split — consider resampling
# the training portion only.
if len(yes) == 0 or len(no) == 0:
    missing_class = 1 if len(yes) == 0 else 0
    print(f"⚠ No samples with RainTomorrow == {missing_class} — oversampling not possible.")
    oversampled = MiceImputed.copy()
else:
    # On a tie "yes" is treated as the minority, matching the usual case where
    # rainy days are the rarer class.
    if len(yes) <= len(no):
        minority, majority = yes, no
    else:
        minority, majority = no, yes

    # Draw with replacement until the minority matches the majority in size.
    minority_oversampled = resample(
        minority, replace=True, n_samples=len(majority), random_state=123
    )
    oversampled = pd.concat([majority, minority_oversampled])
    print("After Oversampling:", oversampled['RainTomorrow'].value_counts())

# 7. Now remove outliers (after balancing)
# The 1.5 * IQR rule only makes sense for continuous measurements, so the
# yes/no flags (including the target) and the label-encoded categoricals are
# left out of the check. For a 0/1 column whose rarer value covers fewer than
# 25% of the rows the IQR is 0, and every row holding that rarer value would be
# thrown away as an "outlier".
non_continuous = {'RainToday', 'RainTomorrow', *lencoders}
continuous_cols = [col for col in oversampled.columns if col not in non_continuous]
continuous_values = oversampled[continuous_cols]

Q1 = continuous_values.quantile(0.25)
Q3 = continuous_values.quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# A row is dropped as soon as one of its continuous values leaves the fences.
is_outlier = ((continuous_values < lower_bound) | (continuous_values > upper_bound)).any(axis=1)
# .copy() makes `cleaned` an independent frame rather than a slice of `oversampled`.
cleaned = oversampled[~is_outlier].copy()

print("Final shape after outlier removal:", cleaned.shape)
print("Final class distribution:\n", cleaned['RainTomorrow'].value_counts(normalize=True))

# 8. Bar chart of the share of each RainTomorrow class in the cleaned data.
plt.figure(figsize=(8,5))
class_share = cleaned['RainTomorrow'].value_counts(normalize=True)
balance_ax = class_share.plot(kind='bar', color=['skyblue', 'navy'])
balance_ax.set_title('Balanced Dataset After Cleaning & Outlier Removal')
balance_ax.set_xlabel('RainTomorrow (0 = No, 1 = Yes)')
balance_ax.set_ylabel('Proportion')
plt.show()

# 9. Optional sanity check: heatmap of the null mask. After imputation no
# cell is expected to be marked as missing.
missing_mask = cleaned.isnull()
missing_ax = sns.heatmap(missing_mask, cbar=False, cmap='viridis')
missing_ax.set_title('Missing Values Heatmap (Should Be Clean)')
plt.show()

# 10. Correlation of every float64/int64 column with RainTomorrow (DataFrame.corr
# default method), listed from the most positive to the most negative.
numeric_df = cleaned.select_dtypes(include=['float64', 'int64'])
correlation_matrix = numeric_df.corr()
correlation_with_target = correlation_matrix['RainTomorrow'].sort_values(ascending=False)
print(correlation_with_target)

# 11. Horizontal bar plot of those correlations. The target's correlation with
# itself is removed so that it does not appear as a bar.
feature_correlations = correlation_with_target.drop('RainTomorrow')

plt.figure(figsize=(10, 8))
correlation_ax = sns.barplot(
    x=feature_correlations.values,
    y=feature_correlations.index,
    palette='coolwarm',
)
correlation_ax.set_title('Correlation of Features with RainTomorrow')
correlation_ax.set_xlabel('Correlation Coefficient')
correlation_ax.set_ylabel('Features')
plt.tight_layout()
plt.show()

In [None]:
# === IMPORTS ===
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, accuracy_score, mean_squared_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns

# === 1. FEATURE SELECTION (optional) ===

# Heatmap of the pairwise correlations between all columns of `cleaned`.
plt.figure(figsize=(10, 8))
corr = cleaned.corr()
heatmap_ax = sns.heatmap(corr, annot=False, cmap='coolwarm')
heatmap_ax.set_title("Correlation Heatmap")
plt.show()

# Keep the columns whose absolute correlation with RainTomorrow exceeds the
# threshold; the target's own entry is dropped first.
CORRELATION_THRESHOLD = 0.1  # You can change 0.1 to any threshold
target_corr = corr['RainTomorrow'].drop('RainTomorrow')
is_relevant = target_corr.abs() > CORRELATION_THRESHOLD
top_features = target_corr.loc[is_relevant].index.tolist()
print("Selected Features for Classification:", top_features)

# === 2. REGRESSION MODELS (Rainfall as target) ===

## SIMPLE LINEAR REGRESSION: Humidity9am → Rainfall
# One predictor, kept as a single-column DataFrame (double brackets) so the
# estimator receives a 2-D input.
X_lin = cleaned[['Humidity9am']]
y_lin = cleaned['Rainfall']

# 80/20 split with a fixed seed.
X_train_lin, X_test_lin, y_train_lin, y_test_lin = train_test_split(
    X_lin, y_lin, test_size=0.2, random_state=42
)

lin_reg = LinearRegression().fit(X_train_lin, y_train_lin)
y_pred_lin = lin_reg.predict(X_test_lin)

# Score on the held-out part.
simple_mse = mean_squared_error(y_test_lin, y_pred_lin)
simple_r2 = r2_score(y_test_lin, y_pred_lin)

print("\n✅ Simple Linear Regression")
print("MSE:", simple_mse)
print("R²:", simple_r2)

## MULTIPLE LINEAR REGRESSION: top_features → Rainfall
# top_features was selected by correlation with RainTomorrow and may contain
# 'Rainfall' itself. Using the regression target as one of its own predictors
# makes the fit trivial and the reported scores meaningless, so it is removed
# from the predictor list here (top_features itself is left untouched).
mlr_features = [feature for feature in top_features if feature != 'Rainfall']

if not mlr_features:
    # Nothing left to regress on; skip instead of failing on an empty design matrix.
    print("\n⚠ Multiple Linear Regression skipped: no predictor other than 'Rainfall' was selected.")
else:
    X_mlr = cleaned[mlr_features]
    y_mlr = cleaned['Rainfall']

    # 80/20 split with a fixed seed.
    X_train_mlr, X_test_mlr, y_train_mlr, y_test_mlr = train_test_split(
        X_mlr, y_mlr, test_size=0.2, random_state=42
    )

    mlr = LinearRegression()
    mlr.fit(X_train_mlr, y_train_mlr)
    y_pred_mlr = mlr.predict(X_test_mlr)

    print("\n✅ Multiple Linear Regression")
    print("MSE:", mean_squared_error(y_test_mlr, y_pred_mlr))
    print("R²:", r2_score(y_test_mlr, y_pred_mlr))

# === 3. CLASSIFICATION MODELS (RainTomorrow as target) ===

# Predictors are the correlation-selected columns; the label is RainTomorrow.
# (cleaned.drop(columns='RainTomorrow') would use every feature instead.)
y = cleaned['RainTomorrow']
X = cleaned[top_features]

# 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Standardize using statistics learned from the training part only, then apply
# the same transformation to both parts.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# === MODEL DEFINITIONS ===
# Display name -> unfitted estimator. Insertion order is the evaluation order.
models = {
    "Logistic Regression": LogisticRegression(),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "SVM": SVC(),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Neural Network": MLPClassifier(hidden_layer_sizes=(50, 30), max_iter=500, random_state=42),
}

# === MODEL TRAINING AND EVALUATION ===
# The two tree-based models are trained on the raw features; every other model
# gets the standardized copies.
unscaled_models = ('Random Forest', 'Decision Tree')

for name, model in models.items():
    print(f"\n📊 {name}")

    if name in unscaled_models:
        train_inputs, test_inputs = X_train, X_test
    else:
        train_inputs, test_inputs = X_train_scaled, X_test_scaled

    model.fit(train_inputs, y_train)
    y_pred = model.predict(test_inputs)

    print("Accuracy:", accuracy_score(y_test, y_pred))
    print(classification_report(y_test, y_pred))


In [None]:
# === IMPORTS ===
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import (
    accuracy_score, mean_squared_error, r2_score,
    classification_report, confusion_matrix,
    ConfusionMatrixDisplay, roc_curve, auc
)
import joblib

# === LOAD OR SIMULATE DATA ===
from sklearn.datasets import make_classification

# Simulate data (replace with your cleaned dataset).
# NOTE: this rebinds `cleaned`; remove this section to work on a frame
# prepared elsewhere.
X_dummy, y_dummy = make_classification(n_samples=3000, n_features=10, n_informative=6, n_classes=2, random_state=42)
cleaned = pd.DataFrame(X_dummy, columns=[f'Feature{i}' for i in range(X_dummy.shape[1])])

# The two synthetic weather columns come from a seeded generator so that every
# run produces the same frame, like the seeded classification data above.
# Their length follows the frame instead of repeating the sample count.
rng = np.random.default_rng(42)
cleaned['Humidity9am'] = rng.random(len(cleaned)) * 100  # uniform in [0, 100)
cleaned['Rainfall'] = rng.random(len(cleaned)) * 10      # uniform in [0, 10)
cleaned['RainTomorrow'] = y_dummy

# === 1. FEATURE SELECTION ===
# Show all pairwise correlations, then keep the columns whose absolute
# correlation with RainTomorrow is above 0.1.
plt.figure(figsize=(10, 8))
corr = cleaned.corr()
correlation_ax = sns.heatmap(corr, annot=False, cmap='coolwarm')
correlation_ax.set_title("Correlation Heatmap")
plt.show()

# The target's correlation with itself is removed before thresholding.
target_corr = corr['RainTomorrow'].drop('RainTomorrow')
above_threshold = target_corr.abs() > 0.1
top_features = target_corr.loc[above_threshold].index.tolist()
print("Selected Features:", top_features)

# === 2. SIMPLE LINEAR REGRESSION ===
# Single predictor (Humidity9am) against Rainfall, scored on a 20% hold-out.
print("\n🔹 Simple Linear Regression (Humidity9am → Rainfall)")
y_lin = cleaned['Rainfall']
X_lin = cleaned[['Humidity9am']]  # double brackets keep the input 2-D
X_train_lin, X_test_lin, y_train_lin, y_test_lin = train_test_split(
    X_lin, y_lin, test_size=0.2, random_state=42
)

lin_reg = LinearRegression().fit(X_train_lin, y_train_lin)
y_pred_lin = lin_reg.predict(X_test_lin)

simple_mse = mean_squared_error(y_test_lin, y_pred_lin)
print("MSE:", simple_mse)
simple_r2 = r2_score(y_test_lin, y_pred_lin)
print("R²:", simple_r2)

# === 3. MULTIPLE LINEAR REGRESSION ===
print("\n🔹 Multiple Linear Regression (Top Features → Rainfall)")

# top_features was selected by correlation with RainTomorrow and may contain
# 'Rainfall' itself. Using the regression target as one of its own predictors
# makes the fit trivial and the reported scores meaningless, so it is removed
# from the predictor list here (top_features itself is left untouched).
mlr_features = [feature for feature in top_features if feature != 'Rainfall']

if not mlr_features:
    # Nothing left to regress on; skip instead of failing on an empty design matrix.
    print("⚠ Skipped: no predictor other than 'Rainfall' was selected.")
else:
    X_mlr = cleaned[mlr_features]
    y_mlr = cleaned['Rainfall']
    # 80/20 split with a fixed seed.
    X_train_mlr, X_test_mlr, y_train_mlr, y_test_mlr = train_test_split(
        X_mlr, y_mlr, test_size=0.2, random_state=42
    )

    mlr = LinearRegression()
    mlr.fit(X_train_mlr, y_train_mlr)
    y_pred_mlr = mlr.predict(X_test_mlr)

    print("MSE:", mean_squared_error(y_test_mlr, y_pred_mlr))
    print("R²:", r2_score(y_test_mlr, y_pred_mlr))

# === 4. CLASSIFICATION MODELS ===
# Label: RainTomorrow. Predictors: the correlation-selected columns.
y = cleaned['RainTomorrow']
X = cleaned[top_features]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Standardize with statistics learned from the training part only, then apply
# the same transformation to both parts.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Display name -> unfitted estimator. SVC is created with probability=True so
# that it offers predict_proba like the other models.
models = {
    "Logistic Regression": LogisticRegression(),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "SVM": SVC(probability=True),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Neural Network": MLPClassifier(hidden_layer_sizes=(50, 30), max_iter=500, random_state=42),
}

# === 5. EVALUATION LOOP ===
# Fit every model, report its test-set metrics and plots, and remember the one
# with the highest test accuracy (the first one wins on a tie).
best_model_name = None
best_model_score = 0
best_model_object = None

# These models are trained on the raw features; all others use the scaled ones.
tree_model_names = ['Random Forest', 'Decision Tree']

for name, model in models.items():
    print(f"\n📊 {name}")

    is_tree_model = name in tree_model_names
    if is_tree_model:
        fit_inputs, eval_inputs = X_train, X_test
    else:
        fit_inputs, eval_inputs = X_train_scaled, X_test_scaled

    model.fit(fit_inputs, y_train)
    y_pred = model.predict(eval_inputs)

    # Scores for the ROC curve: the positive-class probability when the model
    # offers predict_proba, otherwise the raw decision function.
    if hasattr(model, "predict_proba"):
        y_proba = model.predict_proba(eval_inputs)[:, 1]
    else:
        y_proba = model.decision_function(eval_inputs)

    accuracy = accuracy_score(y_test, y_pred)
    print("Accuracy:", accuracy)
    print(classification_report(y_test, y_pred))

    # Track the best model seen so far (strictly better accuracy only).
    if accuracy > best_model_score:
        best_model_name = name
        best_model_object = model
        best_model_score = accuracy

    # Confusion matrix
    cm = confusion_matrix(y_test, y_pred)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=[0, 1])
    disp.plot(cmap='Blues')
    plt.title(f"Confusion Matrix: {name}")
    plt.show()

    # ROC curve with the chance diagonal as reference
    fpr, tpr, _ = roc_curve(y_test, y_proba)
    roc_auc = auc(fpr, tpr)

    plt.figure(figsize=(6, 4))
    plt.plot(fpr, tpr, label=f'AUC = {roc_auc:.2f}')
    plt.plot([0, 1], [0, 1], 'k--')
    plt.title(f'ROC Curve: {name}')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.legend(loc='lower right')
    plt.grid(True)
    plt.show()

    # Feature importances exist for the tree-based models only
    if not is_tree_model:
        continue

    importance = model.feature_importances_
    ranking = np.argsort(importance)[::-1]  # most important first
    sorted_features = np.array(top_features)[ranking]
    sorted_importance = importance[ranking]

    plt.figure(figsize=(8, 5))
    sns.barplot(x=sorted_importance, y=sorted_features)
    plt.title(f'Feature Importance: {name}')
    plt.xlabel("Importance")
    plt.ylabel("Feature")
    plt.tight_layout()
    plt.show()

# === 6. SAVE BEST MODEL ===
# Compare against None explicitly: the truth value of an estimator is not a
# reliable "was a model selected" signal (ensembles, for instance, define a
# length).
if best_model_object is not None:
    file_stem = best_model_name.replace(" ", "_")
    filename = f'{file_stem}_best_model.pkl'
    joblib.dump(best_model_object, filename)
    print(f"\n💾 Best model saved: {best_model_name} with accuracy = {best_model_score:.4f}")
    print(f"File saved as: {filename}")

    # Every model except the tree-based ones was fitted on standardized
    # inputs. Store the fitted scaler next to such a model so that new, raw
    # data can be transformed the same way before calling predict(). The model
    # file itself is unchanged.
    if best_model_name not in ['Random Forest', 'Decision Tree']:
        scaler_filename = f'{file_stem}_scaler.pkl'
        joblib.dump(scaler, scaler_filename)
        print(f"Scaler saved as: {scaler_filename}")
