**ML ASSIGNMENT**

In [5]:
# ✅ Install libraries if needed (for Google Colab only)
# !pip install pandas scikit-learn plotly

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer


In [6]:
# -----------------------------
# Load the dataset
# -----------------------------
# Location of the CSV to analyse; replace with the path to your own CSV file.
csv_path = "/content/Energy_deficit.csv"
df = pd.read_csv(csv_path)

# Print the first rows and the column names so the columns that get
# auto-selected further down can be sanity-checked by eye.
print("\nüìä Preview of the Dataset:", df.head(), sep="\n")
print("\nüîé Columns Available:", df.columns.tolist())

# -----------------------------
# Auto-detect text + target columns
# -----------------------------
# Heuristic: every object-dtype column counts as "text-like", and the first one
# in column order becomes the model input.
# NOTE(review): on a tabular dataset this can select a low-cardinality
# categorical rather than free text -- check the column printed below and
# override `text_column` by hand if it is not the one you want.
text_cols = df.select_dtypes(include="object").columns.tolist()

if not text_cols:
    # ValueError is a subclass of Exception, so `except Exception` still catches it.
    raise ValueError("‚ùå No text column found. Ensure dataset contains text.")

print("\nüìå Text-like columns detected:", text_cols)

text_column = text_cols[0]  # first text-like column, in dataframe column order
print(f"‚úÖ Selected text column ‚Üí {text_column}")

# A usable label column needs at least two distinct values and at most
# MAX_TARGET_CLASSES. nunique() ignores NaN by default, so a constant or
# all-missing column reports 1 or 0 here; such a column previously slipped
# through the "<= 20" filter and then failed inside the classifier.
MIN_TARGET_CLASSES = 2
MAX_TARGET_CLASSES = 20
target_candidates = [
    col
    for col in df.columns
    if col != text_column
    and MIN_TARGET_CLASSES <= df[col].nunique() <= MAX_TARGET_CLASSES
]

if not target_candidates:
    raise ValueError("‚ùå No suitable categorical label found. Please preprocess dataset.")

target_column = target_candidates[0]  # first qualifying column, in dataframe column order
print(f"‚úÖ Selected target column ‚Üí {target_column}")

# -----------------------------
# Encode target and select the text feature
# -----------------------------
# Rows with a missing label carry no supervision. Drop them rather than letting
# astype(str) turn NaN into a spurious "nan" class.
has_label = df[target_column].notna()

label_enc = LabelEncoder()
# astype(str) hands LabelEncoder a single type even if the column mixes
# numbers and strings.
y = label_enc.fit_transform(df.loc[has_label, target_column].astype(str))

# TfidfVectorizer raises on NaN documents, so missing text becomes an empty
# string and any non-string object is coerced to str.
X = df.loc[has_label, text_column].fillna("").astype(str)

# -----------------------------
# Train/test split
# -----------------------------
TEST_SIZE = 0.2

# A stratified split raises ValueError when some class has fewer than two
# samples, or when the test fold is smaller than the number of classes.
# Detect both cases up front and fall back to a plain shuffled split.
_, class_sizes = np.unique(y, return_counts=True)
n_test = int(np.ceil(TEST_SIZE * len(y)))
can_stratify = (
    class_sizes.size > 0
    and class_sizes.min() >= 2
    and n_test >= class_sizes.size
)
if not can_stratify:
    print("Warning: classes too small to stratify; using a non-stratified split.")

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=TEST_SIZE,
    random_state=42,
    stratify=y if can_stratify else None,
)

# Maps model display name -> {"Accuracy": float, "Macro F1 Score": float}.
results = {}


def _tfidf_pipeline(classifier):
    """Return a TF-IDF -> classifier pipeline.

    Both models share the same vectorizer settings, so they are defined once
    here instead of being copy-pasted per model.
    """
    return Pipeline([
        ("tfidf", TfidfVectorizer(max_features=5000, stop_words="english")),
        ("clf", classifier),
    ])


def _fit_and_score(pipeline):
    """Fit `pipeline` on the training split and evaluate it on the test split.

    Reads the module-level X_train, y_train, X_test and y_test.
    Returns a (predictions, metrics) tuple, where metrics holds the accuracy
    and the macro-averaged F1 score on the test split.
    """
    pipeline.fit(X_train, y_train)
    predictions = pipeline.predict(X_test)
    metrics = {
        "Accuracy": accuracy_score(y_test, predictions),
        "Macro F1 Score": f1_score(y_test, predictions, average="macro"),
    }
    return predictions, metrics


# -----------------------------
# Logistic Regression model
# -----------------------------
log_reg_pipeline = _tfidf_pipeline(LogisticRegression(max_iter=2000))
log_pred, results["Logistic Regression"] = _fit_and_score(log_reg_pipeline)

# -----------------------------
# Random Forest model
# -----------------------------
rf_pipeline = _tfidf_pipeline(
    RandomForestClassifier(n_estimators=300, random_state=42)
)
rf_pred, results["Random Forest"] = _fit_and_score(rf_pipeline)

# -----------------------------
# Final results
# -----------------------------
print("\nüìà Model Performance Results:\n")
for model_name, scores in results.items():
    # One header line per model, followed by its metrics to four decimals.
    report_lines = [f"üîπ {model_name}"]
    for label, score in scores.items():
        report_lines.append(f"   {label}: {score:.4f}")
    print("\n".join(report_lines))

# The winner is the model with the highest macro F1; on a tie, max() keeps the
# entry that was inserted into `results` first.
best_model, _ = max(results.items(), key=lambda entry: entry[1]["Macro F1 Score"])
print(f"\nüèÜ Best Model ‚Üí {best_model}")



üìä Preview of the Dataset:
  Region                  State  is_union_territory Month Quarter  \
0  North             Chandigarh                   1   Mar      Q1   
1  North                  Delhi                   1   Mar      Q1   
2  North                Haryana                   0   Mar      Q1   
3  North       Himachal Pradesh                   0   Mar      Q1   
4  North  UTs of J&K and Ladakh                   1   Mar      Q1   

   Energy Requirement MU  Energy Availability MU  Energy_Deficit  
0                101.600                 101.600           0.000  
1               2130.478                2130.279           0.199  
2               3911.822                3909.160           2.662  
3               1025.630                1023.160           2.470  
4               1725.610                1717.930           7.680  

üîé Columns Available: ['Region', 'State', 'is_union_territory', 'Month', 'Quarter', 'Energy Requirement MU', 'Energy Availability MU', 'Energy_Deficit