# Final Portfolio Project 2026
## 5CS037 - Concepts and Technologies of AI

**Student Name:** Biplov Maharjan  
**Student ID:** 2462258  

---

This notebook contains the complete implementation for the Final Portfolio Project, covering both Classification and Regression tasks.

## Table of Contents
1. [Setup and Initialization](#setup)
2. [Classification Task](#classification)
3. [Regression Task](#regression)


## 1. Setup and Initialization <a id='setup'></a>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder, MinMaxScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.svm import SVC, SVR
from sklearn.feature_selection import RFE

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam

# Global figure styling shared by every plot in this notebook
sns.set_theme(style="whitegrid")
plt.rcParams.update({'figure.figsize': (12, 8)})

print("Libraries imported successfully.")

### Load Datasets

In [None]:
# Define paths (relative to the notebook's working directory)
class_data_path = r'classification/student_depression_dataset.csv'
reg_data_path = r'regression/avocado.csv'


def load_dataset(path, label, drop_unnamed_index=False):
    """Read one CSV file into a DataFrame and show a short preview.

    Parameters
    ----------
    path : str
        Location of the CSV file.
    label : str
        Human-readable dataset name used in the printed messages.
    drop_unnamed_index : bool, default False
        When True, remove an 'Unnamed: 0' column if the file contains one.

    Returns
    -------
    pandas.DataFrame or None
        The loaded frame, or None when the file does not exist.
    """
    try:
        frame = pd.read_csv(path)
    except FileNotFoundError:
        print(f"{label} dataset not found. Please check the path.")
        return None
    if drop_unnamed_index and 'Unnamed: 0' in frame.columns:
        frame = frame.drop('Unnamed: 0', axis=1)
    print(f"{label} Dataset Loaded: {frame.shape}")
    display(frame.head())
    return frame


# Load data. Both loads are attempted so that one missing file does not hide
# the status of the other.
df_class = load_dataset(class_data_path, "Classification")
df_reg = load_dataset(reg_data_path, "Regression", drop_unnamed_index=True)

# Stop here if anything is missing: continuing would only fail later with an
# unrelated error (or silently reuse a stale frame from an earlier run).
missing_paths = [
    path
    for path, frame in ((class_data_path, df_class), (reg_data_path, df_reg))
    if frame is None
]
if missing_paths:
    raise FileNotFoundError(f"Missing dataset file(s): {', '.join(missing_paths)}")

## 2. Classification Task <a id='classification'></a>

- **Goal:** Predict whether a student is suffering from depression based on various academic and lifestyle factors.
- **Target Variable:** `Depression` (0 or 1)
- **UNSDG Alignment:** Goal 3 - Good Health and Well-being.

### 2.1 Exploratory Data Analysis (EDA)

In [None]:
# Check for missing values and data types.
# DataFrame.info() writes its report to stdout and returns None, so it is
# called directly; wrapping it in print() would emit a stray "None" line.
df_class.info()
print("\nMissing Values:\n", df_class.isnull().sum())

In [None]:
# Class balance of the target variable
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(data=df_class, x='Depression', ax=ax)
ax.set_title('Distribution of Depression (Target Variable)')
plt.show()

In [None]:
# Histogram (with KDE overlay) of student ages
fig, ax = plt.subplots(figsize=(8, 5))
sns.histplot(df_class['Age'], bins=20, kde=True, ax=ax)
ax.set_title('Age Distribution of Students')
plt.show()

In [None]:
# Counts per sleep-duration category, split by depression label
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=df_class, x='Sleep Duration', hue='Depression', ax=ax)
ax.set_title('Sleep Duration vs Depression')
# Category labels are long, so tilt them to avoid overlap
plt.xticks(rotation=45)
plt.show()

In [None]:
# Spread of academic pressure scores for each depression label
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=df_class, x='Depression', y='Academic Pressure', ax=ax)
ax.set_title('Academic Pressure vs Depression')
plt.show()

### 2.2 Data Preprocessing

In [None]:
# Drop irrelevant or ID columns.
# 'id' is a unique identifier; 'City' might be high cardinality, so it is
# dropped for simplicity. errors='ignore' keeps the cell working if a column
# is already absent.
df_class_clean = df_class.drop(columns=['id', 'City'], errors='ignore')

# Ordinal encoding for 'Sleep Duration' and 'Dietary Habits': the integers
# follow the natural order of the categories; 'Others' takes the middle value.
sleep_map = {'Less than 5 hours': 0, '5-6 hours': 1, '7-8 hours': 2, 'More than 8 hours': 3, 'Others': 1}
diet_map = {'Unhealthy': 0, 'Moderate': 1, 'Healthy': 2, 'Others': 1}

for col, mapping in (('Sleep Duration', sleep_map), ('Dietary Habits', diet_map)):
    # Strip surrounding whitespace and quote characters before the lookup so
    # that values such as "'5-6 hours'" still match their key.
    # NOTE(review): whether the raw CSV actually wraps values in quotes is not
    # visible here - confirm with df_class[col].unique().
    normalised = df_class_clean[col].astype(str).str.strip(" '\"")
    encoded = normalised.map(mapping)
    # Unknown or missing categories fall back to 1; report how many so the
    # fallback cannot silently flatten the whole column.
    n_unmapped = int(encoded.isna().sum())
    if n_unmapped:
        print(f"{col}: {n_unmapped} value(s) not in the mapping were set to the default 1")
    df_class_clean[col] = encoded.fillna(1)

# Label encoding for the remaining non-numeric columns. Selecting by
# "not numeric / not bool" rather than include=['object'] also picks up
# pandas' dedicated string dtype (which 'object' may miss depending on the
# pandas version - TODO confirm for the installed version).
cat_cols = df_class_clean.select_dtypes(exclude=['number', 'bool']).columns

# One encoder per column, kept in a dict, so every mapping can be inspected or
# inverted later (a single shared encoder only remembers the last column).
label_encoders = {}
for col in cat_cols:
    le = LabelEncoder()
    df_class_clean[col] = le.fit_transform(df_class_clean[col].astype(str))
    label_encoders[col] = le

print("Data Encoded Successfully")
display(df_class_clean.head())

In [None]:
# Pairwise correlations between all encoded columns
corr_class = df_class_clean.corr()
fig, ax = plt.subplots(figsize=(14, 10))
sns.heatmap(corr_class, annot=True, fmt='.2f', cmap='coolwarm', ax=ax)
ax.set_title('Correlation Feature Map')
plt.show()

In [None]:
# Separate the features from the target
X = df_class_clean.drop('Depression', axis=1)
y = df_class_clean['Depression']

# Split BEFORE scaling so that test-set statistics never reach the scaler
# (fitting on the full dataset leaks information from the test rows).
# stratify=y keeps the class ratio identical in both partitions.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Fit the scaler on the training rows only, then apply it to both partitions
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Full feature matrix scaled with the train-fitted scaler; retained only so the
# name X_scaled stays defined. Do not use it for training or evaluation.
X_scaled = scaler.transform(X)

print("Training Shape:", X_train.shape)
print("Testing Shape:", X_test.shape)

### 2.3 Task 1: Neural Network Model

In [None]:
# Fix the random seeds so weight initialisation, dropout masks and batch
# shuffling are repeatable, in line with the random_state=42 used by the
# scikit-learn models in this notebook.
tf.keras.utils.set_random_seed(42)

# Build Model: two hidden ReLU layers with dropout, and a single sigmoid unit
# that outputs the probability of the positive class.
model_nn = Sequential([
    Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
    Dropout(0.2),
    Dense(32, activation='relu'),
    Dropout(0.2),
    Dense(1, activation='sigmoid')
])

# Compile
model_nn.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])

# Train (20% of the training rows are held out for validation each epoch)
history = model_nn.fit(X_train, y_train, epochs=20, batch_size=32, validation_split=0.2, verbose=1)

# Evaluate
loss, acc = model_nn.evaluate(X_test, y_test)
print(f"\nNeural Network Test Accuracy: {acc:.4f}")

# Predictions for Metrics: threshold the probabilities at 0.5 and flatten the
# (n, 1) output to a 1-D label vector, the shape the metric functions expect.
y_pred_nn = (model_nn.predict(X_test) > 0.5).astype("int32").ravel()
print("\nClassification Report (Neural Network):")
print(classification_report(y_test, y_pred_nn))

In [None]:
# Training vs. validation accuracy per epoch
fig, ax = plt.subplots(figsize=(10, 5))
for metric_key, curve_label in (('accuracy', 'Train Accuracy'), ('val_accuracy', 'Val Accuracy')):
    ax.plot(history.history[metric_key], label=curve_label)
ax.set_title('Neural Network Training History')
ax.set_xlabel('Epochs')
ax.set_ylabel('Accuracy')
ax.legend()
plt.show()

### 2.4 Task 2: Classical Machine Learning Models
We will use **Logistic Regression** and **Random Forest Classifier** as our two primary models.

In [None]:
# 1. Logistic Regression - fit on the training split, score on the test split
log_reg = LogisticRegression(random_state=42).fit(X_train, y_train)
y_pred_log = log_reg.predict(X_test)

acc_log = accuracy_score(y_test, y_pred_log)
report_log = classification_report(y_test, y_pred_log)

print("Logistic Regression Performance:")
print(f"Accuracy: {acc_log:.4f}")
print(report_log)

In [None]:
# 2. Random Forest Classifier - 100 trees, fixed seed
rf_clf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)
y_pred_rf = rf_clf.predict(X_test)

acc_rf = accuracy_score(y_test, y_pred_rf)
report_rf = classification_report(y_test, y_pred_rf)

print("Random Forest Performance:")
print(f"Accuracy: {acc_rf:.4f}")
print(report_rf)

### 2.5 Hyperparameter Optimization
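We tune the Random Forest because it exposes more influential hyperparameters (number of trees, tree depth, minimum split size) than Logistic Regression and typically performs better on tabular data; compare the accuracies reported above to confirm that this holds for this dataset.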

In [None]:
# Candidate values the randomized search samples from
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
)

# Randomized search tries only n_iter sampled combinations, which is faster
# than exhaustively evaluating the full grid.
search_settings = dict(n_iter=10, cv=3, verbose=1, random_state=42, n_jobs=-1)
rf_random = RandomizedSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_distributions=param_grid,
    **search_settings,
)
rf_random.fit(X_train, y_train)

best_rf = rf_random.best_estimator_
print("Best Parameters:", rf_random.best_params_)

# Score the best estimator found by the search on the held-out test split
y_pred_best_rf = best_rf.predict(X_test)
acc_best_rf = accuracy_score(y_test, y_pred_best_rf)
print("\nOptimized Random Forest Accuracy:", acc_best_rf)

### 2.6 Feature Selection
Using Recursive Feature Elimination (RFE) to select the most important features.

In [None]:
# Recursive feature elimination with a 50-tree forest as the ranking estimator
rfe = RFE(
    estimator=RandomForestClassifier(random_state=42, n_estimators=50),
    n_features_to_select=10,
).fit(X_train, y_train)

# Pair every feature name with its keep/drop flag, then show only the kept ones
selected_features = pd.DataFrame({'Feature': X.columns, 'Selected': rfe.support_})
print("Top 10 Selected Features:")
print(selected_features[selected_features['Selected']])

# Retrain with Selected Features: reduce both splits to the kept columns
X_train_rfe, X_test_rfe = rfe.transform(X_train), rfe.transform(X_test)

rf_rfe = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train_rfe, y_train)
y_pred_rfe = rf_rfe.predict(X_test_rfe)

acc_rfe = accuracy_score(y_test, y_pred_rfe)
print("\nRandom Forest with 10 Selected Features Accuracy:", acc_rfe)

## 3. Regression Task <a id='regression'></a>

- **Goal:** Predict the average price of avocados based on features like volume, bag type, and region.
- **Target Variable:** `AveragePrice` (Continuous)
- **UNSDG Alignment:** Goal 12 - Responsible Consumption and Production.

### 3.1 Exploratory Data Analysis (EDA)

In [None]:
# Check column types and missing values.
# DataFrame.info() writes its report to stdout and returns None, so it is
# called directly; wrapping it in print() would emit a stray "None" line.
df_reg.info()
print("\nMissing Values:\n", df_reg.isnull().sum())

In [None]:
# Histogram (with KDE overlay) of the regression target
fig, ax = plt.subplots(figsize=(8, 5))
sns.histplot(df_reg['AveragePrice'], bins=30, kde=True, color='green', ax=ax)
ax.set_title('Distribution of Avocado Prices')
plt.show()

In [None]:
# Parse Date into datetimes (stored back on df_reg; the preprocessing cell
# reads it through the .dt accessor), then plot price over time per type.
df_reg['Date'] = pd.to_datetime(df_reg['Date'])

fig, ax = plt.subplots(figsize=(12, 6))
sns.lineplot(data=df_reg, x='Date', y='AveragePrice', hue='type', ax=ax)
ax.set_title('Avocado Price Trend Over Time')
plt.show()

In [None]:
# Price spread for each avocado type
fig, ax = plt.subplots(figsize=(8, 5))
sns.boxplot(data=df_reg, x='type', y='AveragePrice', ax=ax)
ax.set_title('Price Comparison by Type')
plt.show()

### 3.2 Data Preprocessing

In [None]:
# Extract the month from Date. pd.to_datetime leaves an already-converted
# column unchanged, so this cell does not depend on an earlier cell having
# parsed the dates first.
df_reg['Month'] = pd.to_datetime(df_reg['Date']).dt.month

# Drop Date and ID-like columns (Unnamed already dropped)
df_reg_clean = df_reg.drop(columns=['Date'], errors='ignore')

# Label Encode Categorical Variables (type, region). One encoder per column is
# kept in a dict so each mapping can be inspected or inverted later (a single
# shared encoder only remembers the last column it was fitted on).
label_encoders_reg = {}
for col in ['type', 'region']:
    le_reg = LabelEncoder()
    df_reg_clean[col] = le_reg.fit_transform(df_reg_clean[col])
    label_encoders_reg[col] = le_reg

# Correlation Matrix
plt.figure(figsize=(12, 8))
sns.heatmap(df_reg_clean.corr(), annot=True, fmt='.2f', cmap='Greens')
plt.title('Regression Feature Correlation')
plt.show()

# Separate the features from the target
X_reg = df_reg_clean.drop('AveragePrice', axis=1)
y_reg = df_reg_clean['AveragePrice']

# Split BEFORE scaling so that test-set statistics never reach the scaler
# (fitting on the full dataset leaks information from the test rows).
X_train_reg_raw, X_test_reg_raw, y_train_reg, y_test_reg = train_test_split(
    X_reg, y_reg, test_size=0.2, random_state=42
)

# Fit the scaler on the training rows only, then apply it to both partitions
scaler_reg = StandardScaler()
X_train_reg = scaler_reg.fit_transform(X_train_reg_raw)
X_test_reg = scaler_reg.transform(X_test_reg_raw)

# Full feature matrix scaled with the train-fitted scaler; retained only so the
# name X_reg_scaled stays defined. Do not use it for training or evaluation.
X_reg_scaled = scaler_reg.transform(X_reg)

print("Regression Train Shape:", X_train_reg.shape)
print("Regression Test Shape:", X_test_reg.shape)

### 3.3 Task 1: Neural Network Regression Model

In [None]:
# Fix the random seeds so weight initialisation, dropout masks and batch
# shuffling are repeatable, in line with the random_state=42 used by the
# scikit-learn models in this notebook.
tf.keras.utils.set_random_seed(42)

# Build Model: three hidden ReLU layers and a single linear output unit
model_nn_reg = Sequential([
    Dense(128, activation='relu', input_shape=(X_train_reg.shape[1],)),
    Dropout(0.2),
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(1)  # Linear activation for regression
])

# Compile: optimise mean squared error, also track mean absolute error
model_nn_reg.compile(optimizer='adam', loss='mse', metrics=['mae'])

# Train (20% of the training rows are held out for validation each epoch)
history_reg = model_nn_reg.fit(X_train_reg, y_train_reg, epochs=20, batch_size=32, validation_split=0.2, verbose=1)

# Evaluate
loss_reg, mae_reg = model_nn_reg.evaluate(X_test_reg, y_test_reg)
print(f"\nNeural Network Test MAE: {mae_reg:.4f}")

# Predictions: flatten the (n, 1) network output to a 1-D vector so it has the
# same shape as y_test_reg.
y_pred_nn_reg = model_nn_reg.predict(X_test_reg).ravel()
print(f"R2 Score (NN): {r2_score(y_test_reg, y_pred_nn_reg):.4f}")

### 3.4 Task 2: Classical Regression Models
We will use **Linear Regression** and **Random Forest Regressor**.

In [None]:
# 1. Linear Regression - fit on the training split, score on the test split
lin_reg = LinearRegression().fit(X_train_reg, y_train_reg)
y_pred_lin = lin_reg.predict(X_test_reg)

mae_lin = mean_absolute_error(y_test_reg, y_pred_lin)
rmse_lin = np.sqrt(mean_squared_error(y_test_reg, y_pred_lin))
r2_lin = r2_score(y_test_reg, y_pred_lin)

print("Linear Regression Performance:")
print(f"MAE: {mae_lin:.4f}")
print(f"RMSE: {rmse_lin:.4f}")
print(f"R2 Score: {r2_lin:.4f}")

In [None]:
# 2. Random Forest Regressor - 100 trees, fixed seed
rf_reg = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train_reg, y_train_reg)
y_pred_rf_reg = rf_reg.predict(X_test_reg)

mae_rf_reg = mean_absolute_error(y_test_reg, y_pred_rf_reg)
rmse_rf_reg = np.sqrt(mean_squared_error(y_test_reg, y_pred_rf_reg))
r2_rf_reg = r2_score(y_test_reg, y_pred_rf_reg)

print("Random Forest Regressor Performance:")
print(f"MAE: {mae_rf_reg:.4f}")
print(f"RMSE: {rmse_rf_reg:.4f}")
print(f"R2 Score: {r2_rf_reg:.4f}")

### 3.5 Hyperparameter Optimization
Optimizing Random Forest Regressor.

In [None]:
# Candidate values the randomized search samples from
param_grid_reg = dict(
    n_estimators=[50, 100],
    max_depth=[None, 10, 20],
    min_samples_split=[2, 5],
)

# Randomized search over the regressor's settings (5 sampled combinations, 3-fold CV)
search_settings_reg = dict(n_iter=5, cv=3, verbose=1, random_state=42, n_jobs=-1)
rf_random_reg = RandomizedSearchCV(
    estimator=RandomForestRegressor(random_state=42),
    param_distributions=param_grid_reg,
    **search_settings_reg,
)
rf_random_reg.fit(X_train_reg, y_train_reg)

best_rf_reg = rf_random_reg.best_estimator_
print("Best Regression Parameters:", rf_random_reg.best_params_)

# Score the best estimator found by the search on the held-out test split
y_pred_best_reg = best_rf_reg.predict(X_test_reg)
r2_best_reg = r2_score(y_test_reg, y_pred_best_reg)
print(f"\nOptimized Random Forest R2: {r2_best_reg:.4f}")

### 3.6 Feature Selection
Selecting top features for Regression.

In [None]:
# Recursive feature elimination with a 50-tree forest as the ranking estimator
rfe_reg = RFE(
    estimator=RandomForestRegressor(random_state=42, n_estimators=50),
    n_features_to_select=8,
).fit(X_train_reg, y_train_reg)

# Pair every feature name with its keep/drop flag, then show only the kept ones
selected_features_reg = pd.DataFrame({'Feature': X_reg.columns, 'Selected': rfe_reg.support_})
print("Top 8 Selected Features:")
print(selected_features_reg[selected_features_reg['Selected']])

# Evaluate on Selected Features: reduce both splits to the kept columns
X_train_rfe_reg, X_test_rfe_reg = rfe_reg.transform(X_train_reg), rfe_reg.transform(X_test_reg)

rf_rfe_reg = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train_rfe_reg, y_train_reg)
y_pred_rfe_reg = rf_rfe_reg.predict(X_test_rfe_reg)

r2_rfe_reg = r2_score(y_test_reg, y_pred_rfe_reg)
print(f"\nRandom Forest with Selected Features R2: {r2_rfe_reg:.4f}")

# Conclusion
This notebook implemented complete classification and regression pipelines: data loading, EDA, preprocessing, model building (both Neural Network and classical models), hyperparameter optimization, and feature selection.