# Import Required Libraries
Import necessary libraries such as pandas, numpy, matplotlib, seaborn, and sklearn.

In [None]:
# Import Required Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# Load and Inspect Dataset
Load the dataset using pandas, display the first few rows, and check the dataset's structure using `df.info()`.

In [None]:
# Load and Inspect Dataset
# Read the CSV into a DataFrame, then show its first rows and column summary.
df = pd.read_csv("DepressionData.csv")
first_rows = df.head()
print(first_rows)
df.info()

# Handle Missing Values
Drop rows with missing values and verify the absence of null values using `df.isnull().sum()`.

In [None]:
# Handle Missing Values
# Remove, in place, every row that has at least one missing entry.
df.dropna(inplace=True)
# Per-column count of nulls still present after the drop.
remaining_nulls = df.isna().sum()
print(remaining_nulls)

# Explore Unique Values
Explore unique values in categorical columns such as 'Financial Stress', 'Work Pressure', 'Job Satisfaction', etc.

In [None]:
# Explore Unique Values
# Print the distinct values of each column of interest, one array per column.
for column_name in ("Financial Stress", "Work Pressure", "Job Satisfaction"):
    print(df[column_name].unique())

# Map Categorical Data
Map categorical data to numerical codes for the 'City', 'Degree', 'Dietary Habits', and 'Gender' columns.

In [None]:
# Map Categorical Data
# City and Degree get codes generated from the data: each distinct value is
# numbered from 1 in the order returned by unique().
city_mapping = {
    city: code for code, city in enumerate(df["City"].unique(), start=1)
}
df["City"] = df["City"].map(city_mapping)

degree_mapping = {
    degree: code for code, degree in enumerate(df["Degree"].unique(), start=1)
}
df["Degree"] = df["Degree"].map(degree_mapping)

# Dietary Habits and Gender use fixed code tables.
dietary_mapping = {
    "Healthy": 1,
    "Moderate": 2,
    "Unhealthy": 3,
    "Others": 4,
}
df["Dietary Habits"] = df["Dietary Habits"].map(dietary_mapping)

gender_mapping = {
    "Male": 1,
    "Female": 2,
}
df["Gender"] = df["Gender"].map(gender_mapping)

# Transform Sleep Duration
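Extract the numbers from each 'Sleep Duration' entry and reduce them to a single value: the number itself when the entry contains one number, or the midpoint when it contains two (e.g. '5-6 hours' becomes 5.5).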

In [None]:
# Transform Sleep Duration
import re

def extract_numbers(input_string):
    """Return every number found in ``input_string`` as a list of floats.

    A number is a run of digits, optionally followed by a decimal point and
    further digits; signs are not captured. The list is empty when the text
    contains no digits.
    """
    return [float(token) for token in re.findall(r"\d+\.?\d*", input_string)]

def calculate_midpoint(numbers):
    """Collapse a list of extracted numbers into one representative float.

    Args:
        numbers: Sequence of numeric values taken from a single entry.

    Returns:
        The value itself when ``numbers`` holds one element, the arithmetic
        midpoint when it holds two, and ``float("nan")`` for any other length
        (including an empty list).
    """
    if len(numbers) == 1:
        return numbers[0]
    if len(numbers) == 2:
        return (numbers[0] + numbers[1]) / 2
    # No single value can be derived from zero or 3+ numbers. Return an
    # explicit float NaN rather than falling through to an implicit None, so
    # the result is always a float and the missing value is marked explicitly.
    return float("nan")

# First turn each entry into the list of numbers it contains, then reduce
# each list to a single value.
sleep_numbers = df["Sleep Duration"].apply(extract_numbers)
df["Sleep Duration"] = sleep_numbers.apply(calculate_midpoint)

# Correlation Analysis
Generate a correlation matrix and visualize it using a heatmap with seaborn.

In [None]:
# Correlation Analysis
# Correlate only numeric/boolean columns. DataFrame.corr() on a frame that
# still holds string columns raises in pandas >= 2.0 (older versions dropped
# them silently), so the selection is made explicit here.
numeric_df = df.select_dtypes(include=["number", "bool"])

plt.figure(figsize=(10, 10))
# annot/fmt print each coefficient in its cell, rounded to one decimal place.
sns.heatmap(numeric_df.corr(), annot=True, fmt=".1f", cmap="viridis")
plt.title("Heatmap of Correlation")
# Write the image file, then display the figure.
plt.savefig("CorrelationHeatmap.jpg")
plt.show()

# Feature Engineering
Create a new feature 'total Stress' by summing up 'Academic Pressure', 'Work Pressure', and 'Financial Stress'.

In [None]:
# Feature Engineering
# Row-wise sum of the three pressure/stress columns, stored as a new column.
df["total Stress"] = (
    df["Academic Pressure"]
    .add(df["Work Pressure"])
    .add(df["Financial Stress"])
)

# Train and Evaluate Model
Split the data into training and testing sets, train a RandomForestClassifier, and evaluate its performance using accuracy and classification report.

In [None]:
# Train and Evaluate Model
# Two input features, one target column.
feature_columns = ["total Stress", "Sleep Duration"]
X = df[feature_columns]
Y = df["Depression"]

# Hold out 20% of the rows for testing; the seed is fixed at 42.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

# fit() returns the fitted estimator, so construction and training chain.
model = RandomForestClassifier(random_state=42).fit(X_train, y_train)

y_pred = model.predict(X_test)

# Report metrics on the held-out rows only.
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
report = classification_report(y_test, y_pred)
print("Classification Report:\n", report)

# Store a prediction for every row of X, which includes the training rows.
df["Depression_Predicted"] = model.predict(X)

# Save and Reload Processed Data
Save the processed dataset to a CSV file and reload it for further analysis or visualization.

In [None]:
# Save and Reload Processed Data
# Write the frame without its index column, then read the same file back
# into df and preview it.
processed_path = "ProcessedDepressionData.csv"
df.to_csv(processed_path, index=False)
df = pd.read_csv(processed_path)
reloaded_preview = df.head()
print(reloaded_preview)