# Importing Required Libraries
We import pandas and numpy for data manipulation, matplotlib and seaborn for visualization, and scikit-learn (sklearn) for splitting the data, training the model, and evaluating it.

In [None]:
# Importing Required Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# Loading the Dataset
The dataset is loaded using pandas, and its structure is briefly explored to understand the data.

In [None]:
# Loading the Dataset
# Read the raw CSV into a DataFrame; the trailing expression previews the
# first rows when the cell is run interactively.
DATASET_PATH = "DepressionData.csv"
df = pd.read_csv(DATASET_PATH)
df.head()

# Exploring the Dataset
Having previewed the first rows with `df.head()`, we use `df.info()` and `df.describe()` to explore the dataset's structure (column types and non-null counts) and the summary statistics of its columns.

In [None]:
# Exploring the Dataset
# info() prints a concise summary of the DataFrame to stdout.
df.info()
# describe() computes summary statistics; as the last expression of the cell
# its result is shown only when run interactively (e.g. in a notebook).
df.describe()

# Handling Missing Values
Rows containing missing values are removed with `dropna()`, and `isnull().sum()` is then used to confirm that no missing values remain in any column. This ensures the dataset is clean and ready for analysis.

In [None]:
# Handling Missing Values
# Discard every row that has at least one missing value, then show the
# per-column count of missing values that remain.
df = df.dropna()
df.isna().sum()

# Encoding Categorical Variables
The categorical `City` column is mapped to integer codes using a dictionary and the `map()` function. This step is crucial because machine learning models require numerical input; any other categorical column used as a feature must be encoded in the same way.

In [None]:
# Encoding Categorical Variables
# Give each distinct city an integer code, numbered from 1 in the order the
# cities are returned by unique(), then replace the city names by their codes.
mapping_value = {
    city: code for code, city in enumerate(df["City"].unique(), start=1)
}
df["City"] = df["City"].map(mapping_value)

# Feature Engineering
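A new feature, `total Stress`, is created as the sum of `Academic Pressure`, `Work Pressure`, and `Financial Stress`. The existing `Sleep Duration` feature is transformed from text into a single number: the numbers in each entry are extracted, and a range such as "5-6" is replaced by its midpoint. These steps aim to enhance the dataset's predictive power.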

In [None]:
# Feature Engineering
# Combine the three pressure/stress columns into one aggregate feature by
# element-wise addition.
df["total Stress"] = (
    df["Academic Pressure"]
    .add(df["Work Pressure"])
    .add(df["Financial Stress"])
)

def extract_numbers(input_string, as_int=True):
    """Return every number embedded in a piece of text, in order of appearance.

    Args:
        input_string: Text to scan, e.g. ``"5-6 hours"``. A missing value
            (``None`` or a float NaN) is treated as text with no numbers.
        as_int: When true (the default), each run of digits is returned as an
            ``int``. When false, each run of digits with an optional
            fractional part is returned as a ``float``.

    Returns:
        A list of the numbers found; empty when there are none.

    Raises:
        TypeError: If ``input_string`` is neither a string nor a missing
            value.
    """
    import math
    import re

    # A missing cell has no text to scan; report "no numbers" instead of
    # letting re.findall raise TypeError on a non-string.
    if input_string is None or (
        isinstance(input_string, float) and math.isnan(input_string)
    ):
        return []

    if as_int:
        # NOTE: only digit runs are matched in this mode, so a decimal such
        # as "7.5" yields two integers, [7, 5].
        pattern, convert = r'\d+', int
    else:
        pattern, convert = r'\d+\.?\d*', float
    return [convert(token) for token in re.findall(pattern, input_string)]

# Replace each raw "Sleep Duration" entry by the list of integers found in it
# (extract_numbers is called with its default as_int=True).
df["Sleep Duration"] = df["Sleep Duration"].apply(extract_numbers)

def calculate_midpoint(numbers):
    """Collapse a list of one or two numbers into a single value.

    Args:
        numbers: Indexable sequence of numbers.

    Returns:
        The sole element when there is exactly one number, the arithmetic
        mean of the pair when there are exactly two, and ``None`` for any
        other length.
    """
    count = len(numbers)
    if count == 2:
        first, second = numbers[0], numbers[1]
        return (first + second) / 2
    if count == 1:
        return numbers[0]
    # Any other length (empty, or three or more values) has no midpoint here.
    return None
# Reduce each list of extracted numbers to one value: the number itself for a
# single entry, the midpoint for a pair. Lists of any other length (including
# empty ones) make calculate_midpoint return None.
df["Sleep Duration"] = df["Sleep Duration"].apply(calculate_midpoint)

# Correlation Analysis
Correlation between features is analyzed using `df.corr()` and visualized using a heatmap with seaborn. This helps identify relationships between variables.

In [None]:
# Correlation Analysis
# Correlation is only defined for numeric data, so restrict the frame to its
# numeric/boolean columns first: calling corr() on a frame that still holds
# text columns raises an error in pandas >= 2.0.
numeric_df = df.select_dtypes(include=["number", "bool"])

plt.figure(figsize=(10, 10))
sns.heatmap(numeric_df.corr(), annot=True, fmt='.1f', cmap='viridis')
plt.title("Heatmap of the Correlation")
# Write the image to disk before displaying the figure.
plt.savefig("HeatMap.jpg")
plt.show()

# Building the Machine Learning Model
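The selected features and the `Depression` target are split into training (80%) and testing (20%) sets. A `RandomForestClassifier` with balanced class weights is trained on the training set, and predictions are made for the test set.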

In [None]:
# Building the Machine Learning Model
# Columns used as model inputs.
# NOTE(review): several of these look categorical by name; confirm they are
# numeric by the time the model is fitted.
feature_columns = [
    'total Stress',
    'Sleep Duration',
    'Dietary Habits',
    'Family History of Mental Illness',
    'Job Satisfaction',
    'Age',
    'Profession',
    'Have you ever had suicidal thoughts ?',
]
X = df[feature_columns]
Y = df['Depression']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)

# Fit a random forest with balanced class weights, then predict the held-out
# rows.
model = RandomForestClassifier(class_weight='balanced', random_state=42).fit(
    X_train, y_train
)
y_pred = model.predict(X_test)

# Evaluating the Model
The model is evaluated using metrics like `accuracy_score`, `classification_report`, and a confusion matrix.

In [None]:
# Evaluating the Model
# Report overall accuracy and the per-class classification report for the
# held-out predictions.
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
report = classification_report(y_test, y_pred)
print("Classification Report:\n", report)

# Confusion Matrix
# Tick labels shared by both axes.
# NOTE(review): assumes the sorted target labels correspond to
# (no depression, depression) in that order; confirm against the data.
class_names = ['No Depression', 'Depression']
cm = confusion_matrix(y_test, y_pred)

plt.figure(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
)
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.title('Confusion Matrix')
plt.show()

# Saving and Reloading the Processed Data
The processed data is saved to a CSV file and reloaded for further analysis.

In [None]:
# Saving and Reloading the Processed Data
# Persist the processed frame without its index column, read it back from the
# same file, and preview the first rows.
PROCESSED_PATH = "NewData.csv"
df.to_csv(PROCESSED_PATH, index=False)
df = pd.read_csv(PROCESSED_PATH)
df.head()