In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

In [None]:
# Load the dataset into the working DataFrame that every later cell reads and extends.
# NOTE(review): the file name is spelled 'Deepression' — presumably it matches the file on disk; confirm.
df = pd.read_csv(filepath_or_buffer='data/Deepression.csv')

In [None]:
# PCA for dimensionality reduction
# Columns that this notebook appends to df. They are masked out before slicing so
# that re-running this cell (or running it after the clustering cell) selects the
# same symptom columns as the very first run does.
derived_columns = ['PCA1', 'PCA2', 'Cluster']
source_columns = df.columns[~df.columns.isin(derived_columns)]
# Every column between the first and the last original column is used as a feature.
symptoms = source_columns[1:-1]

# Cast once, then fill each column's gaps with that column's mean. The mean is taken
# from the float-cast values so columns that were read as object dtype are handled too.
symptom_values = df[symptoms].astype(float)
symptom_values = symptom_values.fillna(symptom_values.mean())
df[symptoms] = symptom_values

# Project the features onto their first two principal components.
# random_state only has an effect if scikit-learn picks a randomized SVD solver;
# it is set so the projection is reproducible either way.
pca = PCA(n_components=2, random_state=42)
pca_result = pca.fit_transform(symptom_values)
df['PCA1'] = pca_result[:, 0]
df['PCA2'] = pca_result[:, 1]

# Scatter of the 2-D projection, coloured by the recorded depression state.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(x='PCA1', y='PCA2', hue='Depression State', data=df, palette='viridis', ax=ax)
ax.set_title('PCA of Symptoms')
plt.show()

In [None]:
# Clustering using KMeans
# KMeans starts from randomly chosen centroids, so without a seed the cluster ids
# (and the colours in the plot below) change on every run. The seed matches the one
# used for the train/test split and the classifier. n_init is spelled out because
# its default differs between scikit-learn releases.
kmeans = KMeans(n_clusters=4, n_init=10, random_state=42)
df['Cluster'] = kmeans.fit_predict(df[symptoms].astype(float))

# Show the cluster assignment on the PCA1/PCA2 columns already stored in df.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(x='PCA1', y='PCA2', hue='Cluster', data=df, palette='viridis', ax=ax)
ax.set_title('KMeans Clustering of Symptoms')
plt.show()

In [None]:
# Report how many rows lack a target label, remove those rows, then confirm none remain.
print("Before dropping NaN values in 'Depression State':", df['Depression State'].isnull().sum(), sep='\n')
df = df[df['Depression State'].notna()]
print("After dropping NaN values in 'Depression State':", df['Depression State'].isnull().sum(), sep='\n')

In [None]:
# Predictive modeling
# Integer encoding of the target; dict order fixes the row/column order used in the
# report and the confusion matrix below.
label_to_code = {'No depression': 0, 'Mild': 1, 'Moderate': 2, 'Severe': 3}
class_names = list(label_to_code)
class_codes = list(label_to_code.values())

# Series.map turns every label that is not an exact key into NaN, and the classifier
# refuses NaN targets. Strip surrounding whitespace first so near-miss labels still
# match, then exclude (and report) whatever is still unrecognised.
raw_labels = df['Depression State']
y = raw_labels.astype(str).str.strip().map(label_to_code)
known = y.notna()
if not known.all():
    print(f"Excluding {(~known).sum()} rows with unrecognised 'Depression State' labels:",
          sorted(raw_labels[~known].astype(str).unique()))

X = df.loc[known, symptoms]
y = y[known].astype(int)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

# Passing labels explicitly keeps all four classes in the report and the matrix even
# when one of them is absent from the test split.
print("Classification Report:")
print(classification_report(y_test, y_pred, labels=class_codes,
                            target_names=class_names, zero_division=0))

print("Confusion Matrix:")
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(confusion_matrix(y_test, y_pred, labels=class_codes), annot=True, cmap='viridis', fmt='d',
            xticklabels=class_names, yticklabels=class_names, ax=ax)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
plt.show()