In [None]:
# Step 1: Import EVERYTHING we'll need
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# from scipy import stats
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.cluster import KMeans
# from sklearn.preprocessing import StandardScaler
import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides pandas/seaborn deprecation warnings — confirm that is intended.
warnings.filterwarnings('ignore')

# Apply the matplotlib style sheet first, then set seaborn's default palette afterwards.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Mendeley Data : Dataset Kesehatan Mental berdasarkan DASS-21

In [None]:
# Environment check: print the filesystem path of the matplotlib package in use.
import matplotlib
print(matplotlib.__file__)

In [None]:
# Read the Mendeley CSV (path is relative to the notebook) and preview its first five rows.
data_mendeley = pd.read_csv(filepath_or_buffer='../dataset/mendeley_DASS.csv')
data_mendeley.head(n=5)

In [None]:
# (rows, columns) of the raw Mendeley frame.
data_mendeley.shape

In [None]:
# Summary statistics produced by pandas `describe` for the raw Mendeley frame.
data_mendeley.describe()

In [None]:
# Number of null entries in each column of the raw Mendeley frame.
data_mendeley.isnull().sum()

In [None]:
# dtype of every column as inferred when the CSV was read.
data_mendeley.dtypes

In [None]:
# Raw questionnaire headers -> readable column names (consumed by DataFrame.rename).
rename_dict = {
    # Demographic questions Q1_1 .. Q1_6
    'Q1_1': 'Age',
    'Q1_2': 'Gender',
    'Q1_3': 'Marital_Status',
    'Q1_4': 'Educational_Status',
    'Q1_5': 'Occupational_Status',
    'Q1_6': 'Sleeping_Problem',

    # The three subscales share one layout: seven consecutively numbered Q3
    # items (Stress starts at Q3_1, Anxiety at Q3_8, Depression at Q3_15),
    # followed by that subscale's score and level columns.
    **{
        raw_name: readable_name
        for first_item, letter, scale in (
            (1, 'S', 'Stress'),
            (8, 'A', 'Anxiety'),
            (15, 'D', 'Depression'),
        )
        for raw_name, readable_name in (
            *[
                (f'Q3_{first_item + k}_{letter}{k + 1}', f'{scale}_Item{k + 1}')
                for k in range(7)
            ],
            (f'{scale}_Score', f'{scale}_Total_Score'),
            (f'{scale}_Level', f'{scale}_Level_Category'),
        )
    },
}

In [None]:
# `rename` returns a new frame, so `data_mendeley` itself keeps its raw headers.
data_renamed = data_mendeley.rename(rename_dict, axis="columns")

In [None]:
# Show the headers before and after renaming, then the renamed frame's dimensions.
print("Original columns:", list(data_mendeley.columns))
print("\nRenamed columns:", list(data_renamed.columns))
print(f"\nDataset shape: {data_renamed.shape}")

### Data Quality Check

In [None]:
# Per-column null counts on the renamed frame; only columns with at least one null are listed.
print("Missing values in each column:")
missing_data = data_renamed.isna().sum()
print(missing_data.loc[missing_data.gt(0)])
if not missing_data.sum():
    print("No missing values found!")

In [None]:
# Two views of duplication:
#   exact_duplicates - True only for repeats after the first occurrence
#   duplicate_mask   - True for every row that belongs to a duplicated group
exact_duplicates = data_renamed.duplicated(keep="first")
duplicate_mask = data_renamed.duplicated(keep=False)

print(f"Total rows with duplicate patterns: {duplicate_mask.sum()}")
print(f"Exact duplicate rows to remove: {exact_duplicates.sum()}")

In [None]:
# Analyze duplicate patterns: one row per distinct duplicated record plus how
# many times it occurs.
# dropna=False keeps patterns that contain NaN. `duplicated()` treats NaNs in
# the same position as equal, whereas groupby drops NaN keys by default, so
# without it the two counts can disagree.
duplicate_groups = (
    data_renamed[duplicate_mask]
    .groupby(list(data_renamed.columns), dropna=False)
    .size()
    .reset_index(name='count')
)
print(f"Number of unique duplicate patterns: {len(duplicate_groups)}")
print("\nDuplicate frequency distribution:")
# How many patterns occur twice, three times, ...
print(duplicate_groups['count'].value_counts().sort_index())

In [None]:
# Preview the first ten rows that belong to a duplicated group, limited to a few key columns.
print("\nSample of duplicate cases:")
sample_duplicates = data_renamed.loc[duplicate_mask].head(10)
print(sample_duplicates[[
    'Age',
    'Gender',
    'Marital_Status',
    'Stress_Total_Score',
    'Anxiety_Total_Score',
    'Depression_Total_Score',
]])


In [None]:
# DECISION: drop exact duplicate responses; the printed text records the rationale.
duplicate_removal_rationale = (
    "Exact duplicate rows were detected in the dataset. Based on best practices in survey-based "
    "psychological research and data preprocessing, removing fully identical responses is necessary "
    "to prevent artificial inflation of sample size, biased statistical estimates, and distortion "
    "of class distributions.\n"
    "This practice is supported by peer-reviewed literature, including:\n"
    "• Kennedy et al. (2020) – Journal of Survey Statistics and Methodology, which emphasizes that\n"
    "  duplicate survey submissions introduce systematic bias and must be removed to preserve data integrity.\n"
)
print("\n=== DECISION: Duplicate Removal Justification ===")
print(duplicate_removal_rationale)

# Keep the first occurrence of each record; `.copy()` detaches the result from `data_renamed`.
data_clean = data_renamed.drop_duplicates(keep="first").copy()
rows_removed = data_renamed.shape[0] - data_clean.shape[0]

print(f"Clean dataset shape: {data_clean.shape}")
print(f"Rows removed: {rows_removed}")


In [None]:
# Count duplicates left after cleaning; `final_duplicates` is reused by the Hypothesis 2 check.
final_duplicates = data_clean.duplicated().sum()
print(f"Duplicates in clean data: {final_duplicates}")

print(
    "HYPOTHESIS 1 REJECTED: Duplicates still present"
    if final_duplicates != 0
    else "HYPOTHESIS 1 CONFIRMED: Duplicates were genuine cases, now handled properly"
)

## DATA QUALITY & BASIC STATISTICS

In [None]:
print("\n=== DATA QUALITY ASSESSMENT ===")

# Null counts on the deduplicated frame; `missing_values` is read again by the Hypothesis 2 check.
missing_values = data_clean.isna().sum()
print("Missing values per column:", missing_values.loc[missing_values.gt(0)], sep="\n")
if not missing_values.sum():
    print("No missing values")

In [None]:
# How many columns of the cleaned frame have each dtype.
print("\nData types:", data_clean.dtypes.value_counts(), sep="\n")


In [None]:
# Column groups for the summaries (`level_cols` is defined here but not used in this cell).
demographic_cols = [
    'Age', 'Gender', 'Marital_Status',
    'Educational_Status', 'Occupational_Status', 'Sleeping_Problem',
]
score_cols = [f'{scale}_Total_Score' for scale in ('Stress', 'Anxiety', 'Depression')]
level_cols = [f'{scale}_Level_Category' for scale in ('Stress', 'Anxiety', 'Depression')]

print("\n=== BASIC STATISTICS ===")
print("Demographic statistics:", data_clean[demographic_cols].describe(), sep="\n")
print("\nMental health score statistics:", data_clean[score_cols].describe(), sep="\n")

In [None]:
# Hypothesis 2 holds only when there are neither nulls nor leftover duplicates.
has_quality_issue = missing_values.sum() != 0 or final_duplicates != 0
print(
    "❌ HYPOTHESIS 2 REJECTED: Data quality issues need addressing"
    if has_quality_issue
    else "✅ HYPOTHESIS 2 CONFIRMED: Data quality is excellent for analysis"
)

## ENHANCED DEMOGRAPHIC VISUALIZATION

In [None]:
# Step 5: Enhanced Demographic Visualization
# Section banner only. The charts that follow each open their own figure, so
# no shared dashboard canvas is created here: an unused 20x15 figure would
# only render as an empty "0 Axes" output and stay open in memory.
print("\n=== ENHANCED DEMOGRAPHIC VISUALIZATION ===")

In [None]:
# Histogram of Age (15 bins) with a KDE overlay, drawn on an explicit Axes.
print("\n=== AGE DISTRIBUTION ===")
age_fig, age_ax = plt.subplots(figsize=(6, 4))
sns.histplot(data=data_clean, x='Age', kde=True, bins=15, color='skyblue', ax=age_ax)
age_ax.set_title('Age Distribution', fontweight='bold')
age_ax.set_xlabel('Age')
age_ax.set_ylabel('Frequency')
plt.show()


In [None]:
# 2. Gender Distribution: share of each gender code as a pie chart.
print("\n=== GENDER DISTRIBUTION ===")
gender_counts = data_clean['Gender'].value_counts()
gender_labels = [f'Gender {code}' for code in gender_counts.index]

gender_fig, gender_ax = plt.subplots(figsize=(6, 4))
gender_ax.pie(
    gender_counts.values,
    labels=gender_labels,
    autopct='%1.1f%%',
    colors=['#ff9999', '#66b3ff', '#99ff99'],
)
gender_ax.set_title('Gender Distribution', fontweight='bold')
plt.show()


In [None]:
# Bar chart of marital-status counts, ordered by status code.
marital_counts = data_clean['Marital_Status'].value_counts().sort_index()

marital_fig, marital_ax = plt.subplots(figsize=(6, 4))
marital_ax.bar(
    marital_counts.index,
    marital_counts.values,
    width=0.4,  # narrow bars so the two categories sit close together
    color=['#ff6b6b', '#4ecdc4'],
)

# NOTE(review): tick positions assume the column is coded 0/1 — confirm against the codebook.
marital_ax.set_xticks([0, 1])
marital_ax.set_xticklabels(['Not Married (0)', 'Married (1)'])
marital_ax.set_title('Marital Status Distribution', fontweight='bold')
marital_ax.set_xlabel('Marital Status')
marital_ax.set_ylabel('Count')
plt.show()


In [None]:
# 4. Educational Status: counts per status code, followed by a printed legend of the codes.
print("\n=== EDUCATIONAL STATUS DISTRIBUTION ===")
edu_counts = data_clean['Educational_Status'].value_counts().sort_index()

edu_fig, edu_ax = plt.subplots(figsize=(6, 4))
edu_ax.bar(
    edu_counts.index,
    edu_counts.values,
    color=['#feca57', '#ff9ff3', '#54a0ff', '#00d2d3', '#5f27cd'],
)
edu_ax.set_title('Educational Status Distribution', fontweight='bold')
edu_ax.set_xlabel('Educational Status Code')
edu_ax.set_ylabel('Count')
plt.show()

# One legend entry per line, printed in a single call.
print(
    "Q 1.4 Educational Status",
    "",
    "1. Illiterate ",
    "2. Primary",
    "3. SSC",
    "4. HSC",
    "5. Graduation and above",
    sep="\n",
)

# Kaggle : Mental Health in Tech Survey

In [None]:
# Read the Kaggle survey CSV from the notebook's working directory and display it.
data_kaggle = pd.read_csv(filepath_or_buffer='kaggle_survey.csv')
data_kaggle

In [None]:
# Lower-case every header so later cells can select columns such as "age" and "gender".
data_kaggle = data_kaggle.rename(columns=str.lower)

In [None]:
# Display the Kaggle frame again, now with lower-cased headers.
data_kaggle

In [None]:
# Report the number of Kaggle columns and their names.
print("Jumlah kolom :", data_kaggle.shape[1])
print("Nama kolom   :", data_kaggle.columns.tolist())

### Figshare : Mental health data of 776 Mexican medical students (PHQ-9, GAD-7 and Epworth Sleepiness Scale Scores)

In [None]:
# Read the Figshare CSV from the notebook's working directory and display it.
data_figshare = pd.read_csv(filepath_or_buffer='figshare_mental_health.csv')
data_figshare

In [None]:
# Derive comparable "depression" / "anxiety" columns on the Figshare and Kaggle frames.
# (numpy is already imported in the first cell and is not needed here, so the
# repeated import was dropped.)

# Item columns phq1..phq9 and gad1..gad7; the row-wise sums become the scores.
phq_cols = [f"phq{i}" for i in range(1, 10)]
gad_cols = [f"gad{i}" for i in range(1, 8)]

data_figshare["depression"] = data_figshare[phq_cols].sum(axis=1)
data_figshare["anxiety"] = data_figshare[gad_cols].sum(axis=1)

# Kaggle proxies: 1 when `treatment` is exactly "Yes", otherwise 0 (including missing).
data_kaggle["depression"] = data_kaggle["treatment"].eq("Yes").astype(int)

# Ordinal encoding of `work_interfere`. `.map` yields a numeric column and
# turns missing or unexpected labels into NaN. `.replace` would leave such
# labels as strings in an object column, and it relies on implicit
# downcasting that newer pandas deprecates.
work_interfere_scores = {"Never": 0, "Rarely": 1, "Sometimes": 2, "Often": 3}
data_kaggle["anxiety"] = data_kaggle["work_interfere"].map(work_interfere_scores)

# NOTE(review): the Kaggle columns are a 0/1 flag and a 0-3 ordinal, while the
# Figshare columns are questionnaire sums — the scales are not directly comparable.

In [None]:
# Columns shared by all three sources in the combined frame.
key_cols = ["age", "gender", "depression", "anxiety"]

# `data_mendeley` still carries the raw export headers (the readable names
# only exist on `data_renamed` / `data_clean`), so selecting `key_cols` from
# it directly raises KeyError. Alias the raw headers to the shared names first.
# NOTE(review): this keeps the raw, non-deduplicated rows the original cell
# targeted; switch to `data_clean` if the combined analysis should honour the
# duplicate-removal decision.
mendeley_key_map = {
    "Q1_1": "age",
    "Q1_2": "gender",
    "Depression_Score": "depression",
    "Anxiety_Score": "anxiety",
}
mendeley_key = data_mendeley.rename(columns=mendeley_key_map)[key_cols]
kaggle_key = data_kaggle[key_cols]
# NOTE(review): assumes the Figshare CSV already has lowercase `age` and `gender` columns — confirm.
figshare_key = data_figshare[key_cols]

In [None]:
# Stack the three sources row-wise (Figshare, Kaggle, Mendeley) under a fresh 0..n-1 index.
combined_frames = [figshare_key, kaggle_key, mendeley_key]
data_all = pd.concat(combined_frames, axis=0, ignore_index=True)

In [None]:
# Copy of the combined frame without rows whose gender is missing.
# NOTE(review): `data_all_all` is not read anywhere else in the visible notebook — confirm it is still needed.
data_all_all = data_all.dropna(subset=["gender"])

In [None]:
# Keep only the rows that have a gender value (original index labels are preserved).
data_all = data_all.loc[data_all["gender"].notna()]

In [None]:
# Canonical gender labels. After trimming and lower-casing, any spelling that
# is not a key here becomes NaN.
gender_aliases = {
    "male": "male",
    "m": "male",
    "1": "male",
    "female": "female",
    "f": "female",
    "2": "female",
    "nonbinary": "nonbinary",
}

data_all["gender"] = (
    data_all["gender"]
    .astype(str)
    .str.strip()
    .str.lower()
    .map(gender_aliases)
)

In [None]:
# Display the combined frame after gender normalisation.
data_all

In [None]:
# Descriptive statistics, per-gender means and pairwise correlations for the combined frame.
print("Summary Statistik:", data_all.describe(), sep="\n")

gender_groups = data_all.groupby("gender")
print("\nRata-rata depresi per gender:", gender_groups["depression"].mean(), sep="\n")
print("\nRata-rata anxiety per gender:", gender_groups["anxiety"].mean(), sep="\n")

print(
    "\nKorelasi antar variabel mental health:",
    data_all[["depression", "anxiety", "age"]].corr(),
    sep="\n",
)


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Annotated heatmap of the pairwise correlations between the three numeric columns.
numeric_cols = ['depression', 'anxiety', 'age']
corr_matrix = data_all.loc[:, numeric_cols].corr()

heatmap_fig, heatmap_ax = plt.subplots(figsize=(6, 5))
sns.heatmap(corr_matrix, annot=True, cmap="coolwarm", fmt=".3f", ax=heatmap_ax)
heatmap_ax.set_title("Heatmap Korelasi Variabel Mental Health")
plt.show()

In [None]:
# One boxplot per score, split by gender; each gets its own figure and is shown in turn.
for score_col, chart_title in (
    ("depression", "Depression berdasarkan Gender"),
    ("anxiety", "Anxiety berdasarkan Gender"),
):
    box_fig, box_ax = plt.subplots(figsize=(6, 4))
    sns.boxplot(data=data_all, x="gender", y=score_col, ax=box_ax)
    box_ax.set_title(chart_title)
    plt.show()


##### Status Pekerjaan

In [None]:
# Tag every row of each source with a coarse occupation category.

# `data_mendeley` keeps the raw export headers, where the occupation question
# is 'Q1_5'; 'Occupational_Status' only exists on the renamed frames. Accept
# either name so the keyword matching below is actually reachable.
occupation_source = next(
    (col for col in ("Occupational_Status", "Q1_5") if col in data_mendeley.columns),
    None,
)

data_mendeley["occupation_category"] = "Other"
if occupation_source is not None:
    # Cast to text so `.str.contains` also works when the column holds numeric codes.
    # NOTE(review): if the values are numeric codes, no keyword matches and every
    # row stays "Other" — a codebook mapping would be needed instead.
    occupation_text = data_mendeley[occupation_source].astype(str)
    # Applied in order, so a later pattern overrides an earlier one for rows matching both.
    for pattern, category in (
        ("student", "Student"),
        ("unemployed", "Unemployed"),
        ("employee|worker|staff", "General_Worker"),
    ):
        pattern_hits = occupation_text.str.contains(pattern, case=False, na=False)
        data_mendeley.loc[pattern_hits, "occupation_category"] = category

# The other two sources get a single fixed category each.
data_kaggle["occupation_category"] = "Tech_Worker"

data_figshare["occupation_category"] = "Student"


In [None]:
key_cols_extended = ["depression", "anxiety", "occupation_category"]

# `data_mendeley` has no `depression` / `anxiety` columns: its scores are
# still under the raw export headers, so selecting them directly raises
# KeyError. Alias the raw score columns before selecting.
mendeley_key = data_mendeley.rename(
    columns={"Depression_Score": "depression", "Anxiety_Score": "anxiety"}
)[key_cols_extended]
kaggle_key = data_kaggle[key_cols_extended]
figshare_key = data_figshare[key_cols_extended]

# Stack Figshare, Kaggle and Mendeley row-wise. This rebinds `data_all`,
# replacing the frame used by the gender analysis.
data_all = pd.concat([figshare_key, kaggle_key, mendeley_key], ignore_index=True)

In [None]:
# Display the combined frame that now carries `occupation_category`.
data_all

In [None]:
# Mean depression and anxiety per occupation category, rounded to two decimals.
analysis = (
    data_all
    .groupby("occupation_category")[["depression", "anxiety"]]
    .mean()
    .round(2)
)
print("=== Rata-rata Depresi & Anxiety Berdasarkan Status Pekerjaan ===", analysis, sep="\n")

In [None]:
# Bar chart of depression by occupation category (seaborn's default bar height is the mean).
depression_fig, depression_ax = plt.subplots(figsize=(8, 5))
sns.barplot(data=data_all, x="occupation_category", y="depression", ax=depression_ax)
depression_ax.set_title("Rata-rata Skor Depresi Berdasarkan Status Pekerjaan")
depression_ax.set_xlabel("Status Pekerjaan")
depression_ax.set_ylabel("Skor Depresi")
# Tilt the category labels.
plt.setp(depression_ax.get_xticklabels(), rotation=20)
plt.show()

In [None]:
# Bar chart of anxiety by occupation category (seaborn's default bar height is the mean).
anxiety_fig, anxiety_ax = plt.subplots(figsize=(8, 5))
sns.barplot(data=data_all, x="occupation_category", y="anxiety", ax=anxiety_ax)
anxiety_ax.set_title("Rata-rata Skor Anxiety Berdasarkan Status Pekerjaan")
anxiety_ax.set_xlabel("Status Pekerjaan")
anxiety_ax.set_ylabel("Skor Anxiety")
# Tilt the category labels.
plt.setp(anxiety_ax.get_xticklabels(), rotation=20)
plt.show()