In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the survey CSV from the Kaggle input directory into the working DataFrame.
csv_path = '/kaggle/input/anxiety-and-depression-mental-health-factors/anxiety_depression_data.csv'
df = pd.read_csv(csv_path)

In [None]:
# Preview the first five rows to sanity-check that the file loaded as expected.
df.head(n=5)

### 1: Looking at general information about the dataset (missing values, data types, statistics)

In [None]:
# Step 1: General information of the dataset
# DataFrame.info() prints its summary and returns None, so it is called for
# its printed output rather than binding the (always-None) result to a name.
df.info()

# Summary statistics for every column, numeric and non-numeric alike
df_description = df.describe(include='all')

# Number of missing entries per column
missing_values = df.isnull().sum()

# Let's filter only the columns with missing data
missing_values = missing_values[missing_values > 0]

# Last expression of the cell: an empty Series here means no column has gaps
missing_values


### 1: General View of the Dataset

- The dataset contains 1200 observations (rows) and 21 features (columns).

- There are no missing values: every column has 1200 non-null entries.

- Types of variables:

  - Numeric (int/float): 16 columns

  - Categorical (object): 5 columns (Gender, Education_Level, Employment_Status, Medication_Use, Substance_Use)

## 2-Distribution and statistical analysis of numerical variables

In [None]:
import matplotlib.pyplot as plt

# Pick out the numeric (int64 / float64) columns
numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns

# Draw one 30-bin histogram per numeric variable, each on its own figure
for col in numeric_cols:
    fig, ax = plt.subplots(figsize=(6, 4))
    ax.hist(df[col], bins=30)
    ax.set_title(f'{col} Distribution')
    ax.set_xlabel(col)
    ax.set_ylabel('Frequency')
    fig.tight_layout()
    plt.show()


### 2: Distributions of Numerical Variables

Age: Commonly between the ages of 20-70, mostly adults.

Anxiety_Score, Depression_Score, Stress_Level: These psychological scales are bounded within a fixed range (probably around 0-20) and may be skewed to the right or left.

Sleep_Hours and Physical_Activity_Hrs: Generally close to normal distribution, but may have outliers.

Financial_Stress and Work_Stress: Scores are mostly concentrated in the middle levels.

Self_Esteem_Score, Life_Satisfaction_Score, Loneliness_Score: Data on emotional well-being appear balanced, but their distributions may differ.

## 3: Analyze the distribution of categorical variables

In [None]:
# Pick out the categorical (object-typed) columns
categorical_cols = df.select_dtypes(include=['object']).columns

# Draw one bar chart of category frequencies per categorical variable
for col in categorical_cols:
    fig, ax = plt.subplots(figsize=(6, 4))
    category_counts = df[col].value_counts()
    category_counts.plot(kind='bar', ax=ax)
    ax.set_title(f'{col} Distribution')
    ax.set_xlabel(col)
    ax.set_ylabel('Frequency')
    fig.tight_layout()
    plt.show()


### 3: Distributions of Categorical Variables

When we look at the categorical columns:

Gender: There seems to be a balanced distribution between male and female participants.

Education_Level: "Bachelor's" and "Master's" levels are prominent. High school and other levels are less represented.

Employment_Status: Employed individuals are in the majority, but there are also a significant number of unemployed and retired individuals.

Medication_Use and Substance_Use: The "None" group is clearly dominant; the number of people using medication or substances is lower.

## 4: Correlation Analysis

In [None]:
import seaborn as sns
import numpy as np

# Pairwise correlations across the numeric columns selected earlier
numeric_frame = df[numeric_cols]
correlation_matrix = numeric_frame.corr()

# Render the matrix as an annotated heatmap on an explicit figure/axes pair
fig, ax = plt.subplots(figsize=(14, 10))
sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt=".2f",
    cmap="coolwarm",
    square=True,
    ax=ax,
)
ax.set_title("Correlation Matrix Between Numerical Variables")
fig.tight_layout()
plt.show()


### 4: Correlation Analysis

Some relationships that stand out according to the correlation matrix:

There are positive and strong relationships between Anxiety_Score, Depression_Score, Stress_Level. These 3 variables are highly correlated with each other, which is expected.

With Self_Esteem_Score:

Self_Esteem_Score is negatively correlated with both Depression_Score and Anxiety_Score. In other words, these scores decrease as self-esteem increases.

With Life_Satisfaction_Score:

Life_Satisfaction_Score is negatively related to both Depression_Score and Anxiety_Score. Depression and anxiety are lower in individuals with high life satisfaction.

There is a positive correlation between Loneliness_Score and depression and anxiety. These emotional states worsen as loneliness increases.

There are weak but negative relationships between Sleep_Hours and Physical_Activity_Hrs and psychological scores: more sleep and physical activity are generally associated with lower stress/depression/anxiety.



## Anxiety_Score Grouping

In [None]:
# Let's break down Anxiety_Score into categories
# Ranges used in this notebook: 0-6 (Low), 7-13 (Medium), 14+ (High)
def categorize_anxiety(score):
    """Bin a numeric anxiety score into 'Low', 'Medium' or 'High'.

    Parameters
    ----------
    score : int or float
        Raw Anxiety_Score value.

    Returns
    -------
    str or float
        'Low' when score <= 6, 'Medium' when score <= 13, 'High' otherwise.
        Missing input (NaN / None) is returned as NaN instead of a label.
    """
    # Every comparison against NaN is False, so without this guard a missing
    # score would fall through both branches and be labelled 'High'.
    if pd.isna(score):
        return np.nan
    if score <= 6:
        return 'Low'
    if score <= 13:
        return 'Medium'
    return 'High'

# Attach the binned anxiety label to every row
df['Anxiety_Level'] = df['Anxiety_Score'].map(categorize_anxiety)

# For each categorical variable, tabulate the share of each Anxiety_Level
# within every category; level/category combinations that never occur become 0
anxiety_by_cats = {
    col: (
        df.groupby(col)['Anxiety_Level']
        .value_counts(normalize=True)
        .unstack()
        .fillna(0)
    )
    for col in categorical_cols
}

# Show the dictionary of per-variable proportion tables
anxiety_by_cats


### Anxiety_Score Grouping

Low: 0–6

Medium: 7–13

High: 14 and above

### Analysis of Anxiety Levels According to Categorical Variables:

First, I showed the distribution according to the Gender variable (as a percentage). The results of other categorical variables are as follows:

1. Gender:
High anxiety rate is higher in women with 37.4%.

This rate is lower in men (33.6%).

“Moderate” level anxiety is highest in non-binary individuals with 42%.

2. Education_Level:
High anxiety is around 35-36% in undergraduate and high school graduates.

High anxiety is slightly lower in graduates (33%).

3. Employment_Status:
High anxiety rate stands out in retired individuals with 37.9%.

Other groups are close to each other.

4. Medication_Use:
High anxiety rate is higher in individuals who use medication regularly (37.8%).

This rate is 33.9% in those who never use medication.

5. Substance_Use:
High anxiety in those who never use it is 36.1%.

This rate is lower in frequent users (29.0%).

In [None]:
# Let's draw a stacked bar plot of Anxiety_Level shares for each categorical variable
for col in categorical_cols:
    # Share of each anxiety level within every category of `col`
    counts = df.groupby(col)['Anxiety_Level'].value_counts(normalize=True).unstack().fillna(0)

    counts.plot(kind='bar', stacked=True, figsize=(7, 5))
    # The previous f-string put the column name first with no separator and
    # ended on a dangling "by" (e.g. "GenderDistribution of Anxiety Levels by");
    # the column name belongs after "by".
    plt.title(f'Distribution of Anxiety Levels by {col}')
    plt.ylabel('Oran')
    plt.xlabel(col)
    plt.legend(title='Anxiety Seviyesi')
    plt.tight_layout()
    plt.show()


### Visualizing Anxiety Levels

The above graphs visually represent the distribution of Anxiety_Level for each categorical variable. Highlights:

Gender: High anxiety is more prevalent in women, while low and moderate levels are more balanced in men.

Education_Level: High anxiety rates appear to decrease slightly as education levels increase.

Employment_Status: Retired individuals report higher anxiety than other groups.

Medication_Use: High anxiety rates are notable in those who use regular medications.

Substance_Use: Low anxiety rates are high in frequent substance users — this may require further analysis or interpretation (e.g., it could be a suppression mechanism).

## Depression_Score Groupings

In [None]:
# Let's break down Depression_Score into categories
# Ranges used in this notebook: 0-6 (Low), 7-13 (Medium), 14+ (High)
def categorize_depression(score):
    """Bin a numeric depression score into 'Low', 'Medium' or 'High'.

    Parameters
    ----------
    score : int or float
        Raw Depression_Score value.

    Returns
    -------
    str or float
        'Low' when score <= 6, 'Medium' when score <= 13, 'High' otherwise.
        Missing input (NaN / None) is returned as NaN instead of a label.
    """
    # Every comparison against NaN is False, so without this guard a missing
    # score would fall through both branches and be labelled 'High'.
    if pd.isna(score):
        return np.nan
    if score <= 6:
        return 'Low'
    if score <= 13:
        return 'Medium'
    return 'High'

# Attach the binned depression label to every row
df['Depression_Level'] = df['Depression_Score'].map(categorize_depression)

# For each categorical variable, tabulate the share of each Depression_Level
# within every category; level/category combinations that never occur become 0
depression_by_cats = {
    col: (
        df.groupby(col)['Depression_Level']
        .value_counts(normalize=True)
        .unstack()
        .fillna(0)
    )
    for col in categorical_cols
}

# Show the dictionary of per-variable proportion tables
depression_by_cats


### Depression_Score Grouping:

Low: 0–6

Medium: 7–13

High: 14+

Depression_Level rates according to categorical variables are given in the table above. Some noteworthy points:

1. Gender:
The high depression rate is quite high in the Non-binary and Other groups (approximately 39–43%).

There is a more balanced distribution between women and men.

2. Education_Level:
The high depression rate is slightly higher in bachelor's degree graduates (38.8%).

This rate is at its lowest levels in doctorate (PhD) holders (approximately 32%).

3. Employment_Status:
The distribution of depression appears quite similar in all employment statuses.

4. Medication_Use:
The high depression rate is around 37% in individuals who use regular medication.

It is lower in those who do not use any medication at all.

5. Substance_Use:
Moderate and high depression rates are higher in frequent substance users.

In [None]:
# One stacked bar chart per categorical variable, showing how the
# Depression_Level shares split within each category
for col in categorical_cols:
    level_shares = (
        df.groupby(col)['Depression_Level']
        .value_counts(normalize=True)
        .unstack()
        .fillna(0)
    )

    ax = level_shares.plot(kind='bar', stacked=True, figsize=(7, 5))
    ax.set_title(f'{col} Göre Depression Seviyesi Dağılımı')
    ax.set_ylabel('Oran')
    ax.set_xlabel(col)
    ax.legend(title='Depression Seviyesi')
    plt.tight_layout()
    plt.show()


### Depression Level Distribution – Visualization

Highlights from the graphs:

Gender: The high depression rate appears greater in the Non-binary and Other groups.

Education_Level: As education level increases (especially at PhD level), the high depression rate decreases.

Medication_Use: The high depression rate is noticeably greater in individuals who use medication regularly.

Substance_Use: "Medium" and "High" depression levels stand out among frequent substance users.

Employment_Status: The distribution is similar across all categories, but the "Low" depression rate is slightly higher among employed individuals.



# A Simple Prediction Model Without Feature Engineering

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Let's select the columns to be used for the model.
# The `features = ...` assignment used to sit on the end of the comment line
# above, so it never executed and `features` was undefined on a fresh kernel.
# Anxiety_Score is the target, and Depression_Score is excluded as in the
# original selection. Anxiety_Level / Depression_Level are binned copies of
# those scores added to `df` earlier in the notebook; leaving them in would
# leak the target into the features, so they are dropped too.
# errors='ignore' keeps the cell runnable when the level columns are absent.
features = df.drop(
    columns=['Anxiety_Score', 'Depression_Score', 'Anxiety_Level', 'Depression_Level'],
    errors='ignore',
)
target = df['Anxiety_Score']

# Let's separate numeric and categorical variables
numerical_cols = features.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_cols_model = features.select_dtypes(include=['object', 'category']).columns.tolist()

# One-hot encoder and pipeline definition: numeric columns pass through
# untouched, categorical columns are one-hot encoded with the first level
# of each dropped
preprocessor = ColumnTransformer([
    ('num', 'passthrough', numerical_cols),
    ('cat', OneHotEncoder(drop='first'), categorical_cols_model)
])

# Linear regression pipeline
model_pipeline = Pipeline([
    ('preprocess', preprocessor),
    ('regressor', LinearRegression())
])

# Hold out 20% of the rows for evaluation; fixed seed for a repeatable split
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

# Fit preprocessing and regression on the training split only
model_pipeline.fit(X_train, y_train)

# Score the held-out split
y_pred = model_pipeline.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

mse, r2


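## Comment:
The R² value is 0.89, meaning our model can explain 89% of the variance in the Anxiety_Score variable. This is quite high and indicates a strong linear relationship.

The MSE value is also low, indicating that the model's predictions are close to reality.

Caveat: these figures are only meaningful if no feature is derived from the target. If the binned Anxiety_Level column (computed from Anxiety_Score earlier in the notebook) is among the features, the R² is inflated by target leakage and should be re-measured without it.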
