<a href="https://colab.research.google.com/github/ahmdhqnn/KAT-Praktikum/blob/main/Asesmen2/Asesmen2Ahmad_Hakin_Najili_607062300122.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

# Read the dataset from the CSV in the working directory
df = pd.read_csv('enhanced_anxiety_dataset.csv')

# --- Exploratory Data Analysis ---

# 1. Count the null entries in every column
missing_counts = df.isnull().sum()
print("Missing Values:\n", missing_counts)

# 2. Summary statistics as returned by DataFrame.describe()
summary_stats = df.describe()
print("\nDescriptive Statistics:\n", summary_stats)

# 3. Distribution of Anxiety Level
# Ten-bin histogram with a KDE overlay; the figure is written to disk and then
# closed, so nothing is rendered inline.
target_col = 'Anxiety Level (1-10)'
fig, ax = plt.subplots(figsize=(8, 6))
sns.histplot(df[target_col], bins=10, kde=True, ax=ax)
ax.set_title('Distribution of Anxiety Level')
ax.set_xlabel(target_col)
ax.set_ylabel('Count')
fig.savefig('anxiety_level_distribution.png')
plt.close(fig)

# 4. Correlation heatmap for numeric features
# Pairwise correlations (DataFrame.corr defaults) between the numeric columns listed
# below, target included, with each cell annotated to two decimals.
numeric_cols = ['Age', 'Sleep Hours', 'Physical Activity (hrs/week)', 'Caffeine Intake (mg/day)',
                'Alcohol Consumption (drinks/week)', 'Stress Level (1-10)', 'Heart Rate (bpm)',
                'Breathing Rate (breaths/min)', 'Sweating Level (1-5)', 'Therapy Sessions (per month)',
                'Diet Quality (1-10)', 'Anxiety Level (1-10)']
plt.figure(figsize=(12, 8))
sns.heatmap(df[numeric_cols].corr(), annot=True, cmap='coolwarm', fmt='.2f')
plt.title('Correlation Heatmap')
# The column names double as tick labels and are long; with the default fixed margins
# they run past the figure edge and are cut off in the PNG. bbox_inches='tight' makes
# savefig size the output to include every label.
plt.savefig('correlation_heatmap.png', bbox_inches='tight')
plt.close()

# 5. Boxplot for numeric features to detect outliers
# The selected features are in different units (hours, mg/day, bpm). Drawn on one
# shared y-axis, the feature with the largest values sets the scale and the others
# collapse into thin lines, hiding their outliers. Each feature therefore gets its
# own panel with an independent y-axis; the output is still a single PNG.
boxplot_cols = ['Sleep Hours', 'Caffeine Intake (mg/day)', 'Heart Rate (bpm)']
# squeeze=False keeps `axes` two-dimensional even if the list is reduced to one column.
fig, axes = plt.subplots(1, len(boxplot_cols), figsize=(12, 6), squeeze=False)
for ax, col in zip(axes[0], boxplot_cols):
    # Passing the Series as `y` draws a vertical box labelled with the column name.
    sns.boxplot(y=df[col], ax=ax)
fig.suptitle('Boxplot of Selected Numeric Features')
# Leave a strip at the top for the figure-level title.
fig.tight_layout(rect=(0, 0, 1, 0.95))
fig.savefig('boxplot_numeric.png')
plt.close(fig)

# 6. Barplot for Anxiety Level by Gender
# One bar per gender; seaborn's default estimator (the mean) sets the bar height.
fig, ax = plt.subplots(figsize=(8, 6))
sns.barplot(data=df, x='Gender', y='Anxiety Level (1-10)', ax=ax)
ax.set_title('Average Anxiety Level by Gender')
fig.savefig('anxiety_by_gender.png')
plt.close(fig)

# 7. Barplot for Anxiety Level by Occupation
# One bar per occupation; seaborn's default estimator (the mean) sets the bar height.
plt.figure(figsize=(12, 6))
sns.barplot(x='Occupation', y='Anxiety Level (1-10)', data=df)
# Rotated labels are centre-anchored by default, which shifts them away from their
# bars; anchoring the right end keeps each label finishing under its own tick.
plt.xticks(rotation=45, ha='right')
plt.title('Average Anxiety Level by Occupation')
# Rotated labels need more room below the axes than the default margin provides;
# a tight bounding box keeps them and the x-axis label inside the saved image.
plt.savefig('anxiety_by_occupation.png', bbox_inches='tight')
plt.close()

Missing Values:
 Age                                  0
Gender                               0
Occupation                           0
Sleep Hours                          0
Physical Activity (hrs/week)         0
Caffeine Intake (mg/day)             0
Alcohol Consumption (drinks/week)    0
Smoking                              0
Family History of Anxiety            0
Stress Level (1-10)                  0
Heart Rate (bpm)                     0
Breathing Rate (breaths/min)         0
Sweating Level (1-5)                 0
Dizziness                            0
Medication                           0
Therapy Sessions (per month)         0
Recent Major Life Event              0
Diet Quality (1-10)                  0
Anxiety Level (1-10)                 0
dtype: int64

Descriptive Statistics:
                 Age   Sleep Hours  Physical Activity (hrs/week)  \
count  11000.000000  11000.000000                  11000.000000   
mean      40.241727      6.650691                      2.942136   
st

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import warnings
warnings.filterwarnings('ignore')

# Read the dataset from the CSV in the working directory
df = pd.read_csv('enhanced_anxiety_dataset.csv')

# --- Machine Learning: Regression ---

# 1. Separate the regression target from the predictor columns
target_col = 'Anxiety Level (1-10)'
y = df[target_col]
X = df.drop(columns=[target_col])

# 2. Column groups; each group is routed to its own transformer below
categorical_cols = [
    'Gender',
    'Occupation',
    'Smoking',
    'Family History of Anxiety',
    'Dizziness',
    'Medication',
    'Recent Major Life Event',
]
numeric_cols = [
    'Age',
    'Sleep Hours',
    'Physical Activity (hrs/week)',
    'Caffeine Intake (mg/day)',
    'Alcohol Consumption (drinks/week)',
    'Stress Level (1-10)',
    'Heart Rate (bpm)',
    'Breathing Rate (breaths/min)',
    'Sweating Level (1-5)',
    'Therapy Sessions (per month)',
    'Diet Quality (1-10)',
]

# 3. Preprocessing: standardise the numeric columns and one-hot encode the
#    categorical ones (first level of each category dropped, unseen levels ignored)
numeric_step = ('num', StandardScaler(), numeric_cols)
categorical_step = (
    'cat',
    OneHotEncoder(drop='first', handle_unknown='ignore'),
    categorical_cols,
)
preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

# 4. Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# 5. Candidate regressors, keyed by the name used in reports and file names
models = {
    'Linear Regression': LinearRegression(),
    'Random Forest': RandomForestRegressor(random_state=42),
    'XGBoost': XGBRegressor(random_state=42),
}

# 6. Fit every candidate behind the shared preprocessor and score it on the test split
results = {}
for name, model in models.items():
    pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)

    # Error metrics go straight into the results table for this model
    results[name] = {
        'MAE': mean_absolute_error(y_test, y_pred),
        'RMSE': np.sqrt(mean_squared_error(y_test, y_pred)),
        'R2': r2_score(y_test, y_pred),
    }

    # Predicted-vs-actual scatter with a dashed y = x reference line spanning the
    # observed target range; saved to disk and closed so figures don't accumulate
    lowest, highest = y_test.min(), y_test.max()
    fig, ax = plt.subplots(figsize=(8, 6))
    ax.scatter(y_test, y_pred, alpha=0.5)
    ax.plot([lowest, highest], [lowest, highest], 'r--')
    ax.set_xlabel('Actual Anxiety Level')
    ax.set_ylabel('Predicted Anxiety Level')
    ax.set_title(f'{name}: Predictions vs Actual')
    fig.savefig(f'{name.lower().replace(" ", "_")}_predictions.png')
    plt.close(fig)

# 7. Report the metrics for each model, rounded to two decimals
print("\nModel Evaluation Results:")
for name, metrics in results.items():
    print(f"\n{name}:")
    print(f"MAE: {metrics['MAE']:.2f}")
    print(f"RMSE: {metrics['RMSE']:.2f}")
    print(f"R2: {metrics['R2']:.2f}")

# 8. Grid search over XGBoost hyperparameters
# The `model__` prefix routes each parameter to the pipeline step named 'model'.
# Candidates are ranked by 5-fold cross-validated negative MAE on the training split.
xgb_pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('model', XGBRegressor(random_state=42)),
    ]
)
param_grid = {
    'model__n_estimators': [100, 200],
    'model__max_depth': [3, 5],
    'model__learning_rate': [0.01, 0.1],
}
grid_search = GridSearchCV(
    estimator=xgb_pipeline,
    param_grid=param_grid,
    scoring='neg_mean_absolute_error',
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# 9. Score the search's predictions on the held-out test split
y_pred_tuned = grid_search.predict(X_test)
r2_tuned = r2_score(y_test, y_pred_tuned)
mse_tuned = mean_squared_error(y_test, y_pred_tuned)
rmse_tuned = np.sqrt(mse_tuned)
mae_tuned = mean_absolute_error(y_test, y_pred_tuned)

print("\nTuned XGBoost Results:")
print(f"Best Parameters: {grid_search.best_params_}")
print(f"MAE: {mae_tuned:.2f}")
print(f"RMSE: {rmse_tuned:.2f}")
print(f"R2: {r2_tuned:.2f}")

# 10. Feature importance for Random Forest
# Fit a fresh Random Forest pipeline and chart its ten largest feature importances.
rf_pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('model', RandomForestRegressor(random_state=42))])
rf_pipeline.fit(X_train, y_train)
# Names must line up with the columns of the transformed matrix. This relies on the
# preprocessor listing the 'num' transformer before 'cat': numeric columns keep their
# names, followed by the one-hot column names generated by the encoder.
feature_names = (numeric_cols +
                 rf_pipeline.named_steps['preprocessor']
                 .named_transformers_['cat']
                 .get_feature_names_out(categorical_cols).tolist())
importances = rf_pipeline.named_steps['model'].feature_importances_
feature_importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': importances})
# Keep only the ten most important features, largest first.
feature_importance_df = feature_importance_df.sort_values('Importance', ascending=False).head(10)

plt.figure(figsize=(10, 6))
sns.barplot(x='Importance', y='Feature', data=feature_importance_df)
plt.title('Top 10 Feature Importance (Random Forest)')
# Feature names sit on the y-axis and are wider than the default left margin, so they
# are truncated in the PNG unless the bounding box is expanded to fit them.
plt.savefig('feature_importance.png', bbox_inches='tight')
plt.close()


Model Evaluation Results:

Linear Regression:
MAE: 0.89
RMSE: 1.13
R2: 0.73

Random Forest:
MAE: 0.82
RMSE: 1.02
R2: 0.78

XGBoost:
MAE: 0.86
RMSE: 1.07
R2: 0.75

Tuned XGBoost Results:
Best Parameters: {'model__learning_rate': 0.1, 'model__max_depth': 5, 'model__n_estimators': 100}
MAE: 0.81
RMSE: 1.00
R2: 0.78
