In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns

# Read the NSDUH sample extract from a path relative to the working directory.
df = pd.read_csv('../data/nsduh_sample.csv')

# Report (rows, columns) as plain text ...
print("Shape:", df.shape)
# ... and finish on a bare expression so the notebook renders the first five
# rows as a table.
df.head(n=5)

Shape: (10000, 6)


Unnamed: 0,age,gender,income,education,mental_health,drug_use
0,56,0,81407,3,0,0
1,46,0,73821,4,0,0
2,32,1,55690,1,0,0
3,60,0,26654,2,0,0
4,25,0,87557,3,0,0


In [5]:
# Name of the label column; the cleaning and splitting cells refer to the
# label through this variable.
target_col = 'drug_use'

# Column dtypes and non-null counts.
df.info()
# Number of missing values per column (isna is the same check as isnull).
print(df.isna().sum())
# Class balance of the label.
print(df[target_col].value_counts())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype
---  ------         --------------  -----
 0   age            10000 non-null  int64
 1   gender         10000 non-null  int64
 2   income         10000 non-null  int64
 3   education      10000 non-null  int64
 4   mental_health  10000 non-null  int64
 5   drug_use       10000 non-null  int64
dtypes: int64(6)
memory usage: 468.9 KB
age              0
gender           0
income           0
education        0
mental_health    0
drug_use         0
dtype: int64
drug_use
0    8702
1    1298
Name: count, dtype: int64


In [11]:
# Clean the frame before modelling.
#
# 1) Rows without a label cannot be used for supervised training, so drop them.
#    `.copy()` makes the result an independent frame; without it, the column
#    assignment below can raise pandas' SettingWithCopyWarning whenever
#    `dropna` actually removes rows.
df = df.dropna(subset=[target_col]).copy()
numeric_cols = df.select_dtypes(include=[np.number]).columns

# 2) Median-impute the numeric *feature* columns only. The label is left out:
#    it has no gaps after step 1 and should never be filled with a median.
#
# NOTE(review): the medians are computed over all rows, i.e. before the
# train/test split. The recorded `isnull().sum()` output shows no missing values,
# so this is a no-op on the current file. If the data ever contains gaps, fit
# the imputation on the training split only so that test-set statistics do not
# leak into training.
feature_numeric_cols = [col for col in numeric_cols if col != target_col]
if feature_numeric_cols:
    df[feature_numeric_cols] = df[feature_numeric_cols].fillna(
        df[feature_numeric_cols].median()
    )

In [12]:
# Separate the label column from the predictors.
y = df[target_col]
X = df.drop(columns=[target_col])

# Hold out 20% of the rows for evaluation. Stratifying on y keeps the label
# proportions the same in both parts; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [13]:
# Learn the per-column mean and standard deviation from the training rows only,
# then apply that same transformation to both splits so the test rows do not
# influence the scaling parameters.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [14]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report, confusion_matrix

In [15]:
# Logistic-regression baseline. class_weight='balanced' reweights the classes
# to compensate for the skewed label distribution.
lr = LogisticRegression(class_weight='balanced', random_state=42).fit(
    X_train_scaled, y_train
)

# Score for label 1 (second probability column) and hard class predictions
# on the held-out rows.
y_prob_lr = lr.predict_proba(X_test_scaled)[:, 1]
y_pred_lr = lr.predict(X_test_scaled)

print("LR Accuracy:", accuracy_score(y_test, y_pred_lr))
print(classification_report(y_test, y_pred_lr))

LR Accuracy: 0.8445
              precision    recall  f1-score   support

           0       0.98      0.84      0.90      1740
           1       0.45      0.91      0.60       260

    accuracy                           0.84      2000
   macro avg       0.72      0.87      0.75      2000
weighted avg       0.91      0.84      0.86      2000



In [16]:
# Random-forest model: 100 trees capped at depth 10, with 'balanced' class
# weights and a fixed seed so the fit is repeatable.
rf = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    class_weight='balanced',
    random_state=42,
)
rf.fit(X_train_scaled, y_train)

# Score for label 1 (second probability column) and hard class predictions
# on the held-out rows.
y_prob_rf = rf.predict_proba(X_test_scaled)[:, 1]
y_pred_rf = rf.predict(X_test_scaled)

print("RF Accuracy:", accuracy_score(y_test, y_pred_rf))
print(classification_report(y_test, y_pred_rf))

RF Accuracy: 0.893
              precision    recall  f1-score   support

           0       0.96      0.91      0.94      1740
           1       0.56      0.78      0.65       260

    accuracy                           0.89      2000
   macro avg       0.76      0.84      0.80      2000
weighted avg       0.91      0.89      0.90      2000



In [17]:
# Gradient-boosted trees. The positive class is up-weighted by the
# negative-to-positive ratio of the training labels; the counts are taken as
# plain Python ints so the ratio is an ordinary float.
n_negative = int((y_train == 0).sum())
n_positive = int((y_train == 1).sum())

xgb = XGBClassifier(
    n_estimators=100,
    max_depth=6,
    learning_rate=0.1,
    random_state=42,
    scale_pos_weight=n_negative / n_positive,
)
xgb.fit(X_train_scaled, y_train)

# Score for label 1 (second probability column) and hard class predictions
# on the held-out rows.
y_prob_xgb = xgb.predict_proba(X_test_scaled)[:, 1]
y_pred_xgb = xgb.predict(X_test_scaled)

print("XGB Accuracy:", accuracy_score(y_test, y_pred_xgb))
print(classification_report(y_test, y_pred_xgb))

XGB Accuracy: 0.8645
              precision    recall  f1-score   support

           0       0.97      0.87      0.92      1740
           1       0.49      0.84      0.62       260

    accuracy                           0.86      2000
   macro avg       0.73      0.85      0.77      2000
weighted avg       0.91      0.86      0.88      2000

