#### Modeling the YRBSS: Target Q 58; Sex behavior

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
plt.style.use('ggplot')
%matplotlib inline
import seaborn as sns

from sklearn.model_selection import train_test_split, KFold
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from sklearn import model_selection 
from sklearn.utils import class_weight
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, f1_score, recall_score

# from scipy.stats import 
# import statsmodels.api as sm

In [2]:
sex = pd.read_csv('data/sex.csv')

In [3]:
sex.q58.unique()

array([0., 1.])

In [4]:
sex.drop(columns='sitename', inplace=True)

In [5]:
sex.q58.value_counts(normalize=True)

0.0    0.623655
1.0    0.376345
Name: q58, dtype: float64

In [33]:
y = sex.pop('q58')

In [34]:
X = sex.copy()

In [35]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [36]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((188784, 27), (47196, 27), (188784,), (47196,))

In [37]:
y_train.sum() / len(X_train), y_test.sum() / len(X_test)

(0.3763454530044919, 0.3763454530044919)

Source for the run_experiments code: https://towardsdatascience.com/quickly-test-multiple-models-a98477476f0

In [38]:
def run_experiments(X_train: pd.DataFrame, y_train: pd.DataFrame, 
                   X_test: pd.DataFrame, y_test: pd.DataFrame) -> pd.DataFrame: 
    dfs = []
    models = [
        ('LogReg', LogisticRegression()),
        ('RF', RandomForestClassifier()), 
        ('KNN', KNeighborsClassifier()), 
        ('SVM', SVC()), 
        ('GNB', GaussianNB()), 
        ('XGB', XGBClassifier())
    ]
    results = []
    names = []
    scoring = ['accuracy', 'precision_weighted', 'recall_weighted', 
               'f1_weighted', 'roc_auc']
    target_names = ['Never Had Sex', 'Have Had Sex']
    for name, model in models: 
        kfold = model_selection.KFold(n_splits=5, shuffle=True, random_state=42)
        cv_results = model_selection.cross_validate(model, X_train, y_train, cv=kfold, scoring=scoring)
        clf = model.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        print(name)
        print(classification_report(y_test, y_pred, 
                                   target_names = target_names))
        results.append(cv_results)
        this_df = pd.DataFrame(cv_results)
        this_df['model'] = name
        dfs.append(this_df)
    
    final = pd.concat(dfs, ignore_index=True)
    return final

DO NOT RERUN NEXT CELL

In [39]:
# took a looooooong time to run, don't rerun :) 

run_experiments(X_train, y_train, X_test, y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

LogReg
               precision    recall  f1-score   support

Never Had Sex       0.76      0.86      0.81     29434
 Have Had Sex       0.71      0.56      0.63     17762

     accuracy                           0.75     47196
    macro avg       0.74      0.71      0.72     47196
 weighted avg       0.75      0.75      0.74     47196

RF
               precision    recall  f1-score   support

Never Had Sex       0.85      0.88      0.87     29434
 Have Had Sex       0.79      0.75      0.77     17762

     accuracy                           0.83     47196
    macro avg       0.82      0.81      0.82     47196
 weighted avg       0.83      0.83      0.83     47196

KNN
               precision    recall  f1-score   support

Never Had Sex       0.77      0.85      0.81     29434
 Have Had Sex       0.69      0.58      0.63     17762

     accuracy                           0.75     47196
    macro avg       0.73      0.71      0.72     47196
 weighted avg       0.74      0.75      0.7

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


SVM
               precision    recall  f1-score   support

Never Had Sex       0.62      1.00      0.77     29434
 Have Had Sex       0.00      0.00      0.00     17762

     accuracy                           0.62     47196
    macro avg       0.31      0.50      0.38     47196
 weighted avg       0.39      0.62      0.48     47196



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


GNB
               precision    recall  f1-score   support

Never Had Sex       0.71      0.89      0.79     29434
 Have Had Sex       0.70      0.40      0.51     17762

     accuracy                           0.71     47196
    macro avg       0.70      0.65      0.65     47196
 weighted avg       0.71      0.71      0.69     47196

























XGB
               precision    recall  f1-score   support

Never Had Sex       0.80      0.85      0.82     29434
 Have Had Sex       0.72      0.64      0.68     17762

     accuracy                           0.77     47196
    macro avg       0.76      0.75      0.75     47196
 weighted avg       0.77      0.77      0.77     47196



Unnamed: 0,fit_time,score_time,test_accuracy,test_precision_weighted,test_recall_weighted,test_f1_weighted,test_roc_auc,model
0,1.197611,0.078982,0.753211,0.749312,0.753211,0.744835,0.815798,LogReg
1,1.25697,0.072492,0.750377,0.74726,0.750377,0.742554,0.817177,LogReg
2,1.323152,0.07641,0.7548,0.750927,0.7548,0.747452,0.815897,LogReg
3,1.365336,0.070918,0.752576,0.748896,0.752576,0.74479,0.818429,LogReg
4,1.184873,0.071686,0.751907,0.747581,0.751907,0.744372,0.815377,LogReg
5,18.44165,2.263808,0.818206,0.816985,0.818206,0.81739,0.888723,RF
6,19.470484,2.328132,0.81855,0.817093,0.81855,0.817271,0.888333,RF
7,19.132283,2.177924,0.818947,0.817565,0.818947,0.817934,0.888986,RF
8,17.576935,2.067484,0.816882,0.815295,0.816882,0.815563,0.888003,RF
9,17.318675,2.014958,0.815129,0.813811,0.815129,0.814233,0.887621,RF


In [40]:
# Initial inspection looks like Random Forest is best bet... 

In [43]:
final = Out[39]