In [87]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from pathlib import Path

from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import cohen_kappa_score, make_scorer

from sklearn.utils.testing import ignore_warnings
from sklearn.exceptions import ConvergenceWarning

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

pd.set_option('display.max_columns', None, 'display.max_rows', None)

In [85]:
imagepath = Path('../images')
datapath = Path('../data')
full = 'cleaned_full_1.csv'
std_feats = 'high_std_feats.csv'

In [31]:
df = pd.read_csv(datapath/full)

In [45]:
# Lists of individual sections

moa = [c for c in df.columns if 'moa' in c]
idea = [c for c in df.columns if 'idea' in c]
swb = [c for c in df.columns if 'swb' in c]
mindful = [c for c in df.columns if 'mindful' in c]
belong = [c for c in df.columns if 'belong' in c]
efficacy = [c for c in df.columns if 'efficacy' in c]
support = [c for c in df.columns if 'support' in c]
transgres = [c for c in df.columns if 'transgres' in c]
exploit = [c for c in df.columns if 'exploit' in c]
stress = [c for c in df.columns if 'stress' in c]
marriage = [c for c in df.columns if 'marriage' in c]

# demographics
socmedia = [c for c in df.columns if 'socmedia' in c]
usdream = [c for c in df.columns if 'usdream' in c]
demo = [c for c in df.columns if 'demo' in c]
disability = [c for c in df.columns if 'disability' in c]
phys = [c for c in df.columns if 'phys' in c]

# Cols to use as features
skip = ['bias', 'duration']
mind_feats = [c for c in mindful if not any(s in c for s in skip)]
sup_feats = [c for c in support if not any(s in c for s in skip)]

sibs = [c for c in demo if 'sibs' in c]
parents = [c for c in demo if 'parents' in c]
fam_feats = parents+sibs

feat_lists = [mind_feats, sup_feats, fam_feats]

# Cols to use as targets
targets = [c for c in df.columns if 'target' in c] + ['belong_now', 'demo_politics']


In [70]:
df[targets].head(2)

Unnamed: 0,swb_target,efficacy_target,phys_target,stress_target,belong_now,demo_politics
0,neutral,very_high,med,high,4.0,2.0
1,neutral,very_high,med,high,4.0,1.0


In [88]:
kappa_scorer = make_scorer(cohen_kappa_score)

In [127]:
@ignore_warnings(category=ConvergenceWarning)
def logreg_tests(df, feat_lists, targets):
    for feat_list in feat_lists:
        f = feat_list[0]
        for target in targets:
            X = StandardScaler().fit_transform(df[feat_list])
            y = df[target].values

            model = LogisticRegression(solver='lbfgs', multi_class='auto')
            score = cross_val_score(model, X, y, cv=5, scoring=kappa_scorer, n_jobs=-1)
            score = np.mean(score)
            
            if score >= 0.2:
                print(f'{f} predicting {target}\nkappa score: {score}\n')

In [128]:
logreg_tests(df, feat_lists, targets)



support_1 predicting swb_target
kappa score: 0.28369913477926445





In [120]:
def rf_tests(df, feat_lists, targets):
    for feat_list in feat_lists:
        f = feat_list[0]
        for target in targets:
            X = df[feat_list].values
            y = df[target].values

            model = RandomForestClassifier(n_estimators=100)
            score = cross_val_score(model, X, y, cv=5, scoring=kappa_scorer, n_jobs=-1)
            score = np.mean(score)
            
            if score >= 0.2:
                print(f'{f} predicting {target}\nkappa score: {score}\n')

In [121]:
rf_tests(df, feat_lists, targets)



support_1 predicting swb_target
kappa score: 0.21183033262782466





In [122]:
def gb_tests(df, feat_lists, targets):
    for feat_list in feat_lists:
        f = feat_list[0]
        for target in targets:
            X = df[feat_list].values
            y = df[target].values

            model = GradientBoostingClassifier(learning_rate=0.01, n_estimators=100)
            score = cross_val_score(model, X, y, cv=5, scoring=kappa_scorer, n_jobs=-1)
            score = np.mean(score)
            
            if score >= 0.2:
                print(f'{f} predicting {target}\nkappa score: {score}\n')

In [123]:
gb_tests(df, feat_lists, targets)

mindful_1 predicting efficacy_target
kappa score: 0.22552969963516972





support_1 predicting swb_target
kappa score: 0.26775235724295954





In [108]:
std_df = pd.read_csv(datapath/std_feats)

In [115]:
high_std = [c for c in std_df.columns if 'swb' not in c]

In [124]:
logreg_tests(df, [high_std], targets)

moa1#1_4 predicting efficacy_target
kappa score: 0.20505141939339921



