In [83]:
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer, OrdinalEncoder, StandardScaler
from sklearn.compose import ColumnTransformer, make_column_transformer, TransformedTargetRegressor
from sklearn.feature_selection import VarianceThreshold
from sklearn.linear_model import LogisticRegression, Lasso
from sklearn.dummy import DummyClassifier
from sklearn.metrics import mean_squared_error, r2_score, brier_score_loss, f1_score
from sklearn.model_selection import cross_validate
from xgboost import XGBClassifier, XGBRegressor
import shap
from skopt import BayesSearchCV
import skopt.space as space
import skopt.plots as plots
import imblearn
from imblearn.pipeline import Pipeline as imbPipeline
from imblearn.over_sampling import SMOTE
from ff_custom_scripts import *
# import RandomForestRegressor and selectfrom model
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectFromModel
# import knn imputer
from sklearn.impute import KNNImputer
# from sklearn import SimpleImputer
from sklearn.impute import SimpleImputer
from sklearn.metrics import f1_score


In [84]:
def score_model(model, target, test, leaderboard, holdout, classifier=False):
    # Get CV score
    score = -model.best_score_
    print(f'Best CV score: {score:.4f}')
    
    # Get mean CV score
    mean_score = -model.cv_results_['mean_test_score'].mean()
    print(f'Mean CV score: {mean_score:.4f}')
    
    # Prepare test data
    X_test, y_test = prepare_data(test, target)

    if classifier:
        # Compute test scores
        y_pred = model.predict(X_test)
        brier = brier_score_loss(y_test, y_pred)
        f1 = f1_score(y_test, y_pred.round())
        
        # Print test scores
        print(f'Test Brier: {brier:.4f}')
        print(f'Test F1: {f1:.4f}')

        # # Compute leaderboard scores
        X_leaderboard, y_leaderboard = prepare_data(leaderboard, target)
        y_pred = model.predict(X_leaderboard)
        brier = brier_score_loss(y_leaderboard, y_pred)
        f1 = f1_score(y_leaderboard, y_pred.round())

        # # Print leaderboard scores
        print(f'Leaderboard Brier: {brier:.4f}')
        print(f'Leaderboard F1: {f1:.4f}')

        # # Compute holdout scores
        X_holdout, y_holdout = prepare_data(holdout, target)
        y_pred = model.predict(X_holdout)
        y_holdout = y_holdout.astype(int)
        brier = brier_score_loss(y_holdout, y_pred)
        print(f'Holdout Brier: {brier:.4f}')
        
        
    else:
        # Compute test scores
        mse = mean_squared_error(y_test, model.predict(X_test))
        rsquared = r2_score(y_test, model.predict(X_test))
        
        # Print test scores
        print(f'Test MSE: {mse:.4f}')
        print(f'Test R2: {rsquared:.4f}')

        # # Compute leaderboard scores
        X_leaderboard, y_leaderboard = prepare_data(leaderboard, target)
        mse = mean_squared_error(y_leaderboard, model.predict(X_leaderboard))
        rsquared = r2_score(y_leaderboard, model.predict(X_leaderboard))

        # Print leaderboard scores
        print(f'Leaderboard MSE: {mse:.4f}')
        print(f'Leaderboard R2: {rsquared:.4f}')

        # # Compute holdout scores
        X_holdout, y_holdout = prepare_data(holdout, target)
        # X_holdout_transformed = model.best_estimator_.named_steps['preprocessor'].transform(X_holdout)
        mse = mean_squared_error(y_holdout, model.predict(X_holdout))
        rsquared = r2_score(y_holdout, model.predict(X_holdout))

        # Print holdout scores
        print(f'Holdout MSE: {mse:.4f}')
        print(f'Holdout R2: {rsquared:.4f}')
        

In [85]:
train, test,leaderboard,holdout = load_files(nanvalues='remove')

alldata = pd.concat([train, test]) # all data available for training in the FF Challenge

print(train.shape, test.shape,leaderboard.shape,holdout.shape)

meta = pd.read_csv('../metadata/metadata.csv', index_col=0)


(1172, 10396) (294, 10396) (530, 10396) (803, 10396)


In [86]:
prepare_data(holdout,'layoff')[1]

challengeID
4       False
11       True
15      False
17       True
29      False
        ...  
4223    False
4226    False
4228    False
4230    False
4237    False
Name: layoff, Length: 803, dtype: object

In [87]:
meta.head(5)

Unnamed: 0_level_0,varlab,type,one_topic,dtype,gpa,grit,materialHardship,eviction,layoff,jobTraining
new_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
cf1lenhr,What was the total length of interview - Hours,Continuous,paradata_and_weights,float64,0.0,0.009892,0.003785,0.0,0.00781,0.0
cf1lenmin,What was the total length of interview - Minutes,Continuous,paradata_and_weights,float64,0.0,0.0,0.0,0.001409,0.0,0.0
cf1fint,Constructed - Was father interviewed at baseline?,Binary,paradata_and_weights,object,0.021894,0.018859,0.016743,0.028699,0.0,0.0
cf1citsm,Constructed - Baseline city sample flag,Binary,paradata_and_weights,object,0.012485,0.023336,0.011364,0.0,0.01659,0.004293
f1citywt,Father baseline city sample weight (20-cities ...,Continuous,paradata_and_weights,float64,0.0,0.0,0.0,0.009234,0.002574,0.014284


In [88]:
targets = ['gpa','grit','materialHardship','eviction','layoff','jobTraining']

predictors = {target: list(meta[meta[target] != 0].index) for target in targets}

In [89]:
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore')), 
    ('variance', VarianceThreshold(threshold=0.1)),
]
    )

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('variance', VarianceThreshold(threshold=0.1)),
    ])

ordered_transformer = Pipeline(steps=[
        # ('target', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)),
        ('imputer', SimpleImputer(strategy='constant', fill_value=0)),
        ('variance', VarianceThreshold(threshold=0.1)),
    ])


# Model

In [90]:
def run_model(train, target='gpa', classifier=False):
    X_train, y_train = prepare_data(train, target)
    
    categorical_transformer = OneHotEncoder(handle_unknown='ignore')
    numerical_features = cols_per_type(X_train, 'Continuous')
    categorical_features = cols_per_type(X_train, 'Categorical')
    binary_features = cols_per_type(X_train, 'Binary')
    ordinal_features = cols_per_type(X_train, 'Ordinal')

    numerical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())])
    
    preprocessor = make_column_transformer(
        (categorical_transformer, categorical_features),
        (numerical_transformer, numerical_features),
        (ordered_transformer, ordinal_features),
        (ordered_transformer, binary_features)
    )

    search_space = {
        'regressor__max_iter': space.Integer(8_000, 20_000),
    }
    
    if classifier:
        classifier = LogisticRegression(penalty='l2')
        score = 'neg_brier_score'
           
    else:
        classifier = Lasso()
        score = 'neg_mean_squared_error'
        search_space.update({
            'regressor__alpha': space.Real(800, 2000),
        })
    
    pipes = Pipeline(steps=[('preprocessor', preprocessor),
                            ('regressor', classifier)])
                                           
    model = BayesSearchCV(
        pipes,
        search_space,
        n_iter=10,
        cv=5,
        n_jobs=-1,
        scoring=score,
        refit=True,
        verbose=0,
        random_state=42
    )


    # X_cv = preprocessor.fit_transform(X_cv)

    # model.fit(X_train, y_train, regressor__eval_set=[(X_cv, y_cv)], regressor__verbose=True)
    
    model.fit(X_train, y_train)
    
    return model

In [91]:
def shap_show(model, alldata, target, n=5):
    X, y = prepare_data(alldata, target)
    model  = model.best_estimator_.fit(X, y)
    Xtransform = model.named_steps['preprocessor'].transform(X)
    exp = shap.TreeExplainer(model.named_steps['regressor'])
    transformer = model.named_steps['preprocessor']
    names = transformer.get_feature_names_out()
    featnames = [splitfeatname(name) for name in names]
    shap_values = exp.shap_values(Xtransform)
    # get top n features
    top_n_idx = np.argsort(np.abs(shap_values).mean(0))[-n:]
    top_n_feat = [featnames[i] for i in top_n_idx]
    # # get questions
    top_n_vars = [meta[meta.index.isin([feat])].varlab.values for feat in top_n_feat]
    # # reverse order
    top_n_vars = top_n_vars[::-1]
    shap.summary_plot(shap_values, Xtransform, max_display=n, feature_names=featnames)
    return dict(zip(map(tuple, top_n_vars), top_n_feat))


# GPA

In [92]:
gpa_model = run_model(train,target='gpa', classifier=False)

In [93]:
# def score_model(model, target, test, leaderboard, holdout, classifier=False):
#     # Get CV score
#     score = -model.best_score_
#     print(f'Best CV score: {score:.4f}')
    
#     # Get mean CV score
#     mean_score = -model.cv_results_['mean_test_score'].mean()
#     print(f'Mean CV score: {mean_score:.4f}')
    
#     # Prepare test data
#     X_test, y_test = prepare_data(test, target)

#     if classifier:
#         # Compute test scores
#         y_pred = model.predict(X_test)
#         brier = brier_score_loss(y_test, y_pred)
#         f1 = f1_score(y_test, y_pred.round())
        
#         # Print test scores
#         print(f'Test Brier: {brier:.4f}')
#         print(f'Test F1: {f1:.4f}')

#         # # Compute leaderboard scores
#         X_leaderboard, y_leaderboard = prepare_data(leaderboard, target)
#         y_pred = model.predict(X_leaderboard)
#         brier = brier_score_loss(y_leaderboard, y_pred)
#         f1 = f1_score(y_leaderboard, y_pred.round())

#         # # Print leaderboard scores
#         print(f'Leaderboard Brier: {brier:.4f}')
#         print(f'Leaderboard F1: {f1:.4f}')

#         # # Compute holdout scores
#         X_holdout, y_holdout = prepare_data(holdout, target)
#         y_pred = model.predict(X_holdout)
#         brier = brier_score_loss(y_holdout, y_pred)

#         # Print holdout scores
#         print(f'Holdout Brier: {brier:.4f}')
        
#     else:
#         # Compute test scores
#         mse = mean_squared_error(y_test, model.predict(X_test))
#         rsquared = r2_score(y_test, model.predict(X_test))
        
#         # Print test scores
#         print(f'Test MSE: {mse:.4f}')
#         print(f'Test R2: {rsquared:.4f}')

#         # # Compute leaderboard scores
#         X_leaderboard, y_leaderboard = prepare_data(leaderboard, target)
#         mse = mean_squared_error(y_leaderboard, model.predict(X_leaderboard))
#         rsquared = r2_score(y_leaderboard, model.predict(X_leaderboard))

#         # Print leaderboard scores
#         print(f'Leaderboard MSE: {mse:.4f}')
#         print(f'Leaderboard R2: {rsquared:.4f}')

#         # # Compute holdout scores
#         X_holdout, y_holdout = prepare_data(holdout, target)
#         mse = mean_squared_error(y_holdout, model.predict(X_holdout))
#         rsquared = r2_score(y_holdout, model.predict(X_holdout))

#         # Print holdout scores
#         print(f'Holdout MSE: {mse:.4f}')
#         print(f'Holdout R2: {rsquared:.4f}')
        
# score_model(gpa_model,'gpa',test,leaderboard,holdout,classifier=False)

# Material Hardship

In [94]:
model_materialHardship = run_model(train,target='materialHardship', classifier=False)

In [95]:
# OrderedDict([('regressor__gamma', 0.17995534416071732),
#              ('regressor__learning_rate', 0.04942262677968311),
#              ('regressor__max_depth', 8),
#              ('regressor__n_estimators', 1001)])

# OrderedDict([('regressor__gamma', 0.18373883555532844),
#              ('regressor__learning_rati e', 0.07183206941666036),
#              ('regressor__max_depth', 7),
#              ('regressor__min_child_weight', 5),
#              ('regressor__n_estimators', 1105)])

model_materialHardship.best_params_

OrderedDict([('regressor__alpha', 1292.1247506239765),
             ('regressor__max_iter', 16733)])

In [96]:
score_model(model_materialHardship, target='materialHardship', test=test, leaderboard=leaderboard, holdout=holdout, classifier=False)

Best CV score: 0.0240
Mean CV score: 0.0240
Test MSE: 0.0253
Test R2: -0.0000
Leaderboard MSE: 0.0288
Leaderboard R2: -0.0070
Holdout MSE: 0.0244
Holdout R2: -0.0017


# Grit


In [97]:
model_grit = run_model(train,target='grit', classifier=False)

In [98]:
model_grit.best_params_

OrderedDict([('regressor__alpha', 1292.1247506239765),
             ('regressor__max_iter', 16733)])

In [99]:
score_model(model_grit, target='grit', test=test, leaderboard=leaderboard, holdout=holdout, classifier=False)

Best CV score: 0.2363
Mean CV score: 0.2363
Test MSE: 0.2321
Test R2: -0.0047
Leaderboard MSE: 0.2202
Leaderboard R2: -0.0022
Holdout MSE: 0.2394
Holdout R2: -0.0020


In [100]:
meta[meta.index=='m5a8f01']

Unnamed: 0_level_0,varlab,type,one_topic,dtype,gpa,grit,materialHardship,eviction,layoff,jobTraining
new_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
m5a8f01,A8F. Your relationship with the father at biol...,Ordered Categorical,romantic_relationships,category,0.0,0.0,0.026602,0.0,0.0,0.0


# Eviction

In [101]:
model_eviction = run_model(train,target='eviction', classifier=True)

In [102]:
score_model(model_eviction,'eviction', test, leaderboard, holdout,classifier=True)

Best CV score: 0.0721
Mean CV score: 0.0721
Test Brier: 0.0890
Test F1: 0.0714
Leaderboard Brier: 0.1019
Leaderboard F1: 0.0357
Holdout Brier: 0.1071


# jobTraining

In [103]:
model_job = run_model(train,target='jobTraining', classifier=True)

In [104]:
score_model(model_job, 'jobTraining', test, leaderboard, holdout,classifier=True)

Best CV score: 0.3075
Mean CV score: 0.3075
Test Brier: 0.3390
Test F1: 0.2205
Leaderboard Brier: 0.3792
Leaderboard F1: 0.1862
Holdout Brier: 0.3711


# Layoff

In [105]:
model_layoff = run_model(train, target='layoff', classifier=True)

In [106]:
model_layoff.best_params_

OrderedDict([('regressor__max_iter', 12921)])

In [107]:
score_model(model_layoff, 'layoff', test, leaderboard, holdout, classifier=True)

Best CV score: 0.2715
Mean CV score: 0.2715
Test Brier: 0.3605
Test F1: 0.1913
Leaderboard Brier: 0.3264
Leaderboard F1: 0.2700
Holdout Brier: 0.3537
