In [1]:
from ff_custom_scripts import *
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import VarianceThreshold
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import cross_validate
from sklearn.linear_model import LogisticRegression
from sklearn.dummy import DummyClassifier
from sklearn.metrics import mean_squared_error, r2_score, brier_score_loss, f1_score
from sklearn.preprocessing import OrdinalEncoder
from sklearn.linear_model import Lasso
from sklearn.compose import TransformedTargetRegressor
from imblearn.pipeline import Pipeline as imbPipeline
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import FunctionTransformer

from imblearn.under_sampling import RandomUnderSampler

from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')


In [2]:
train, test,leaderboard,holdout = load_files(nanvalues='remove')

alldata = pd.concat([train, test]) # all data available for training in the FF Challenge

print(train.shape, test.shape,leaderboard.shape,holdout.shape)

meta = pd.read_csv('../metadata/metadata.csv', index_col=0)

targets = ['gpa','grit','materialHardship','eviction','layoff','jobTraining']

predictors = {target: list(meta[meta[target] != 0].index) for target in targets}


(1172, 10396) (294, 10396) (530, 10396) (803, 10396)


In [3]:
# train, cv, test = load_files(nanvalues='remove')

# alldata = pd.concat([train, cv, test])

# meta = pd.read_csv('metadata.csv', index_col=0)

# targets = ['gpa','grit','materialHardship','eviction','layoff','jobTraining']

# predictors = {target: list(meta[meta[target] != 0].index) for target in targets}

# print(train.shape, cv.shape, test.shape)

In [8]:
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore')), 
    ('variance', VarianceThreshold(threshold=0.1)),
]
    )

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('variance', VarianceThreshold(threshold=0.1)),
    ])

ordered_transformer = Pipeline(steps=[
        # ('target', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)),
        ('imputer', SimpleImputer(strategy='constant', fill_value=0)),
        ('variance', VarianceThreshold(threshold=0.1)),
    ])


In [10]:
def run_model(X_train,Y_train,target,crossval=5,classifier=False):

    pred = predictors[target]

    X_train = X_train[pred]
    Y_train = Y_train[target]
                        
    numeric_features = cols_per_type(X_train, 'Continuous')
    categorical_features = cols_per_type(X_train, 'Unordered Categorical')
    binary_features = cols_per_type(X_train, 'Binary')
    ordered_features = cols_per_type(X_train, 'Ordered Categorical')

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features), 
            ('cat', categorical_transformer, categorical_features),
            ('bin', ordered_transformer, binary_features),
            ('ord', ordered_transformer, ordered_features),
            ]
    )

    if classifier:
        
        logreg = LogisticRegression(penalty='l1',solver='liblinear',max_iter=10000)

        classifier = imbPipeline([('preprocessor', preprocessor),
                                     ('smote', SMOTE(random_state=42)),
                                    ('classifier', logreg),])

               
        scores = cross_validate(classifier, X_train, Y_train, cv=crossval, scoring='neg_brier_score',return_estimator=True)

        dummy = DummyClassifier(strategy='most_frequent')
        dummy.fit(X_train, Y_train)  
        Y_pred = dummy.predict(X_train)
        baseline_brier = brier_score_loss(Y_train, Y_pred)

        classifier.fit(X_train, Y_train)

        # # get dictionary of coefficients and features
        # coefs = {}
        
        # for i, feature in enumerate(X_train.columns):
        #     if classifier[-1].coef_[0][i] != 0:
        #         coefs[feature] = classifier[-1].coef_[0][i]
                
        return scores['test_score'], baseline_brier#, coefs

    else:
        
        lasso = Lasso(alpha=1000.0,max_iter=10000)

        regressor = Pipeline(steps=[('preprocessor', preprocessor),        
                                    ('regressor', lasso),])

        model = TransformedTargetRegressor(regressor=regressor, transformer=StandardScaler())

        scores = {'mse': 'neg_mean_squared_error',
                'r2': 'r2'}
        
        scores = cross_validate(model, X_train, Y_train, cv=crossval, scoring=scores,return_estimator=True)
        

        mean = np.mean(Y_train)
        Y_pred = np.full(Y_train.shape, mean)
        baseline_mse = mean_squared_error(Y_train, Y_pred)
        baseline_r2 = r2_score(Y_train, Y_pred)

        model.fit(X_train, Y_train)

        # coefs = {}

        # for i, feature in enumerate(X_train.columns):
        #     if model.regressor_[-1].coef_[i] != 0:
        #         coefs[feature] = model.regressor_[-1].coef_[i]

        # return scores['test_mse'], scores['test_r2'], baseline_mse, baseline_r2#, coefs
        return model

# run_model(X,Y,'eviction',crossval=5,classifier=True)


In [None]:
run_model(train,train,'eviction',crossval=5,classifier=True)

In [6]:
def benchmark():
    ranks = []

    for t in ['gpa','grit','materialHardship']:
        entry = {}
        print(f'Running {t}')
        mse, r2, baseline_mse, baseline_r2 = run_model(X,Y,t)
        entry['target'] = t
        entry['score'] = mse.mean() * -1
        entry['r2'] = r2.mean()
        entry['score_std'] = mse.std()
        entry['r2_std'] = r2.std()
        entry['baseline_score'] = baseline_mse
        entry['baseline_r2'] = baseline_r2
        # entry['coefs'] = coefs
        ranks.append(entry)

    for t in ['eviction','layoff','jobTraining']:
        entry = {}
        print(f'Running {t}')
        brier, baseline_brier = run_model(X,Y,t,classifier=True)
        entry['target'] = t
        entry['score'] = brier.mean() * -1
        entry['score_std'] = brier.std()
        entry['baseline_score'] = baseline_brier
        # entry['coefs'] = coefs
        ranks.append(entry)

    ranks = pd.DataFrame(ranks)

    return ranks

In [7]:
# removing nan values
board = benchmark()

Running gpa


NameError: name 'X' is not defined

In [None]:
board

Unnamed: 0,target,score,r2,score_std,r2_std,baseline_score,baseline_r2
0,gpa,0.394977,0.077777,0.033676,0.021614,0.428469,0.0
1,grit,0.2246,-0.00482,0.025785,0.011696,0.224958,0.0
2,materialHardship,0.021594,0.063545,0.002133,0.030413,0.023115,0.0
3,eviction,0.135518,,0.019218,,0.060158,
4,layoff,0.313445,,0.027306,,0.198225,
5,jobTraining,0.357322,,0.016957,,0.263314,


In [None]:
board.to_csv('benchmark_lasso.csv')

# Explore coefficients

In [None]:
def getfeat_name(feat):
    name = meta[meta.index == feat].varlab.values[0]
    return name

def get_top_features(df, n=10):
    top_features = []
    for col in df.columns:
        entry = {}
        idvars = df[col].sort_values(ascending=False)[:n].index.tolist()
        vars = [getfeat_name(var) for var in idvars]
        entry['target'] = col
        entry['id'] = idvars
        entry['feature'] = vars
        top_features.append(entry)
    
    return pd.DataFrame(top_features)


In [None]:
coefs = pd.DataFrame(board['coefs'].apply(pd.Series)).set_index(board.target).T

In [None]:
# show pandas in full column widt
pd.set_option('display.max_colwidth', None)
get_top_features(coefs).explode(['feature','id'])

Unnamed: 0,target,id,feature
0,gpa,m2a8d,Int Chk: Are mother and father currently romantically involved?
0,gpa,f2b6,# of times since birth child been to health care professional for well-visit?
0,gpa,m1b10a7,"Reason no plan to marry, Relationship reasons ?"
0,gpa,f2l6,Do you or wife/partner own car/truck/van?
0,gpa,f1b16,"During bio mom preg, did you give her money or buy things for the baby/ies?"
0,gpa,f1h2,Were you born in US?
0,gpa,f1b9a2,"Reasons no plan to live together, Timing problem?"
0,gpa,m1b6a,When you and bio dad together how often did you disagree about money?
0,gpa,m1d2g,"How imp for successful marriage, both emotionally mature?"
0,gpa,f1b9a5,"Reasons no plan to live together, Incarceration?"
