In [1]:
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer, OrdinalEncoder, StandardScaler
from sklearn.compose import ColumnTransformer, make_column_transformer, TransformedTargetRegressor
from sklearn.feature_selection import VarianceThreshold
from sklearn.linear_model import LogisticRegression, Lasso
from sklearn.dummy import DummyClassifier
from sklearn.metrics import mean_squared_error, r2_score, brier_score_loss, f1_score
from sklearn.model_selection import cross_validate
from xgboost import XGBClassifier, XGBRegressor
import shap
from skopt import BayesSearchCV
import skopt.space as space
import skopt.plots as plots
import imblearn
from imblearn.pipeline import Pipeline as imbPipeline
from imblearn.over_sampling import SMOTE
from ff_custom_scripts import *
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectFromModel
from sklearn.impute import KNNImputer
from sklearn.impute import SimpleImputer
from sklearn.metrics import f1_score


In [2]:
def score_model(model, target, test, leaderboard, holdout, classifier=False):
    # Get CV score
    score = -model.best_score_
    print(f'Best CV score: {score:.4f}')
    
    # Get mean CV score
    mean_score = -model.cv_results_['mean_test_score'].mean()
    print(f'Mean CV score: {mean_score:.4f}')
    
    # Prepare test data
    X_test, y_test = prepare_data(test, target)

    if classifier:
        # Compute test scores
        y_pred = model.predict(X_test)
        brier = brier_score_loss(y_test, y_pred)
        f1 = f1_score(y_test, y_pred.round())
        
        # Print test scores
        print(f'Test Brier: {brier:.4f}')
        print(f'Test F1: {f1:.4f}')

        # # Compute leaderboard scores
        X_leaderboard, y_leaderboard = prepare_data(leaderboard, target)
        y_pred = model.predict(X_leaderboard)
        brier = brier_score_loss(y_leaderboard, y_pred)
        f1 = f1_score(y_leaderboard, y_pred.round())

        # # Print leaderboard scores
        print(f'Leaderboard Brier: {brier:.4f}')
        print(f'Leaderboard F1: {f1:.4f}')

        # # # Compute holdout scores
        # X_holdout, y_holdout = prepare_data(holdout, target)
        # y_pred = model.predict(X_holdout)
        # y_holdout = y_holdout.astype(int)
        # brier = brier_score_loss(y_holdout, y_pred)
        # print(f'Holdout Brier: {brier:.4f}')
        
        
    else:
        # Compute test scores
        mse = mean_squared_error(y_test, model.predict(X_test))
        rsquared = r2_score(y_test, model.predict(X_test))
        
        # Print test scores
        print(f'Test MSE: {mse:.4f}')
        print(f'Test R2: {rsquared:.4f}')

        # # Compute leaderboard scores
        X_leaderboard, y_leaderboard = prepare_data(leaderboard, target)
        mse = mean_squared_error(y_leaderboard, model.predict(X_leaderboard))
        rsquared = r2_score(y_leaderboard, model.predict(X_leaderboard))

        # Print leaderboard scores
        print(f'Leaderboard MSE: {mse:.4f}')
        print(f'Leaderboard R2: {rsquared:.4f}')

        # # # Compute holdout scores
        # X_holdout, y_holdout = prepare_data(holdout, target)
        # # X_holdout_transformed = model.best_estimator_.named_steps['preprocessor'].transform(X_holdout)
        # mse = mean_squared_error(y_holdout, model.predict(X_holdout))
        # # rsquared = r2_score(y_holdout, model.predict(X_holdout))

        # # Print holdout scores
        # print(f'Holdout MSE: {mse:.4f}')
        # print(f'Holdout R2: {rsquared:.4f}')
        

In [3]:
train, test,leaderboard,holdout = load_files(nanvalues='remove')

alldata = pd.concat([train, test]) # all data available for training in the FF Challenge

print(train.shape, test.shape,leaderboard.shape,holdout.shape)

meta = pd.read_csv('../metadata/metadata.csv', index_col=0)

targets = ['gpa','grit','materialHardship','eviction','layoff','jobTraining']

predictors = {target: list(meta[meta[target] != 0].index) for target in targets}


(1172, 10396) (294, 10396) (530, 10396) (803, 10396)


# Model

In [4]:
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore')), 
]
    )

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ])

ordered_transformer = Pipeline(steps=[
        ('target', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1,encoded_missing_value=-1)),
    ])


# Model

In [5]:
def run_model(train, target='gpa', classifier=False):
    X_train, y_train = prepare_data(train, target)

    categorical_transformer = OneHotEncoder(handle_unknown='ignore')
    numerical_features = cols_per_type(X_train, 'Continuous')
    categorical_features = cols_per_type(X_train, 'Categorical')
    binary_features = cols_per_type(X_train, 'Binary')
    ordinal_features = cols_per_type(X_train, 'Ordinal')

    numerical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())])
    
    preprocessor = make_column_transformer(
        (categorical_transformer, categorical_features),
        (numerical_transformer, numerical_features),
        (ordered_transformer, ordinal_features),
        (ordered_transformer, binary_features)
    )

    search_space = {
        'regressor__max_iter': space.Integer(8_000, 20_000),
    }
    
    if classifier:
        model = LogisticRegression(penalty='l2')
        score = 'neg_brier_score'
        pipes = imbPipeline(steps=[('preprocessor', preprocessor),
                                      ('smote', SMOTE(random_state=42)),
                            ('regressor', model)])
           
    else:
        model = Lasso()
        score = 'neg_mean_squared_error'
        search_space.update({
            'regressor__alpha': space.Real(800, 1500),
        })

        pipes = Pipeline(steps=[('preprocessor', preprocessor),
                            ('regressor', model)])
    
    
                                           
    model = BayesSearchCV(
        pipes,
        search_space,
        n_iter=10,
        cv=5,
        n_jobs=-1,
        scoring=score,
        refit=True,
        verbose=0,
        random_state=42
    )


    model.fit(X_train, y_train)
    
    return model

# GPA

In [6]:
gpa_model = run_model(train,target='gpa', classifier=False)

In [7]:
score_model(gpa_model,'gpa',test,leaderboard,holdout,classifier=False)

Best CV score: 0.4355
Mean CV score: 0.4355
Test MSE: 0.4802
Test R2: -0.0071
Leaderboard MSE: 0.3939
Leaderboard R2: -0.0084


# Material Hardship

In [8]:
model_materialHardship = run_model(train,target='materialHardship', classifier=False)

In [9]:
model_materialHardship.best_params_

OrderedDict([('regressor__alpha', 1087.0727711973198),
             ('regressor__max_iter', 16733)])

In [10]:
score_model(model_materialHardship, target='materialHardship', test=test, leaderboard=leaderboard, holdout=holdout, classifier=False)

Best CV score: 0.0240
Mean CV score: 0.0240
Test MSE: 0.0253
Test R2: -0.0000
Leaderboard MSE: 0.0288
Leaderboard R2: -0.0070


# Grit


In [11]:
model_grit = run_model(train,target='grit', classifier=False)

In [12]:
model_grit.best_params_

OrderedDict([('regressor__alpha', 1087.0727711973198),
             ('regressor__max_iter', 16733)])

In [13]:
score_model(model_grit, target='grit', test=test, leaderboard=leaderboard, holdout=holdout, classifier=False)

Best CV score: 0.2363
Mean CV score: 0.2363
Test MSE: 0.2321
Test R2: -0.0047
Leaderboard MSE: 0.2202
Leaderboard R2: -0.0022


# Eviction

In [14]:
model_eviction = run_model(train,target='eviction', classifier=True)

In [15]:
score_model(model_eviction,'eviction', test, leaderboard, holdout,classifier=True)

Best CV score: 0.0739
Mean CV score: 0.0739
Test Brier: 0.0856
Test F1: 0.0741
Leaderboard Brier: 0.0925
Leaderboard F1: 0.0755


# jobTraining

In [16]:
model_job = run_model(train,target='jobTraining', classifier=True)

In [17]:
score_model(model_job, 'jobTraining', test, leaderboard, holdout,classifier=True)

Best CV score: 0.3105
Mean CV score: 0.3105
Test Brier: 0.3219
Test F1: 0.2769
Leaderboard Brier: 0.3660
Leaderboard F1: 0.2240


# Layoff

In [18]:
model_layoff = run_model(train, target='layoff', classifier=True)

In [19]:
model_layoff.best_params_

OrderedDict([('regressor__max_iter', 12921)])

In [20]:
score_model(model_layoff, 'layoff', test, leaderboard, holdout, classifier=True)

Best CV score: 0.2641
Mean CV score: 0.2641
Test Brier: 0.3566
Test F1: 0.1930
Leaderboard Brier: 0.3151
Leaderboard F1: 0.2707
