In [1]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from xgboost import XGBClassifier, XGBRegressor
from skopt import BayesSearchCV
import skopt.space as space
from sklearn.pipeline import Pipeline

from ff_custom_scripts import *



In [2]:
train, test,leaderboard,holdout = load_files(nanvalues='remove')

alldata = pd.concat([train, test]) # all data available for training in the FF Challenge

print(train.shape, test.shape,leaderboard.shape,holdout.shape)

(1172, 1437) (294, 1437) (530, 1437) (1111, 1437)


# Model

In [3]:
def run_model(train, target='gpa', classifier=False):

    X_train, y_train = prepare_data(train, target)
    
    categorical_transformer = OneHotEncoder(handle_unknown='ignore')

    numerical_features = cols_per_type(X_train, 'Continuous')
    categorical_features = cols_per_type(X_train, 'Categorical')
    binary_features = cols_per_type(X_train, 'Binary')
    ordinal_features = cols_per_type(X_train, 'Ordinal')
    
    preprocessor = make_column_transformer(
        (categorical_transformer, categorical_features),
        ('passthrough', numerical_features),
        ('passthrough', ordinal_features),
        ('passthrough', binary_features)
    )
    
    search_space = {
            'regressor__n_estimators': space.Integer(100, 1000),
            'regressor__learning_rate': space.Real(0.01, 0.05),
            'regressor__max_depth': space.Integer(2, 5),
            'regressor__subsample': space.Real(0.4, 0.8),
            'regressor__colsample_bytree': space.Real(0.4, 0.8),
        }
    
    if classifier:
        score = 'neg_brier_score'
        xgboost = XGBClassifier(n_jobs=-1, eval_metric='logloss', objective='binary:logistic')
        search_space.update({'regressor__scale_pos_weight': space.Integer(2, 17)})
           
    else:
        xgboost = XGBRegressor(n_jobs=-1, eval_metric='rmse')
        score = 'neg_mean_squared_error'
    
        
    pipes = Pipeline(steps=[('preprocessor', preprocessor),
                            ('regressor', xgboost)])
                            
    model = BayesSearchCV(
        pipes,
        search_space,
        n_iter=10,
        cv=5,
        n_jobs=-1,
        scoring=score,
        refit=True,
        verbose=0,
        random_state=32
    )
    
    model.fit(X_train, y_train)
    
    return model


# GPA

In [4]:
gpa_model = run_model(train,target='gpa', classifier=False)

In [5]:
score_model(gpa_model,'gpa',test,leaderboard,holdout=None,classifier=False)

Metric: mean_squared_error
Best CV score: 0.4059
Standard deviation of CV scores: 0.0411
Mean CV score: 0.4184
Test MSE: 0.4123
Test R2: 0.0884
Leaderboard MSE: 0.3890
Leaderboard R2: 0.0041


# Material Hardship

In [7]:
model_materialHardship = run_model(train,target='materialHardship', classifier=False)

In [8]:
model_materialHardship.best_params_

OrderedDict([('regressor__colsample_bytree', 0.6138721865203991),
             ('regressor__learning_rate', 0.026903038203804172),
             ('regressor__max_depth', 3),
             ('regressor__n_estimators', 282),
             ('regressor__subsample', 0.7385616673608244)])

In [9]:
score_model(model_materialHardship, target='materialHardship', test=test, leaderboard=leaderboard, holdout=None, classifier=False)

Metric: mean_squared_error
Best CV score: 0.0214
Standard deviation of CV scores: 0.0033
Mean CV score: 0.0219
Test MSE: 0.0204
Test R2: 0.1908
Leaderboard MSE: 0.0252
Leaderboard R2: 0.1177


# Grit


In [11]:
model_grit = run_model(train,target='grit', classifier=False)

In [12]:
model_grit.best_params_

OrderedDict([('regressor__colsample_bytree', 0.6138721865203991),
             ('regressor__learning_rate', 0.026903038203804172),
             ('regressor__max_depth', 3),
             ('regressor__n_estimators', 282),
             ('regressor__subsample', 0.7385616673608244)])

In [13]:
score_model(model_grit, target='grit', test=test, leaderboard=leaderboard, holdout=None, classifier=False)

Metric: mean_squared_error
Best CV score: 0.2506
Standard deviation of CV scores: 0.0441
Mean CV score: 0.2648
Test MSE: 0.2242
Test R2: 0.0004
Leaderboard MSE: 0.3231
Leaderboard R2: -0.4703


# Eviction

In [15]:
model_eviction = run_model(train,target='eviction', classifier=True)

In [23]:
model_eviction.best_params_

OrderedDict([('regressor__colsample_bytree', 0.6510334011091358),
             ('regressor__learning_rate', 0.01980905463264343),
             ('regressor__max_depth', 4),
             ('regressor__n_estimators', 772),
             ('regressor__scale_pos_weight', 13),
             ('regressor__subsample', 0.690421136549473)])

In [27]:
score_model(model_eviction, 'eviction', test, leaderboard, holdout=None, classifier=True)

Test Brier: 0.0514
Test F1: 0.0000
Leaderboard Brier: 0.0528
Leaderboard F1: 0.1250


# jobTraining

In [18]:
model_job = run_model(train,target='jobTraining', classifier=True)

In [24]:
model_job.best_params_

OrderedDict([('regressor__colsample_bytree', 0.6573580990269696),
             ('regressor__learning_rate', 0.03839797463244792),
             ('regressor__max_depth', 5),
             ('regressor__n_estimators', 497),
             ('regressor__scale_pos_weight', 8),
             ('regressor__subsample', 0.4089659185057217)])

In [19]:
score_model(model_job, 'jobTraining', test, leaderboard, holdout=None, classifier=True)

Test Brier: 0.2842
Test F1: 0.2906
Leaderboard Brier: 0.3340
Leaderboard F1: 0.1690


# Layoff

In [20]:
model_layoff = run_model(train, target='layoff', classifier=True)

In [21]:
model_layoff.best_params_

OrderedDict([('regressor__colsample_bytree', 0.6573580990269696),
             ('regressor__learning_rate', 0.03839797463244792),
             ('regressor__max_depth', 5),
             ('regressor__n_estimators', 497),
             ('regressor__scale_pos_weight', 8),
             ('regressor__subsample', 0.4089659185057217)])

In [26]:
score_model(model_layoff, 'layoff', test, leaderboard, holdout=None, classifier=True)

Test Brier: 0.2311
Test F1: 0.2368
Leaderboard Brier: 0.2830
Leaderboard F1: 0.1071
