In [1]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from skopt import BayesSearchCV
import skopt.space as space
from lightgbm import LGBMClassifier, LGBMRegressor
from sklearn.pipeline import Pipeline


In [2]:
from ff_custom_scripts import *

train, test,leaderboard,holdout = load_files(nanvalues='remove')

alldata = pd.concat([train, test]) # all data available for training in the FF Challenge

print(train.shape, test.shape,leaderboard.shape,holdout.shape)

(1172, 1437) (294, 1437) (530, 1437) (1111, 1437)


# Model

In [3]:
def run_model(train, target='gpa', classifier=False):

    X_train, y_train = prepare_data(train, target)
    print(X_train.shape, y_train.shape)
    
    print(X_train.shape, y_train.shape)

    categorical_transformer = OneHotEncoder(handle_unknown='ignore')

    numerical_features = cols_per_type(X_train, 'Continuous')
    print(f'N. of numerical features: {len(numerical_features)}')
    categorical_features = cols_per_type(X_train, 'Ordered Categorical')
    print(f'N. of categorical features: {len(categorical_features)}')
    binary_features = cols_per_type(X_train, 'Binary')
    print(f'N. of binary features: {len(binary_features)}')
    ordinal_features = cols_per_type(X_train, 'Unordered Categorical')
    print(f'N. of ordinal features: {len(ordinal_features)}')

    preprocessor = make_column_transformer(
        (categorical_transformer, categorical_features),
        ('passthrough', numerical_features),
        ('passthrough', ordinal_features),
        ('passthrough', binary_features)
    )

    search_space = {
        'regressor__learning_rate': space.Real(0.01, 0.06),
        'regressor__max_depth': space.Integer(5, 10),
    }

    if classifier:
        score = 'neg_brier_score'
        lgbm = LGBMClassifier()
        search_space.update({'regressor__scale_pos_weight': space.Integer(2, 17)})
    else:
        lgbm = LGBMRegressor()
        score = 'neg_mean_squared_error'

    pipes = Pipeline(steps=[('preprocessor', preprocessor),
                            ('regressor', lgbm)])

    model = BayesSearchCV(
        pipes,
        search_space,
        n_iter=10,
        cv=5,
        n_jobs=-1,
        scoring=score,
        refit=True,
        verbose=0,
        random_state=41
    )

    # ytrain shape
    model.fit(X_train, y_train)

    return model


# GPA

In [4]:
gpa_model = run_model(train,target='gpa', classifier=False)

(929, 1431) (929,)
(929, 1431) (929,)
N. of numerical features: 135
N. of categorical features: 469
N. of binary features: 692
N. of ordinal features: 135


In [5]:
score_model(gpa_model,'gpa',test,leaderboard,holdout=None,classifier=False)

Metric: mean_squared_error
Best CV score: 0.3986
Standard deviation of CV scores: 0.0332
Mean CV score: 0.4077
Test MSE: 0.3974
Test R2: 0.1212
Leaderboard MSE: 0.3952
Leaderboard R2: -0.0117


In [6]:
# shap_show(gpa_model, alldata, target='gpa')

# Material Hardship

In [7]:
model_materialHardship = run_model(train,target='materialHardship', classifier=False)

(1167, 1431) (1167,)
(1167, 1431) (1167,)
N. of numerical features: 135
N. of categorical features: 469
N. of binary features: 692
N. of ordinal features: 135


In [8]:
model_materialHardship.best_params_

OrderedDict([('regressor__learning_rate', 0.022288208566756673),
             ('regressor__max_depth', 7)])

In [9]:
score_model(model_materialHardship, target='materialHardship', test=test, leaderboard=leaderboard, holdout=None, classifier=False)

Metric: mean_squared_error
Best CV score: 0.0216
Standard deviation of CV scores: 0.0037
Mean CV score: 0.0219
Test MSE: 0.0208
Test R2: 0.1770
Leaderboard MSE: 0.0248
Leaderboard R2: 0.1317


# Grit


In [11]:
model_grit = run_model(train,target='grit', classifier=False)

(1134, 1431) (1134,)
(1134, 1431) (1134,)
N. of numerical features: 135
N. of categorical features: 469
N. of binary features: 692
N. of ordinal features: 135


In [12]:
model_grit.best_params_

OrderedDict([('regressor__learning_rate', 0.012104671856054891),
             ('regressor__max_depth', 10)])

In [13]:
score_model(model_grit, target='grit', test=test, leaderboard=leaderboard, holdout=None, classifier=False)

Metric: mean_squared_error
Best CV score: 0.2378
Standard deviation of CV scores: 0.0412
Mean CV score: 0.2452
Test MSE: 0.2217
Test R2: 0.0114
Leaderboard MSE: 0.2232
Leaderboard R2: -0.0160


# Eviction

In [14]:
model_eviction = run_model(train,target='eviction', classifier=True)

(1167, 1431) (1167,)
(1167, 1431) (1167,)
N. of numerical features: 135
N. of categorical features: 469
N. of binary features: 692
N. of ordinal features: 135


In [24]:
model_eviction.best_params_

OrderedDict([('regressor__learning_rate', 0.012104671856054891),
             ('regressor__max_depth', 10),
             ('regressor__scale_pos_weight', 12)])

In [15]:
score_model(model_eviction, 'eviction', test, leaderboard, holdout=None, classifier=True)

Test Brier: 0.0514
Test F1: 0.0000
Leaderboard Brier: 0.0566
Leaderboard F1: 0.0000


# jobTraining

In [17]:
model_job = run_model(train,target='jobTraining', classifier=True)

(1169, 1431) (1169,)
(1169, 1431) (1169,)
N. of numerical features: 135
N. of categorical features: 469
N. of binary features: 692
N. of ordinal features: 135


In [18]:
score_model(model_job, 'jobTraining', test, leaderboard, holdout=None, classifier=True)

Test Brier: 0.2500
Test F1: 0.3178
Leaderboard Brier: 0.3057
Leaderboard F1: 0.2430


# Layoff

In [20]:
model_layoff = run_model(train, target='layoff', classifier=True)

(1026, 1431) (1026,)
(1026, 1431) (1026,)
N. of numerical features: 135
N. of categorical features: 469
N. of binary features: 692
N. of ordinal features: 135


In [21]:
model_layoff.best_params_

OrderedDict([('regressor__learning_rate', 0.022288208566756673),
             ('regressor__max_depth', 7),
             ('regressor__scale_pos_weight', 3)])

In [22]:
score_model(model_layoff, 'layoff', test, leaderboard, holdout=None, classifier=True)

Test Brier: 0.1992
Test F1: 0.0741
Leaderboard Brier: 0.2340
Leaderboard F1: 0.0462
