In [1]:
import pandas as pd
import skopt.space as space
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as imbPipeline
from sklearn.compose import make_column_transformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression, Lasso
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from skopt import BayesSearchCV

In [2]:
# NOTE(review): the wildcard import supplies load_files (and, further down,
# prepare_data / cols_per_type / score_model). Prefer explicit names once the
# full list of helpers this notebook needs is known.
from ff_custom_scripts import *

# Load the four data splits, leaving missing values in place.
train, test, leaderboard, holdout = load_files(nanvalues='keep')

# Variable metadata table, indexed by its first column.
meta = pd.read_csv('../metadata/metadata.csv', index_col=0)

# Quick sanity check of the split sizes.
print(*(split.shape for split in (train, test, leaderboard, holdout)))

(1172, 1437) (294, 1437) (530, 1437) (803, 1437)


# Model

In [3]:
def run_model(train, target='gpa', classifier=False, k=50):
    """Tune and fit a linear model for one outcome using Bayesian search.

    The training frame is split into features and target by ``prepare_data``;
    the feature columns are then grouped by metadata type via
    ``cols_per_type`` and each group is routed through its own
    encoder/imputer before reaching the estimator.

    Parameters
    ----------
    train
        Training data, passed straight to ``prepare_data``.
    target : str
        Name of the outcome to model.
    classifier : bool
        If True, fit ``LogisticRegression`` (L2 penalty) behind SMOTE
        oversampling, scored with ``neg_brier_score``.  If False, fit
        ``Lasso``, scored with ``neg_mean_squared_error``.
    k : int
        Not used by this function; kept so existing calls that pass it
        continue to work.

    Returns
    -------
    BayesSearchCV
        The fitted search object (created with ``refit=True``).
    """
    X_train, y_train = prepare_data(train, target)
    print(f'{X_train.shape[1]} features selected for {target}')

    # Column groups, named after the metadata type they are selected by so
    # that the printed counts describe what was actually counted.
    numerical_features = cols_per_type(X_train, 'Continuous')
    print(f'Total number of numerical features: {len(numerical_features)}')
    ordered_features = cols_per_type(X_train, 'Ordered Categorical')
    print(f'Total number of ordered categorical features: {len(ordered_features)}')
    binary_features = cols_per_type(X_train, 'Binary')
    print(f'Total number of binary features: {len(binary_features)}')
    unordered_features = cols_per_type(X_train, 'Unordered Categorical')
    print(f'Total number of unordered categorical features: {len(unordered_features)}')

    # Encoders shared by both model types.
    one_hot = OneHotEncoder(handle_unknown='ignore')
    integer_codes = Pipeline(steps=[
        ('target', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)),
    ])

    # Continuous columns are mean-imputed; only the classifier standardises them.
    numeric_steps = [('imputer', SimpleImputer(strategy='mean'))]
    if classifier:
        numeric_steps.append(('scaler', StandardScaler()))
    numeric_transformer = Pipeline(steps=numeric_steps)

    # NOTE(review): for the classifier the *unordered* categorical columns are
    # integer-coded while the *ordered* ones are one-hot encoded, which looks
    # inverted. The routing is kept exactly as before because changing it
    # alters every classifier result -- confirm the intent before swapping.
    unordered_encoder = integer_codes if classifier else one_hot

    preprocessor = make_column_transformer(
        (one_hot, ordered_features),
        (numeric_transformer, numerical_features),
        (unordered_encoder, unordered_features),
        (integer_codes, binary_features),
    )

    # NOTE(review): max_iter is the only dimension searched for the
    # classifier; it is a solver iteration budget, so the ten search
    # iterations may not be exploring anything meaningful -- TODO confirm.
    search_space = {
        'regressor__max_iter': space.Integer(10_000, 20_000),
    }

    if classifier:
        score = 'neg_brier_score'
        # The imblearn pipeline is required to host the SMOTE resampling step.
        pipes = imbPipeline(steps=[
            ('preprocessor', preprocessor),
            ('smote', SMOTE(random_state=24)),
            ('regressor', LogisticRegression(penalty='l2')),
        ])
    else:
        score = 'neg_mean_squared_error'
        # NOTE(review): alpha in [800, 1500] is a very strong L1 penalty and
        # may explain the near-zero R2 values this notebook reports -- verify.
        search_space['regressor__alpha'] = space.Real(800, 1500)
        pipes = Pipeline(steps=[
            ('preprocessor', preprocessor),
            ('regressor', Lasso()),
        ])

    search = BayesSearchCV(
        pipes,
        search_space,
        n_iter=10,
        cv=5,
        n_jobs=-1,
        scoring=score,
        refit=True,
        verbose=0,
        random_state=41,
    )
    search.fit(X_train, y_train)

    return search


# GPA

In [4]:
# Regression search (classifier=False) for the GPA outcome.
gpa_model = run_model(train, target='gpa', classifier=False)

1431 features selected for gpa
Total number of numerical features: 135
Total number of categorical features: 469
Total number of binary features: 692
Total number of ordinal features: 135


In [5]:
# Report GPA scores on the test and leaderboard splits (holdout skipped).
score_model(
    gpa_model,
    target='gpa',
    test=test,
    leaderboard=leaderboard,
    holdout=None,
    classifier=False,
)

Metric: mean_squared_error
Best CV score: 0.3972
Standard deviation of CV scores: 0.0267
Mean CV score: 0.3983
Test MSE: 0.4291
Test R2: 0.0572
Leaderboard MSE: 0.4308
Leaderboard R2: -0.1028


# Material Hardship

In [6]:
# Regression search (classifier=False) for the material hardship outcome.
model_materialHardship = run_model(train, target='materialHardship', classifier=False)

1431 features selected for materialHardship
Total number of numerical features: 135
Total number of categorical features: 469
Total number of binary features: 692
Total number of ordinal features: 135


In [7]:
# Report material hardship scores; this is the one call that also passes the holdout split.
score_model(
    model_materialHardship,
    target='materialHardship',
    test=test,
    leaderboard=leaderboard,
    holdout=holdout,
    classifier=False,
)

Metric: mean_squared_error
Best CV score: 0.0228
Standard deviation of CV scores: 0.0014
Mean CV score: 0.0233
Test MSE: 0.0264
Test R2: 0.0335
Leaderboard MSE: 0.0290
Leaderboard R2: -0.0141
Holdout MSE: 0.0246
Holdout R2: -0.0114


# Grit


In [8]:
# Regression search (classifier=False) for the grit outcome.
model_grit = run_model(train, target='grit', classifier=False)

1431 features selected for grit
Total number of numerical features: 135
Total number of categorical features: 469
Total number of binary features: 692
Total number of ordinal features: 135


In [9]:
# Hyperparameters selected by the search for the grit model.
model_grit.best_params_

OrderedDict([('regressor__alpha', 829.4654059847685),
             ('regressor__max_iter', 19784)])

In [10]:
# Report grit scores on the test and leaderboard splits (holdout skipped).
score_model(
    model_grit,
    target='grit',
    test=test,
    leaderboard=leaderboard,
    holdout=None,
    classifier=False,
)

Metric: mean_squared_error
Best CV score: 0.2328
Standard deviation of CV scores: 0.0277
Mean CV score: 0.2334
Test MSE: 0.2335
Test R2: 0.0085
Leaderboard MSE: 0.2234
Leaderboard R2: -0.0168


# Eviction

In [11]:
# Classification search (classifier=True) for the eviction outcome.
model_eviction = run_model(train, target='eviction', classifier=True)

1431 features selected for eviction
Total number of numerical features: 135
Total number of categorical features: 469
Total number of binary features: 692
Total number of ordinal features: 135


In [12]:
# Report eviction scores on the test and leaderboard splits (holdout skipped).
score_model(
    model_eviction,
    target='eviction',
    test=test,
    leaderboard=leaderboard,
    holdout=None,
    classifier=True,
)

Test Brier: 0.0687
Test F1: 0.2857
Leaderboard Brier: 0.0792
Leaderboard F1: 0.1250


# Job Training

In [13]:
# Classification search (classifier=True) for the job training outcome.
model_job = run_model(train, target='jobTraining', classifier=True)

1431 features selected for jobTraining
Total number of numerical features: 135
Total number of categorical features: 469
Total number of binary features: 692
Total number of ordinal features: 135


In [14]:
# Report job training scores on the test and leaderboard splits (holdout skipped).
score_model(
    model_job,
    target='jobTraining',
    test=test,
    leaderboard=leaderboard,
    holdout=None,
    classifier=True,
)

Test Brier: 0.3185
Test F1: 0.2677
Leaderboard Brier: 0.3585
Leaderboard F1: 0.3015


# Layoff

In [15]:
# Classification search (classifier=True) for the layoff outcome.
model_layoff = run_model(train, target='layoff', classifier=True)

1431 features selected for layoff
Total number of numerical features: 135
Total number of categorical features: 469
Total number of binary features: 692
Total number of ordinal features: 135


In [16]:
# Hyperparameters selected by the search for the layoff model.
model_layoff.best_params_

OrderedDict([('regressor__max_iter', 10421)])

In [17]:
# Report layoff scores on the test and leaderboard splits (holdout skipped).
score_model(
    model_layoff,
    target='layoff',
    test=test,
    leaderboard=leaderboard,
    holdout=None,
    classifier=True,
)

Test Brier: 0.2669
Test F1: 0.3366
Leaderboard Brier: 0.3830
Leaderboard F1: 0.2397
