In [1]:
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer, make_column_transformer, TransformedTargetRegressor
from sklearn.feature_selection import VarianceThreshold
from sklearn.linear_model import LogisticRegression, Lasso
from sklearn.dummy import DummyClassifier
from sklearn.metrics import mean_squared_error, r2_score, brier_score_loss, f1_score
from sklearn.model_selection import cross_validate
from xgboost import XGBClassifier, XGBRegressor
from skopt import BayesSearchCV
import skopt.space as space
import skopt.plots as plots
import imblearn
from imblearn.pipeline import Pipeline as imbPipeline
from imblearn.over_sampling import SMOTE
from ff_custom_scripts import *

from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectFromModel

from sklearn.impute import KNNImputer

from sklearn.impute import SimpleImputer
from sklearn.metrics import f1_score
# import catboost as cb
from catboost import CatBoostClassifier, Pool, cv, CatBoostRegressor
# import gridsearchcv
from sklearn.model_selection import GridSearchCV


In [2]:
train, test,leaderboard,holdout = load_files(nanvalues='keep')

alldata = pd.concat([train, test]) # all data available for training in the FF Challenge

print(train.shape, test.shape,leaderboard.shape,holdout.shape)

(1172, 1437) (294, 1437) (530, 1437) (1111, 1437)


# Model

In [3]:
# def run_model(train, test, target='gpa', classifier=False):

#     catcols = train.select_dtypes(include=['category']).columns
#     train[catcols] = train[catcols].astype(str)
#     test[catcols] = test[catcols].astype(str)

#     X_train, y_train = prepare_data(train, target)
#     X_test, y_test = prepare_data(test, target)

#     print(X_train.shape, y_train.shape)

#     numerical_features = cols_per_type(X_train, 'Continuous')
#     print(f'N. of numerical features: {len(numerical_features)}')
#     ordinal_features = cols_per_type(X_train, 'Ordered Categorical')
#     binary_features = cols_per_type(X_train, 'Binary')
#     print(f'N. of binary features: {len(binary_features)}')
#     categorical_features = cols_per_type(X_train, 'Unordered Categorical')
#     print(f'N. of categorical features: {len(categorical_features)}')
#     print(f'N. of ordinal features: {len(ordinal_features)}')
    
#     iter = 20_000

#     if classifier:
#         score = 'neg_brier_score'
#         catboost = CatBoostClassifier(
#             eval_metric='Logloss', loss_function='Logloss', thread_count=-1, cat_features=categorical_features, verbose=0)
#     else:
        
#         catboost = CatBoostRegressor(thread_count=-1, cat_features=categorical_features, random_seed=123,
#                                      iterations=iter, early_stopping_rounds=iter*0.05,verbose=0)

#         score = 'neg_mean_squared_error'

#     param_grid = {
#         'learning_rate': [0.005, 0.01],
#         'depth': [4, 8],
#         'l2_leaf_reg': [2, 4],
#         'bagging_temperature': [0.5, 1],
#     }

#     model = BayesSearchCV(catboost, param_grid, cv=5, n_jobs=-1, scoring=score, verbose=1,
#                            n_iter=10).fit(X_train, y_train, eval_set=(X_test, y_test), use_best_model=True)
    
#     return model


# GPA

In [33]:
def run_model(train, test, target='gpa', classifier=False):

    catcols = train.select_dtypes(include=['category']).columns
    train[catcols] = train[catcols].astype(str)
    test[catcols] = test[catcols].astype(str)

    X_train, y_train = prepare_data(train, target)
    X_test, y_test = prepare_data(test, target)

    print(X_train.shape, y_train.shape)

    numerical_features = cols_per_type(X_train, 'Continuous')
    print(f'N. of numerical features: {len(numerical_features)}')
    ordinal_features = cols_per_type(X_train, 'Ordered Categorical')
    binary_features = cols_per_type(X_train, 'Binary')
    print(f'N. of binary features: {len(binary_features)}')
    categorical_features = cols_per_type(X_train, 'Unordered Categorical')
    print(f'N. of categorical features: {len(categorical_features)}')
    print(f'N. of ordinal features: {len(ordinal_features)}')
    
    iter = 20_000

    if classifier:
        score = 'neg_brier_score'
        catboost = CatBoostClassifier(
            eval_metric='Logloss', loss_function='Logloss', thread_count=-1, cat_features=categorical_features, verbose=0)
    else:
        
        catboost = CatBoostRegressor(thread_count=-1, cat_features=categorical_features, random_seed=123,
                                     iterations=iter, early_stopping_rounds=iter*0.1,verbose=10)

        score = 'neg_mean_squared_error'

    param_grid = {
        'learning_rate': [0.005, 0.02],
        'depth': [4, 8],
        'l2_leaf_reg': [2, 3],
        'bagging_temperature': [0.5, 1],
    }

    # model = BayesSearchCV(catboost, param_grid, cv=5, n_jobs=-1, scoring=score, verbose=20,
    #                        n_iter=10).fit(X_train, y_train, eval_set=(X_test, y_test), use_best_model=True, verbose=0)
    
    model = catboost.fit(X_train, y_train, eval_set=(X_test, y_test), use_best_model=True, verbose=0)

    return model


def score_model(model, data, target='gpa', classifier=False):
    X, y = prepare_data(data, target)
    if classifier:
        y_pred = model.predict_proba(X)
        # get 0 or 1 based on the highest probability
        y_pred = np.argmax(y_pred, axis=1)
        return brier_score_loss(y, y_pred)
        
    else:
        y_pred = model.predict(X)
        return mean_squared_error(y, y_pred)

In [5]:
gpa_model = run_model(train,test,target='gpa', classifier=False)

(929, 1431) (929,)
N. of numerical features: 135
N. of binary features: 692
N. of categorical features: 135
N. of ordinal features: 469


In [11]:
score_model(gpa_model, test, target='gpa', classifier=False)

0.405033814007716

In [6]:
score_model(gpa_model, leaderboard, target='gpa', classifier=False)

0.38116522430321753

# Material Hardship

In [12]:
model_materialHardship = run_model(train,test, target='materialHardship', classifier=False)

(1167, 1431) (1167,)
N. of numerical features: 135
N. of binary features: 692
N. of categorical features: 135
N. of ordinal features: 469


In [13]:
score_model(model_materialHardship, test, target='materialHardship', classifier=False)

0.01970861724690436

In [14]:
score_model(model_materialHardship, leaderboard, target='materialHardship', classifier=False)

0.024661091378832763

# Grit


In [15]:
model_grit = run_model(train,test, target='grit', classifier=False)

(1134, 1431) (1134,)
N. of numerical features: 135
N. of binary features: 692
N. of categorical features: 135
N. of ordinal features: 469


In [17]:
score_model(model_grit, test, target='grit', classifier=False)

0.21626631143339764

In [19]:
score_model(model_grit, leaderboard, target='grit', classifier=False)

0.22790289790229024

# Eviction

In [18]:
model_eviction = run_model(train,test,target='eviction', classifier=True)

(1167, 1431) (1167,)
N. of numerical features: 135
N. of binary features: 692
N. of categorical features: 135
N. of ordinal features: 469


In [34]:
score_model(model_eviction, test, target='eviction', classifier=True)

0.05136986301369863

In [35]:
score_model(model_eviction, leaderboard, target='eviction', classifier=True)

0.05660377358490566

# jobTraining

In [24]:
model_job = run_model(train,test,target='jobTraining', classifier=True)

(1169, 1431) (1169,)
N. of numerical features: 135
N. of binary features: 692
N. of categorical features: 135
N. of ordinal features: 469


In [36]:
score_model(model_job, test, target='jobTraining', classifier=True)

0.22602739726027396

In [37]:
score_model(model_job, leaderboard, target='jobTraining', classifier=True)

0.2849056603773585

# Layoff

In [38]:
model_layoff = run_model(train, test, target='layoff', classifier=True)

(1026, 1431) (1026,)
N. of numerical features: 135
N. of binary features: 692
N. of categorical features: 135
N. of ordinal features: 469


In [39]:
score_model(model_layoff, test, target='layoff', classifier=True)

0.199203187250996

In [40]:
score_model(model_layoff, leaderboard, target='layoff', classifier=True)

0.22452830188679246