In [1]:
import pandas as pd
import numpy as np

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline

from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score


from hyperopt import STATUS_OK, Trials, fmin, hp, tpe

from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import *

In [20]:
placement_data = pd.read_csv('Placement_Data_Full_Class.csv')
y = placement_data.status
y = y.replace({"Placed": 1, "Not Placed": 0})
x = placement_data.drop(['sl_no', 'status'],axis=1)

numeric_preprocessor = Pipeline(
    steps=[
        ("iterative_impute", IterativeImputer()),
        ("scaler", StandardScaler()),
    ]
)

categorical_preprocessor = Pipeline(
    steps=[
        ("ohe",OneHotEncoder(sparse_output=False)),
    ]
)

preprocessor = ColumnTransformer(
    [
        ("categorical", categorical_preprocessor, x.select_dtypes(include='object').columns),
        ("numerical", numeric_preprocessor, x.select_dtypes(exclude='object').columns),
    ]
)

rf_n_estimators = [int(x) for x in np.linspace(start= 5, stop=200, num = 10)]
rf_max_depth = [int(x) for x in np.linspace(10, 110, num = 11)] + [None]
xgb_learning_rate = [x for x in np.arange(0.01, 0.3, 0.1)]
pipelines = []
log_reg_l1_param_set = {'L1_Logistic_Regression__C': [0.1, 0.5, 1, 2, 3, 4, 5, 8, 10], 'L1_Logistic_Regression__solver': ['liblinear', 'saga']}
log_reg_l2_param_set = {'L2_Logistic_Regression__C': [0.1, 0.5, 1, 2, 3, 4, 5, 8, 10], 'L2_Logistic_Regression__solver': ['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag' ,'saga']}
dt_param_set = {'Decision_Tree__criterion': ['gini', 'entropy'], 'Decision_Tree__max_depth': [2, 3, 5, 10, 20], 'Decision_Tree__min_samples_leaf': [5, 10, 20, 50, 100]}
rf_param_set = {'Random_Forest__n_estimators': rf_n_estimators, 'Random_Forest__max_features': ['log2', 'sqrt'], 'Random_Forest__max_depth': rf_max_depth, 'Random_Forest__min_samples_split' : [2,5, 10], 'Random_Forest__min_samples_leaf' : [1, 2, 4], 'Random_Forest__bootstrap' : [True, False]}
xgb_param_set = {'XGBoost__max_depth': [int(x) for x in np.linspace(start= 3, stop=18, num = 1)], 'XGBoost__learning_rate': xgb_learning_rate, 'XGBoost__gamma': [int(x) for x in np.linspace(start= 1, stop=9, num = 1)], 'XGBoost__reg_alpha': [int(x) for x in np.linspace(start= 5, stop=200, num = 10)], 'XGBoost__reg_lambda': [0.1, 0.3, 0.5, 0.7, 0.9, 1], 'XGBoost__colsample_bytree': [0.5, 0.6, 0.7, 0.8, 0.9, 1], 'XGBoost__min_child_weight': [int(x) for x in np.linspace(start= 0, stop=10, num = 1)]}


pipelines.append((Pipeline([('preprocessor', preprocessor), ('L1_Logistic_Regression', LogisticRegression(penalty='l1', max_iter=10000))]), log_reg_l1_param_set))
pipelines.append((Pipeline([('preprocessor', preprocessor), ('L2_Logistic_Regression', LogisticRegression(penalty='l2', max_iter=500))]), log_reg_l2_param_set))
pipelines.append((Pipeline([('preprocessor', preprocessor), ('Decision_Tree', DecisionTreeClassifier())]), dt_param_set))
pipelines.append((Pipeline([('preprocessor', preprocessor), ('Random_Forest', RandomForestClassifier())]), rf_param_set))
pipelines.append((Pipeline([('preprocessor', preprocessor), ('XGBoost', XGBClassifier())]), xgb_param_set))

model_name = []
results = []
for pipe ,params in pipelines:
    loocv = LeaveOneOut()
    clf = RandomizedSearchCV(estimator=pipe, param_distributions=params, cv = loocv, error_score='raise', n_jobs=-1)
    nested_score = cross_val_score(clf, X=x, y=y, cv=5, scoring='precision', n_jobs=-1)
    model_name = list(pipe.named_steps.keys())[1]
    print("Precision scores for %s is %s" % (model_name, nested_score))
    print("Mean precision score for %s is %s" % (model_name, nested_score.mean()))

  y = y.replace({"Placed": 1, "Not Placed": 0})


Precision scores for L1_Logistic_Regression is [0.875      0.79411765 0.9        0.84375    0.96153846]
Mean precision score for L1_Logistic_Regression is 0.8748812217194569




Precision scores for L2_Logistic_Regression is [0.875      0.8        0.93333333 0.84375    0.96153846]
Mean precision score for L2_Logistic_Regression is 0.882724358974359
Precision scores for Decision_Tree is [0.96551724 0.82857143 1.         0.82758621 0.96296296]
Mean precision score for Decision_Tree is 0.9169275679620507
Precision scores for Random_Forest is [0.93548387 0.83333333 0.9375     0.87878788 0.9       ]
Mean precision score for Random_Forest is 0.8970210166177909
Precision scores for XGBoost is [0.69767442 0.69767442 0.69767442 0.6744186  0.90322581]
Mean precision score for XGBoost is 0.7341335333833459


In [21]:
log_reg_l1_param_set = {'C': hp.uniform('l1_C', 0.1, 10),
                        'solver': hp.choice('l1_solver', ['liblinear', 'saga'])}

log_reg_l2_param_set = {'C': hp.uniform('l2_C', 0.1, 10),
                        'solver': hp.choice('l2_solver', ['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag' ,'saga'])}

dt_param_set = {'criterion': hp.choice('dt_criterion', ['gini', 'entropy']),
                'max_depth': hp.quniform('dt_max_depth', 2, 20, 1),
                'min_samples_leaf': hp.quniform('dt_min_samples_leaf', 5, 100, 1)}

rf_param_set = {'n_estimators': hp.quniform("rf_n_estimators", 5, 200, 1), 
                'max_depth': hp.quniform("rf_max_depth", 3, 15, 1), 
                'max_features': hp.choice('rf_max_features', ['log2', 'sqrt']),
                'min_samples_split': hp.quniform("rf_min_samples_split", 2, 10, 1),
                'min_samples_leaf' : hp.quniform("rf_min_samples_leaf", 1, 10, 1),
                'bootstrap': hp.choice('rf_model_max_features', [True, False])}


xgb_param_set = {'max_depth': hp.quniform("xgb_max_depth", 3, 18, 1),
        'learning_rate': hp.uniform("xgb_learning_rate", 0.01, 0.3),
        'gamma': hp.uniform ('xgb_gamma', 1,9),
        'reg_alpha' : hp.quniform('xgb_reg_alpha', 40,180,1),
        'reg_lambda' : hp.uniform('xgb_reg_lambda', 0,1),
        'colsample_bytree' : hp.uniform('xgb_colsample_bytree', 0.5,1),
        'min_child_weight' : hp.quniform('xgb_min_child_weight', 0, 10, 1)}

placement_data = pd.read_csv('Placement_Data_Full_Class.csv')
y = placement_data.status
y = y.replace({"Placed": 1, "Not Placed": 0})
x = placement_data.drop(['sl_no', 'status'],axis=1)

numeric_preprocessor = Pipeline(
    steps=[
        ("iterative_impute", IterativeImputer()),
        ("scaler", StandardScaler()),
    ]
)

categorical_preprocessor = Pipeline(
    steps=[
        ("ohe",OneHotEncoder(sparse_output=False)),
    ]
)

preprocessor = ColumnTransformer(
    [
        ("categorical", categorical_preprocessor, x.select_dtypes(include='object').columns),
        ("numerical", numeric_preprocessor, x.select_dtypes(exclude='object').columns),
    ]
)


new_x= preprocessor.fit_transform(x)


X_train, X_test, y_train, y_test = train_test_split(new_x, y, train_size=0.8, random_state=23)



def xgb_objective(param_set):
    classifier =XGBClassifier(
        learning_rate = param_set['learning_rate'], max_depth = int(param_set['max_depth']), gamma = param_set['gamma'],
        reg_alpha = int(param_set['reg_alpha']),min_child_weight=int(param_set['min_child_weight']),
        colsample_bytree=int(param_set['colsample_bytree']), early_stopping_rounds=5)
    evaluation = [( X_train, y_train), ( X_test, y_test)]
    classifier.fit(X_train, y_train, eval_set=evaluation, verbose=False)
    

    pred = classifier.predict(X_test)
    prec = precision_score(y_test, pred)
    print (prec)
    return {'loss': -prec, 'status': STATUS_OK }

def lr_l1_objective(param_set):
    classifier = LogisticRegression(penalty='l1', C = param_set['C'], solver= param_set['solver'], max_iter=10000)
    classifier.fit(X_train, y_train)
    

    pred = classifier.predict(X_test)
    prec = precision_score(y_test, pred)
    print (prec)
    return {'loss': -prec, 'status': STATUS_OK }

def lr_l2_objective(param_set):
    classifier = LogisticRegression(penalty='l2', C = param_set['C'], solver= param_set['solver'], max_iter=500)
    classifier.fit(X_train, y_train)
    

    pred = classifier.predict(X_test)
    prec = precision_score(y_test, pred)
    print (prec)
    return {'loss': -prec, 'status': STATUS_OK }

def dt_objective(param_set):
    classifier = DecisionTreeClassifier(criterion= param_set['criterion'],max_depth = int(param_set['max_depth']), min_samples_leaf=int(param_set['min_samples_leaf']))
    classifier.fit(X_train, y_train)
    

    pred = classifier.predict(X_test)
    prec = precision_score(y_test, pred)
    print (prec)
    return {'loss': -prec, 'status': STATUS_OK }

def rf_objective(param_set):
    classifier = RandomForestClassifier(n_estimators = int(param_set['n_estimators']), max_depth = int(param_set['max_depth']), max_features = param_set['max_features'], min_samples_split = int(param_set['min_samples_split']), min_samples_leaf = int(param_set['min_samples_leaf']), bootstrap = bool(param_set['bootstrap']))
    classifier.fit(X_train, y_train)
    

    pred = classifier.predict(X_test)
    prec = precision_score(y_test, pred)
    print (prec)
    return {'loss': -prec, 'status': STATUS_OK }


models = [('XGBoost', xgb_objective, xgb_param_set), ('Logistic Regression L1', lr_l1_objective, log_reg_l1_param_set), ('Logistic Regression L2', lr_l2_objective, log_reg_l2_param_set), ('Decision Tree', dt_objective, dt_param_set), ('Random Forest', rf_objective, rf_param_set)]
best_hyperparams_per_model = []

for model in models:
    trials = Trials()
    print("The precision score for model " +  model[0] + " is: ", )
    best_hyperparams = fmin(fn = model[1],
                            space = model[2],
                            algo = tpe.suggest,
                            max_evals = 100,
                            trials = trials)
    best_hyperparams_per_model.append(best_hyperparams)

The precision score for model %s is:  XGBoost
0.7441860465116279                                     
0.7441860465116279                                     
0.7441860465116279                                                                
0.7441860465116279                                                                
0.7441860465116279                                                                
0.7441860465116279                                                                
0.7441860465116279                                                                
0.7441860465116279                                                                
0.7441860465116279                                                                
0.7441860465116279                                                                
0.7441860465116279                                                                
0.7441860465116279                                                                 
0.744186046

  y = y.replace({"Placed": 1, "Not Placed": 0})


0.7441860465116279                                                                 
0.7441860465116279                                                                 
0.7441860465116279                                                                 
0.7441860465116279                                                                 
0.7441860465116279                                                                 
0.7441860465116279                                                                 
0.7441860465116279                                                                 
0.7441860465116279                                                                 
0.7441860465116279                                                                 
0.7441860465116279                                                                 
0.7441860465116279                                                                 
0.7441860465116279                                                          