In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.decomposition import TruncatedSVD,PCA
from sklearn.model_selection import train_test_split 
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler,StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import RocCurveDisplay,roc_auc_score
from sklearn.model_selection import GridSearchCV

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.svm import SVC

from sklearn.feature_extraction.text import TfidfTransformer

from sklearn.feature_selection import chi2
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import VarianceThreshold

In [None]:
# Folder holding the coursework CSV files (a mounted Google Drive location).
path = "/content/drive/MyDrive/Materias/MLEA_M/Coursework/"

# Labelled training set and the unlabelled set to be scored.
df_train = pd.read_csv(f"{path}spam_dataset.csv")
df_test = pd.read_csv(f"{path}spam_dataset_test.csv")

In [None]:
# Feature matrix to score: every test column except the identifier.
x_test = df_test.drop(columns="email_id")

In [None]:
# Separate predictors from the target, discarding the identifier column.
df_train2 = df_train.drop('email_id', axis=1)
x_train = df_train2.drop('spam',axis=1)
# Keep the target one-dimensional (a Series). The previous single-column
# DataFrame `df_train2[['spam']]` makes scikit-learn emit the
# `y = column_or_1d(y, warn=True)` conversion warning on every fit.
y_train = df_train2['spam']

In [None]:
# All predictor names, i.e. every training column except the id and the label.
columns_all = df_train.drop(columns=['email_id', 'spam']).columns.tolist()
# Predictors whose name contains 'word_freq', and the remaining predictors.
columns_freq = [name for name in columns_all if 'word_freq' in name]
columns_no_freq = [name for name in columns_all if 'word_freq' not in name]

In [None]:
# Display the dimensions of the training target and of the test feature matrix.
(y_train.shape, x_test.shape)

((3220, 1), (1381, 57))

# Full Features


## LR

### MinMax

In [None]:
# Logistic regression on features rescaled to the range [-1, 1].
clf = Pipeline([
    ("preprocessor", MinMaxScaler(feature_range=(-1, 1))),
    ("clf", LogisticRegression(max_iter=2000)),
])

# Candidate regularisation strengths and penalties; solver and class weighting are fixed.
params = dict(
    clf__C=[80, 100, 200, 500, 800, 1000],
    clf__penalty=['l1', 'l2'],
    clf__class_weight=['balanced'],
    clf__solver=['saga'],
)

# 5-fold grid search scored on ROC-AUC and balanced accuracy, without a final refit.
GS = GridSearchCV(
    estimator=clf,
    param_grid=params,
    scoring=['roc_auc', 'balanced_accuracy'],
    cv=5,
    n_jobs=-1,
    refit=False,
)

In [None]:
# Run the search and collect the per-configuration cross-validation results.
GS.fit(x_train, y_train)
df_results = pd.DataFrame(data=GS.cv_results_)

# Ten configurations with the highest mean ROC-AUC.
(
    df_results
    .loc[:, ['params', 'mean_test_roc_auc', 'std_test_roc_auc',
             'mean_test_balanced_accuracy', 'std_test_balanced_accuracy']]
    .sort_values(by='mean_test_roc_auc', ascending=False)
    .iloc[:10]
)

Unnamed: 0,params,mean_test_roc_auc,std_test_roc_auc,mean_test_balanced_accuracy,std_test_balanced_accuracy
0,"{'clf__C': 80, 'clf__class_weight': 'balanced'...",0.969608,0.005245,0.923651,0.011733
2,"{'clf__C': 100, 'clf__class_weight': 'balanced...",0.969588,0.005231,0.923651,0.011733
4,"{'clf__C': 200, 'clf__class_weight': 'balanced...",0.969534,0.005225,0.923651,0.011733
6,"{'clf__C': 500, 'clf__class_weight': 'balanced...",0.96949,0.005207,0.923651,0.011733
7,"{'clf__C': 500, 'clf__class_weight': 'balanced...",0.969486,0.005294,0.923651,0.011733
9,"{'clf__C': 800, 'clf__class_weight': 'balanced...",0.969484,0.005267,0.923651,0.011733
11,"{'clf__C': 1000, 'clf__class_weight': 'balance...",0.969484,0.005248,0.923651,0.011733
10,"{'clf__C': 1000, 'clf__class_weight': 'balance...",0.969481,0.005203,0.923651,0.011733
8,"{'clf__C': 800, 'clf__class_weight': 'balanced...",0.969479,0.005206,0.923651,0.011733
5,"{'clf__C': 200, 'clf__class_weight': 'balanced...",0.969421,0.005359,0.922879,0.01306


In [None]:
# The search uses several scorers with refit=False, so GridSearchCV does not
# define `best_params_`; pick the configuration with the best mean ROC-AUC
# straight from cv_results_ instead.
best_params = GS.cv_results_['params'][np.nanargmax(GS.cv_results_['mean_test_roc_auc'])]
clf.set_params(**best_params)
clf.fit(x_train,y_train)

# Column 1 of predict_proba is the probability of the second class in clf.classes_.
model_results = pd.DataFrame({"email_id": df_test.email_id, "predicted" : clf.predict_proba(x_test)[:,1]})
model_results.to_csv("lr_scaler_full.csv", index=False)

### StandardScaler

In [None]:
# Logistic regression on standardised features.
clf = Pipeline([
    ("preprocessor", StandardScaler()),
    ("clf", LogisticRegression(max_iter=2000)),
])

# Candidate regularisation strengths and penalties; solver and class weighting are fixed.
params = dict(
    clf__C=[80, 100, 200],
    clf__penalty=['l1', 'l2'],
    clf__class_weight=['balanced'],
    clf__solver=['saga'],
)

# 5-fold grid search scored on ROC-AUC and balanced accuracy, without a final refit.
GS = GridSearchCV(
    estimator=clf,
    param_grid=params,
    scoring=['roc_auc', 'balanced_accuracy'],
    cv=5,
    n_jobs=-1,
    refit=False,
)

In [None]:
# Run the search and collect the per-configuration cross-validation results.
GS.fit(x_train, y_train)
df_results = pd.DataFrame(data=GS.cv_results_)

# Ten configurations with the highest mean ROC-AUC.
(
    df_results
    .loc[:, ['params', 'mean_test_roc_auc', 'std_test_roc_auc',
             'mean_test_balanced_accuracy', 'std_test_balanced_accuracy']]
    .sort_values(by='mean_test_roc_auc', ascending=False)
    .iloc[:10]
)

Unnamed: 0,params,mean_test_roc_auc,std_test_roc_auc,mean_test_balanced_accuracy,std_test_balanced_accuracy
0,"{'clf__C': 80, 'clf__class_weight': 'balanced'...",0.968908,0.005938,0.920154,0.01391
2,"{'clf__C': 100, 'clf__class_weight': 'balanced...",0.968906,0.005934,0.920154,0.01391
4,"{'clf__C': 200, 'clf__class_weight': 'balanced...",0.968906,0.005927,0.920154,0.01391
5,"{'clf__C': 200, 'clf__class_weight': 'balanced...",0.968896,0.00594,0.920154,0.01391
3,"{'clf__C': 100, 'clf__class_weight': 'balanced...",0.968891,0.005953,0.920154,0.01391
1,"{'clf__C': 80, 'clf__class_weight': 'balanced'...",0.968881,0.005966,0.920154,0.01391


In [None]:
# The search uses several scorers with refit=False, so GridSearchCV does not
# define `best_params_`; pick the configuration with the best mean ROC-AUC
# straight from cv_results_ instead.
best_params = GS.cv_results_['params'][np.nanargmax(GS.cv_results_['mean_test_roc_auc'])]
clf.set_params(**best_params)
clf.fit(x_train,y_train)

# Column 1 of predict_proba is the probability of the second class in clf.classes_.
model_results = pd.DataFrame({"email_id": df_test.email_id, "predicted" : clf.predict_proba(x_test)[:,1]})
# This pipeline uses StandardScaler; the old name "lr_minmax_full.csv" mislabelled it
# and was also reused by other cells, which overwrote this file.
model_results.to_csv("lr_standard_full.csv", index=False)

### TF-IDF transformation

In [None]:
# Rescale the non-'word_freq' columns to [-1, 1] and apply TF-IDF weighting to the
# 'word_freq' columns; columns in neither list are dropped by the transformer default.
ct = ColumnTransformer(
    transformers=[
        ("scaler", MinMaxScaler(feature_range=(-1, 1)), columns_no_freq),
        ('tfidf', TfidfTransformer(), columns_freq),
    ]
)

clf = Pipeline([
    ("pre", ct),
    ("clf", LogisticRegression(max_iter=2000)),
])

# Candidate regularisation strengths and penalties; solver and class weighting are fixed.
params = dict(
    clf__C=[80, 100, 200],
    clf__penalty=['l1', 'l2'],
    clf__class_weight=['balanced'],
    clf__solver=['saga'],
)

# 5-fold grid search scored on ROC-AUC and balanced accuracy, without a final refit.
GS = GridSearchCV(
    estimator=clf,
    param_grid=params,
    scoring=['roc_auc', 'balanced_accuracy'],
    cv=5,
    n_jobs=-1,
    refit=False,
)

In [None]:
# Run the search and collect the per-configuration cross-validation results.
GS.fit(x_train, y_train)
df_results = pd.DataFrame(data=GS.cv_results_)

# Ten configurations with the highest mean ROC-AUC.
(
    df_results
    .loc[:, ['params', 'mean_test_roc_auc', 'std_test_roc_auc',
             'mean_test_balanced_accuracy', 'std_test_balanced_accuracy']]
    .sort_values(by='mean_test_roc_auc', ascending=False)
    .iloc[:10]
)

Unnamed: 0,params,mean_test_roc_auc,std_test_roc_auc,mean_test_balanced_accuracy,std_test_balanced_accuracy
1,"{'clf__C': 80, 'clf__class_weight': 'balanced'...",0.973827,0.005233,0.924345,0.006737
3,"{'clf__C': 100, 'clf__class_weight': 'balanced...",0.973798,0.005221,0.924087,0.00698
5,"{'clf__C': 200, 'clf__class_weight': 'balanced...",0.973752,0.005198,0.924353,0.006407
0,"{'clf__C': 80, 'clf__class_weight': 'balanced'...",0.973597,0.005211,0.924087,0.006862
2,"{'clf__C': 100, 'clf__class_weight': 'balanced...",0.973557,0.005246,0.924478,0.006693
4,"{'clf__C': 200, 'clf__class_weight': 'balanced...",0.973504,0.00529,0.924478,0.006489


In [None]:
# The search uses several scorers with refit=False, so GridSearchCV does not
# define `best_params_`; pick the configuration with the best mean ROC-AUC
# straight from cv_results_ instead.
best_params = GS.cv_results_['params'][np.nanargmax(GS.cv_results_['mean_test_roc_auc'])]
clf.set_params(**best_params)
clf.fit(x_train,y_train)

# Column 1 of predict_proba is the probability of the second class in clf.classes_.
model_results = pd.DataFrame({"email_id": df_test.email_id, "predicted" : clf.predict_proba(x_test)[:,1]})
model_results.to_csv("lr_tfidf_minmax_full.csv", index=False)

Puede que no sea un modelo muy complejo

## SVM

### MinMax

In [None]:
# RBF support-vector classifier on features rescaled to the range [-1, 1].
clf = Pipeline([
    ("preprocessor", MinMaxScaler(feature_range=(-1, 1))),
    ("clf", SVC()),
])

# Candidate C values and both gamma heuristics; kernel and class weighting are fixed.
params = dict(
    clf__C=[5, 10, 20, 50, 80],
    clf__kernel=['rbf'],
    clf__class_weight=['balanced'],
    clf__gamma=['auto', 'scale'],
)

# 5-fold grid search scored on ROC-AUC and balanced accuracy, without a final refit.
GS = GridSearchCV(
    estimator=clf,
    param_grid=params,
    scoring=['roc_auc', 'balanced_accuracy'],
    cv=5,
    n_jobs=-1,
    refit=False,
)

In [None]:
# Run the search and collect the per-configuration cross-validation results.
GS.fit(x_train, y_train)
df_results = pd.DataFrame(data=GS.cv_results_)

# Ten configurations with the highest mean ROC-AUC.
(
    df_results
    .loc[:, ['params', 'mean_test_roc_auc', 'std_test_roc_auc',
             'mean_test_balanced_accuracy', 'std_test_balanced_accuracy']]
    .sort_values(by='mean_test_roc_auc', ascending=False)
    .iloc[:10]
)

Unnamed: 0,params,mean_test_roc_auc,std_test_roc_auc,mean_test_balanced_accuracy,std_test_balanced_accuracy
5,"{'clf__C': 20, 'clf__class_weight': 'balanced'...",0.973534,0.005832,0.922264,0.01062
3,"{'clf__C': 10, 'clf__class_weight': 'balanced'...",0.973349,0.005741,0.924989,0.008339
1,"{'clf__C': 5, 'clf__class_weight': 'balanced',...",0.972534,0.006003,0.925393,0.011385
7,"{'clf__C': 50, 'clf__class_weight': 'balanced'...",0.970884,0.006304,0.920312,0.007807
9,"{'clf__C': 80, 'clf__class_weight': 'balanced'...",0.968712,0.006986,0.918096,0.009086
8,"{'clf__C': 80, 'clf__class_weight': 'balanced'...",0.966887,0.01091,0.925147,0.01621
6,"{'clf__C': 50, 'clf__class_weight': 'balanced'...",0.965572,0.010647,0.924488,0.016266
4,"{'clf__C': 20, 'clf__class_weight': 'balanced'...",0.961464,0.012103,0.917715,0.015525
2,"{'clf__C': 10, 'clf__class_weight': 'balanced'...",0.956945,0.013737,0.906389,0.017502
0,"{'clf__C': 5, 'clf__class_weight': 'balanced',...",0.949152,0.015495,0.896908,0.016455


In [None]:
# The search uses several scorers with refit=False, so GridSearchCV does not
# define `best_params_`; pick the configuration with the best mean ROC-AUC
# straight from cv_results_ instead.
best_params = GS.cv_results_['params'][np.nanargmax(GS.cv_results_['mean_test_roc_auc'])]
# SVC exposes predict_proba only when constructed with probability=True, so enable
# it for the final fit (it was not needed during the search, which scores ROC-AUC
# without it).
clf.set_params(**best_params, clf__probability=True)
clf.fit(x_train,y_train)

# Column 1 of predict_proba is the probability of the second class in clf.classes_.
model_results = pd.DataFrame({"email_id": df_test.email_id, "predicted" : clf.predict_proba(x_test)[:,1]})
model_results.to_csv("svm_minmax_full.csv", index=False)

### StandardScaler

In [None]:
# RBF support-vector classifier on standardised features.
clf = Pipeline([
    ("preprocessor", StandardScaler()),
    ("clf", SVC()),
])

# Candidate C values and both gamma heuristics; kernel and class weighting are fixed.
params = dict(
    clf__C=[5, 10, 20, 50, 80],
    clf__kernel=['rbf'],
    clf__class_weight=['balanced'],
    clf__gamma=['auto', 'scale'],
)

# 5-fold grid search scored on ROC-AUC and balanced accuracy, without a final refit.
GS = GridSearchCV(
    estimator=clf,
    param_grid=params,
    scoring=['roc_auc', 'balanced_accuracy'],
    cv=5,
    n_jobs=-1,
    refit=False,
)

In [None]:
# Run the search and collect the per-configuration cross-validation results.
GS.fit(x_train, y_train)
df_results = pd.DataFrame(data=GS.cv_results_)

# Ten configurations with the highest mean ROC-AUC.
(
    df_results
    .loc[:, ['params', 'mean_test_roc_auc', 'std_test_roc_auc',
             'mean_test_balanced_accuracy', 'std_test_balanced_accuracy']]
    .sort_values(by='mean_test_roc_auc', ascending=False)
    .iloc[:10]
)

Unnamed: 0,params,mean_test_roc_auc,std_test_roc_auc,mean_test_balanced_accuracy,std_test_balanced_accuracy
2,"{'clf__C': 10, 'clf__class_weight': 'balanced'...",0.976856,0.006701,0.925913,0.014534
3,"{'clf__C': 10, 'clf__class_weight': 'balanced'...",0.976856,0.006701,0.925913,0.014534
0,"{'clf__C': 5, 'clf__class_weight': 'balanced',...",0.975974,0.007012,0.926441,0.013836
1,"{'clf__C': 5, 'clf__class_weight': 'balanced',...",0.975974,0.007012,0.926441,0.013836
4,"{'clf__C': 20, 'clf__class_weight': 'balanced'...",0.975671,0.005821,0.92435,0.011325
5,"{'clf__C': 20, 'clf__class_weight': 'balanced'...",0.975671,0.005821,0.92435,0.011325
6,"{'clf__C': 50, 'clf__class_weight': 'balanced'...",0.973678,0.005659,0.922932,0.011427
7,"{'clf__C': 50, 'clf__class_weight': 'balanced'...",0.973678,0.005659,0.922932,0.011427
8,"{'clf__C': 80, 'clf__class_weight': 'balanced'...",0.971586,0.006738,0.916575,0.013708
9,"{'clf__C': 80, 'clf__class_weight': 'balanced'...",0.971586,0.006738,0.916575,0.013708


In [None]:
# The search uses several scorers with refit=False, so GridSearchCV does not
# define `best_params_`; pick the configuration with the best mean ROC-AUC
# straight from cv_results_ instead.
best_params = GS.cv_results_['params'][np.nanargmax(GS.cv_results_['mean_test_roc_auc'])]
# SVC exposes predict_proba only when constructed with probability=True, so enable
# it for the final fit.
clf.set_params(**best_params, clf__probability=True)
clf.fit(x_train,y_train)

# Column 1 of predict_proba is the probability of the second class in clf.classes_.
model_results = pd.DataFrame({"email_id": df_test.email_id, "predicted" : clf.predict_proba(x_test)[:,1]})
# Previously written to "lr_minmax_full.csv", which overwrote the logistic-regression
# predictions; this is the SVM + StandardScaler model.
model_results.to_csv("svm_standard_full.csv", index=False)

### TF-IDF transformation

In [None]:
# Rescale the non-'word_freq' columns to [-1, 1] and apply TF-IDF weighting to the
# 'word_freq' columns; columns in neither list are dropped by the transformer default.
ct = ColumnTransformer(
    transformers=[
        ("scaler", MinMaxScaler(feature_range=(-1, 1)), columns_no_freq),
        ('tfidf', TfidfTransformer(), columns_freq),
    ]
)

clf = Pipeline([
    ("pre", ct),
    ("clf", SVC()),
])

# Candidate C values and both gamma heuristics; kernel and class weighting are fixed.
params = dict(
    clf__C=[5, 10, 20, 50, 80],
    clf__kernel=['rbf'],
    clf__class_weight=['balanced'],
    clf__gamma=['auto', 'scale'],
)

# 5-fold grid search scored on ROC-AUC and balanced accuracy, without a final refit.
GS = GridSearchCV(
    estimator=clf,
    param_grid=params,
    scoring=['roc_auc', 'balanced_accuracy'],
    cv=5,
    n_jobs=-1,
    refit=False,
)

In [None]:
# Run the search and collect the per-configuration cross-validation results.
GS.fit(x_train, y_train)
df_results = pd.DataFrame(data=GS.cv_results_)

# Ten configurations with the highest mean ROC-AUC.
(
    df_results
    .loc[:, ['params', 'mean_test_roc_auc', 'std_test_roc_auc',
             'mean_test_balanced_accuracy', 'std_test_balanced_accuracy']]
    .sort_values(by='mean_test_roc_auc', ascending=False)
    .iloc[:10]
)

Unnamed: 0,params,mean_test_roc_auc,std_test_roc_auc,mean_test_balanced_accuracy,std_test_balanced_accuracy
7,"{'clf__C': 50, 'clf__class_weight': 'balanced'...",0.97853,0.006845,0.934146,0.009237
5,"{'clf__C': 20, 'clf__class_weight': 'balanced'...",0.978344,0.006011,0.932729,0.007638
9,"{'clf__C': 80, 'clf__class_weight': 'balanced'...",0.977749,0.007363,0.935832,0.009059
3,"{'clf__C': 10, 'clf__class_weight': 'balanced'...",0.977109,0.005739,0.931027,0.006781
1,"{'clf__C': 5, 'clf__class_weight': 'balanced',...",0.975396,0.005776,0.925342,0.007583
8,"{'clf__C': 80, 'clf__class_weight': 'balanced'...",0.974038,0.00601,0.926005,0.009194
6,"{'clf__C': 50, 'clf__class_weight': 'balanced'...",0.973211,0.006346,0.924842,0.0087
4,"{'clf__C': 20, 'clf__class_weight': 'balanced'...",0.972288,0.006029,0.919933,0.008292
2,"{'clf__C': 10, 'clf__class_weight': 'balanced'...",0.971244,0.005698,0.920975,0.008894
0,"{'clf__C': 5, 'clf__class_weight': 'balanced',...",0.969084,0.006602,0.919943,0.011603


In [None]:
# The search uses several scorers with refit=False, so GridSearchCV does not
# define `best_params_`; pick the configuration with the best mean ROC-AUC
# straight from cv_results_ instead.
best_params = GS.cv_results_['params'][np.nanargmax(GS.cv_results_['mean_test_roc_auc'])]
# SVC exposes predict_proba only when constructed with probability=True, so enable
# it for the final fit.
clf.set_params(**best_params, clf__probability=True)
clf.fit(x_train,y_train)

# Column 1 of predict_proba is the probability of the second class in clf.classes_.
model_results = pd.DataFrame({"email_id": df_test.email_id, "predicted" : clf.predict_proba(x_test)[:,1]})
# Previously written to "lr_tfidf_minmax_full.csv", which overwrote the
# logistic-regression TF-IDF predictions; this is the SVM TF-IDF model.
model_results.to_csv("svm_tfidf_minmax_full.csv", index=False)

Puede que no sea un modelo muy complejo

## RF

### MinMax

In [None]:
# Random forest on features rescaled to the range [-1, 1].
# random_state is fixed so that the cross-validation ranking (and the final model
# fitted from this pipeline) is reproducible from one run to the next.
clf = Pipeline(
    steps=[("preprocessor", MinMaxScaler((-1,1))),("clf", RandomForestClassifier(random_state=42))]
)
# Candidate forest sizes and split thresholds; class weighting and max_features are fixed.
params={'clf__n_estimators':[100,150,200,250],
        'clf__class_weight':['balanced'],
        'clf__max_features': ['log2'],
        'clf__min_samples_split': [3,5,8,10,15]
        }
# 5-fold grid search scored on ROC-AUC and balanced accuracy, without a final refit.
GS = GridSearchCV(clf,param_grid = params, scoring = ['roc_auc','balanced_accuracy'], n_jobs=-1, cv=5, refit=False)

In [None]:
# Run the search and collect the per-configuration cross-validation results.
GS.fit(x_train, y_train)
df_results = pd.DataFrame(data=GS.cv_results_)

# Ten configurations with the highest mean ROC-AUC.
(
    df_results
    .loc[:, ['params', 'mean_test_roc_auc', 'std_test_roc_auc',
             'mean_test_balanced_accuracy', 'std_test_balanced_accuracy']]
    .sort_values(by='mean_test_roc_auc', ascending=False)
    .iloc[:10]
)

Unnamed: 0,params,mean_test_roc_auc,std_test_roc_auc,mean_test_balanced_accuracy,std_test_balanced_accuracy
10,"{'clf__class_weight': 'balanced', 'clf__max_fe...",0.986331,0.004236,0.943279,0.011349
1,"{'clf__class_weight': 'balanced', 'clf__max_fe...",0.986305,0.004791,0.946121,0.009484
6,"{'clf__class_weight': 'balanced', 'clf__max_fe...",0.986262,0.003775,0.942352,0.011828
4,"{'clf__class_weight': 'balanced', 'clf__max_fe...",0.986254,0.004603,0.943538,0.011909
7,"{'clf__class_weight': 'balanced', 'clf__max_fe...",0.986175,0.003948,0.946256,0.011629
3,"{'clf__class_weight': 'balanced', 'clf__max_fe...",0.986057,0.004062,0.945596,0.010326
5,"{'clf__class_weight': 'balanced', 'clf__max_fe...",0.985835,0.00447,0.942871,0.012687
0,"{'clf__class_weight': 'balanced', 'clf__max_fe...",0.985751,0.004596,0.943767,0.008895
11,"{'clf__class_weight': 'balanced', 'clf__max_fe...",0.985677,0.004192,0.944042,0.012373
2,"{'clf__class_weight': 'balanced', 'clf__max_fe...",0.985645,0.003817,0.943898,0.009589


In [None]:
# The search uses several scorers with refit=False, so GridSearchCV does not
# define `best_params_`; pick the configuration with the best mean ROC-AUC
# straight from cv_results_ instead.
best_params = GS.cv_results_['params'][np.nanargmax(GS.cv_results_['mean_test_roc_auc'])]
clf.set_params(**best_params)
clf.fit(x_train,y_train)

# Column 1 of predict_proba is the probability of the second class in clf.classes_.
model_results = pd.DataFrame({"email_id": df_test.email_id, "predicted" : clf.predict_proba(x_test)[:,1]})
# Export is disabled. The commented-out line used to name the SVM file, which it
# would have overwritten if re-enabled.
# model_results.to_csv("rf_minmax_full.csv", index=False)

### StandardScaler

In [None]:
# Random forest on standardised features.
# random_state is fixed so that the cross-validation ranking (and the final model
# fitted from this pipeline) is reproducible from one run to the next.
clf = Pipeline(
    steps=[("preprocessor", StandardScaler()),("clf", RandomForestClassifier(random_state=42))]
)
# Candidate forest sizes and split thresholds; class weighting and max_features are fixed.
params={'clf__n_estimators':[100,150,200,250],
        'clf__class_weight':['balanced'],
        'clf__max_features': ['log2'],
        'clf__min_samples_split': [3,5,8,10,15]
        }
# 5-fold grid search scored on ROC-AUC and balanced accuracy, without a final refit.
GS = GridSearchCV(clf,param_grid = params, scoring = ['roc_auc','balanced_accuracy'], n_jobs=-1, cv=5, refit=False)

In [None]:
# Run the search and collect the per-configuration cross-validation results.
GS.fit(x_train, y_train)
df_results = pd.DataFrame(data=GS.cv_results_)

# Ten configurations with the highest mean ROC-AUC.
(
    df_results
    .loc[:, ['params', 'mean_test_roc_auc', 'std_test_roc_auc',
             'mean_test_balanced_accuracy', 'std_test_balanced_accuracy']]
    .sort_values(by='mean_test_roc_auc', ascending=False)
    .iloc[:10]
)

Unnamed: 0,params,mean_test_roc_auc,std_test_roc_auc,mean_test_balanced_accuracy,std_test_balanced_accuracy
3,"{'clf__class_weight': 'balanced', 'clf__max_fe...",0.986455,0.003993,0.944425,0.011275
7,"{'clf__class_weight': 'balanced', 'clf__max_fe...",0.986267,0.004186,0.946645,0.011345
0,"{'clf__class_weight': 'balanced', 'clf__max_fe...",0.98614,0.004451,0.943901,0.009552
6,"{'clf__class_weight': 'balanced', 'clf__max_fe...",0.985964,0.004107,0.945606,0.01201
2,"{'clf__class_weight': 'balanced', 'clf__max_fe...",0.985882,0.003959,0.946111,0.009928
8,"{'clf__class_weight': 'balanced', 'clf__max_fe...",0.98584,0.003996,0.944835,0.012462
5,"{'clf__class_weight': 'balanced', 'clf__max_fe...",0.985808,0.004545,0.945602,0.010758
15,"{'clf__class_weight': 'balanced', 'clf__max_fe...",0.985729,0.003816,0.942239,0.012413
11,"{'clf__class_weight': 'balanced', 'clf__max_fe...",0.985658,0.004256,0.941715,0.013014
13,"{'clf__class_weight': 'balanced', 'clf__max_fe...",0.985513,0.004214,0.94042,0.011805


In [None]:
# The search uses several scorers with refit=False, so GridSearchCV does not
# define `best_params_`; pick the configuration with the best mean ROC-AUC
# straight from cv_results_ instead.
best_params = GS.cv_results_['params'][np.nanargmax(GS.cv_results_['mean_test_roc_auc'])]
clf.set_params(**best_params)
clf.fit(x_train,y_train)

# Column 1 of predict_proba is the probability of the second class in clf.classes_.
model_results = pd.DataFrame({"email_id": df_test.email_id, "predicted" : clf.predict_proba(x_test)[:,1]})
# Export is disabled. The commented-out line used to name a logistic-regression
# file, which it would have overwritten if re-enabled.
# model_results.to_csv("rf_standard_full.csv", index=False)

### TF-IDF transformation

In [None]:
# Rescale the non-'word_freq' columns to [-1, 1] and apply TF-IDF weighting to the
# 'word_freq' columns.
ct = ColumnTransformer([("scaler", MinMaxScaler((-1,1)), columns_no_freq),
                        ('tfidf',TfidfTransformer(),columns_freq)])

# random_state is fixed so that the cross-validation ranking (and the final model
# fitted from this pipeline) is reproducible from one run to the next.
clf = Pipeline(
    steps=[("pre", ct),("clf", RandomForestClassifier(random_state=42))]
)
# Candidate forest sizes and split thresholds; class weighting and max_features are fixed.
params={'clf__n_estimators':[100,150,200,250],
        'clf__class_weight':['balanced'],
        'clf__max_features': ['log2'],
        'clf__min_samples_split': [3,5,8,10,15]
        }
# 5-fold grid search scored on ROC-AUC and balanced accuracy, without a final refit.
GS = GridSearchCV(clf,param_grid = params, scoring = ['roc_auc','balanced_accuracy'], n_jobs=-1, cv=5, refit=False)

In [None]:
# Run the search and collect the per-configuration cross-validation results.
GS.fit(x_train, y_train)
df_results = pd.DataFrame(data=GS.cv_results_)

# Ten configurations with the highest mean ROC-AUC.
(
    df_results
    .loc[:, ['params', 'mean_test_roc_auc', 'std_test_roc_auc',
             'mean_test_balanced_accuracy', 'std_test_balanced_accuracy']]
    .sort_values(by='mean_test_roc_auc', ascending=False)
    .iloc[:10]
)

Unnamed: 0,params,mean_test_roc_auc,std_test_roc_auc,mean_test_balanced_accuracy,std_test_balanced_accuracy
7,"{'clf__class_weight': 'balanced', 'clf__max_fe...",0.987165,0.003803,0.945225,0.008403
11,"{'clf__class_weight': 'balanced', 'clf__max_fe...",0.987133,0.003761,0.943804,0.010657
3,"{'clf__class_weight': 'balanced', 'clf__max_fe...",0.987128,0.003801,0.944948,0.010144
9,"{'clf__class_weight': 'balanced', 'clf__max_fe...",0.986951,0.003509,0.94328,0.010029
1,"{'clf__class_weight': 'balanced', 'clf__max_fe...",0.986949,0.003718,0.942507,0.009469
2,"{'clf__class_weight': 'balanced', 'clf__max_fe...",0.986947,0.004194,0.943146,0.010555
6,"{'clf__class_weight': 'balanced', 'clf__max_fe...",0.98693,0.003835,0.946121,0.010069
5,"{'clf__class_weight': 'balanced', 'clf__max_fe...",0.986726,0.003759,0.943795,0.011309
13,"{'clf__class_weight': 'balanced', 'clf__max_fe...",0.986702,0.003877,0.94278,0.008739
15,"{'clf__class_weight': 'balanced', 'clf__max_fe...",0.986693,0.004128,0.940819,0.00969


In [None]:
# Apply a hand-picked random-forest configuration and train on the full training set.
clf.set_params(
    clf__class_weight='balanced',
    clf__max_features='log2',
    clf__min_samples_split=5,
    clf__n_estimators=250,
)
clf.fit(x_train, y_train)

# Column 1 of predict_proba is the probability of the second class in clf.classes_.
test_scores = clf.predict_proba(x_test)[:, 1]
model_results = pd.DataFrame({"email_id": df_test["email_id"], "predicted": test_scores})
# Export is currently disabled:
# model_results.to_csv("rf_tfidf_minmax_full.csv", index=False)

  self._final_estimator.fit(Xt, y, **fit_params_last_step)


## GBM

### MinMax

In [None]:
# Gradient boosting on features rescaled to the range [-1, 1].
# random_state is fixed so that the cross-validation ranking (and the final model
# fitted from this pipeline) is reproducible from one run to the next.
clf = Pipeline(
    steps=[("preprocessor", MinMaxScaler((-1,1))),("clf", GradientBoostingClassifier(random_state=42))]
)
# Candidate ensemble sizes, learning rates and split thresholds; max_features is fixed.
params={'clf__n_estimators':[80,100,150,200,250],
        'clf__learning_rate':[0.01,0.1],
        'clf__max_features': ['log2'],
        'clf__min_samples_split': [5,6,8,10]
        }
# 5-fold grid search scored on ROC-AUC and balanced accuracy, without a final refit.
GS = GridSearchCV(clf,param_grid = params, scoring = ['roc_auc','balanced_accuracy'], n_jobs=-1, cv=5, refit=False)

In [None]:
# Run the search and collect the per-configuration cross-validation results.
GS.fit(x_train, y_train)
df_results = pd.DataFrame(data=GS.cv_results_)

# Ten configurations with the highest mean ROC-AUC.
(
    df_results
    .loc[:, ['params', 'mean_test_roc_auc', 'std_test_roc_auc',
             'mean_test_balanced_accuracy', 'std_test_balanced_accuracy']]
    .sort_values(by='mean_test_roc_auc', ascending=False)
    .iloc[:10]
)

Unnamed: 0,params,mean_test_roc_auc,std_test_roc_auc,mean_test_balanced_accuracy,std_test_balanced_accuracy
39,"{'clf__learning_rate': 0.1, 'clf__max_features...",0.986925,0.002973,0.948486,0.011571
29,"{'clf__learning_rate': 0.1, 'clf__max_features...",0.986572,0.002673,0.948604,0.008239
34,"{'clf__learning_rate': 0.1, 'clf__max_features...",0.986522,0.002662,0.948088,0.009049
24,"{'clf__learning_rate': 0.1, 'clf__max_features...",0.986447,0.002988,0.946915,0.008618
28,"{'clf__learning_rate': 0.1, 'clf__max_features...",0.986076,0.003528,0.946775,0.010288
23,"{'clf__learning_rate': 0.1, 'clf__max_features...",0.985784,0.002704,0.947034,0.009857
22,"{'clf__learning_rate': 0.1, 'clf__max_features...",0.985741,0.003722,0.942866,0.011552
33,"{'clf__learning_rate': 0.1, 'clf__max_features...",0.985639,0.002838,0.946257,0.009722
32,"{'clf__learning_rate': 0.1, 'clf__max_features...",0.985381,0.003042,0.944163,0.012651
38,"{'clf__learning_rate': 0.1, 'clf__max_features...",0.985218,0.002711,0.947689,0.010801


In [None]:
# The search uses several scorers with refit=False, so GridSearchCV does not
# define `best_params_`; pick the configuration with the best mean ROC-AUC
# straight from cv_results_ instead.
best_params = GS.cv_results_['params'][np.nanargmax(GS.cv_results_['mean_test_roc_auc'])]
clf.set_params(**best_params)
clf.fit(x_train,y_train)

# Column 1 of predict_proba is the probability of the second class in clf.classes_.
model_results = pd.DataFrame({"email_id": df_test.email_id, "predicted" : clf.predict_proba(x_test)[:,1]})
# Export is disabled. The commented-out line used to name the SVM file, which it
# would have overwritten if re-enabled.
# model_results.to_csv("gbm_minmax_full.csv", index=False)

### StandardScaler

In [None]:
# Gradient boosting on standardised features.
# random_state is fixed so that the cross-validation ranking (and the final model
# fitted from this pipeline) is reproducible from one run to the next.
clf = Pipeline(
    steps=[("preprocessor", StandardScaler()),("clf", GradientBoostingClassifier(random_state=42))]
)
# Candidate ensemble sizes, learning rates and split thresholds; max_features is fixed.
params={'clf__n_estimators':[80,100,150,200,250],
        'clf__learning_rate':[0.01,0.1],
        'clf__max_features': ['log2'],
        'clf__min_samples_split': [5,6,8,10]
        }
# 5-fold grid search scored on ROC-AUC and balanced accuracy, without a final refit.
GS = GridSearchCV(clf,param_grid = params, scoring = ['roc_auc','balanced_accuracy'], n_jobs=-1, cv=5, refit=False)

In [None]:
# Run the search and collect the per-configuration cross-validation results.
GS.fit(x_train, y_train)
df_results = pd.DataFrame(data=GS.cv_results_)

# Ten configurations with the highest mean ROC-AUC.
(
    df_results
    .loc[:, ['params', 'mean_test_roc_auc', 'std_test_roc_auc',
             'mean_test_balanced_accuracy', 'std_test_balanced_accuracy']]
    .sort_values(by='mean_test_roc_auc', ascending=False)
    .iloc[:10]
)

Unnamed: 0,params,mean_test_roc_auc,std_test_roc_auc,mean_test_balanced_accuracy,std_test_balanced_accuracy
39,"{'clf__learning_rate': 0.1, 'clf__max_features...",0.986781,0.003096,0.946916,0.009453
34,"{'clf__learning_rate': 0.1, 'clf__max_features...",0.986517,0.002448,0.947049,0.005705
29,"{'clf__learning_rate': 0.1, 'clf__max_features...",0.986516,0.002347,0.950032,0.00758
28,"{'clf__learning_rate': 0.1, 'clf__max_features...",0.986184,0.003093,0.945335,0.008678
33,"{'clf__learning_rate': 0.1, 'clf__max_features...",0.986123,0.003338,0.944584,0.009393
22,"{'clf__learning_rate': 0.1, 'clf__max_features...",0.985843,0.00406,0.943381,0.007812
24,"{'clf__learning_rate': 0.1, 'clf__max_features...",0.985814,0.003026,0.946268,0.010356
32,"{'clf__learning_rate': 0.1, 'clf__max_features...",0.985649,0.003428,0.943266,0.011465
37,"{'clf__learning_rate': 0.1, 'clf__max_features...",0.985473,0.003278,0.946898,0.012438
27,"{'clf__learning_rate': 0.1, 'clf__max_features...",0.985394,0.003175,0.94274,0.010608


In [None]:
# Apply a hand-picked gradient-boosting configuration and train on the full training set.
clf.set_params(
    clf__learning_rate=0.1,
    clf__max_features='log2',
    clf__min_samples_split=10,
    clf__n_estimators=250,
)
clf.fit(x_train, y_train)

# Column 1 of predict_proba is the probability of the second class in clf.classes_.
test_scores = clf.predict_proba(x_test)[:, 1]
model_results = pd.DataFrame({"email_id": df_test["email_id"], "predicted": test_scores})
model_results.to_csv("gbm_standard_full.csv", index=False)

  y = column_or_1d(y, warn=True)


### TF-IDF transformation

In [None]:
# Rescale the non-'word_freq' columns to [-1, 1] and apply TF-IDF weighting to the
# 'word_freq' columns.
ct = ColumnTransformer([("scaler", MinMaxScaler((-1,1)), columns_no_freq),
                        ('tfidf',TfidfTransformer(),columns_freq)])

# random_state is fixed so that the cross-validation ranking (and the final model
# fitted from this pipeline) is reproducible from one run to the next.
clf = Pipeline(
    steps=[("pre", ct),("clf", GradientBoostingClassifier(random_state=42))]
)
# Candidate ensemble sizes, learning rates and split thresholds; max_features is fixed.
params={'clf__n_estimators':[80,100,150,200,250],
        'clf__learning_rate':[0.01,0.1],
        'clf__max_features': ['log2'],
        'clf__min_samples_split': [5,6,8,10]
        }
# 5-fold grid search scored on ROC-AUC and balanced accuracy, without a final refit.
GS = GridSearchCV(clf,param_grid = params, scoring = ['roc_auc','balanced_accuracy'], n_jobs=-1, cv=5, refit=False)

In [None]:
# Run the search and collect the per-configuration cross-validation results.
GS.fit(x_train, y_train)
df_results = pd.DataFrame(data=GS.cv_results_)

# Ten configurations with the highest mean ROC-AUC.
(
    df_results
    .loc[:, ['params', 'mean_test_roc_auc', 'std_test_roc_auc',
             'mean_test_balanced_accuracy', 'std_test_balanced_accuracy']]
    .sort_values(by='mean_test_roc_auc', ascending=False)
    .iloc[:10]
)

Unnamed: 0,params,mean_test_roc_auc,std_test_roc_auc,mean_test_balanced_accuracy,std_test_balanced_accuracy
39,"{'clf__learning_rate': 0.1, 'clf__max_features...",0.985777,0.003767,0.946796,0.014077
29,"{'clf__learning_rate': 0.1, 'clf__max_features...",0.985577,0.00362,0.94523,0.011309
33,"{'clf__learning_rate': 0.1, 'clf__max_features...",0.985328,0.003196,0.943923,0.012063
24,"{'clf__learning_rate': 0.1, 'clf__max_features...",0.985257,0.003212,0.946265,0.009709
38,"{'clf__learning_rate': 0.1, 'clf__max_features...",0.985147,0.002447,0.944312,0.010744
34,"{'clf__learning_rate': 0.1, 'clf__max_features...",0.985133,0.002798,0.946803,0.01011
37,"{'clf__learning_rate': 0.1, 'clf__max_features...",0.985106,0.003635,0.944954,0.015104
28,"{'clf__learning_rate': 0.1, 'clf__max_features...",0.985018,0.002556,0.945356,0.011605
23,"{'clf__learning_rate': 0.1, 'clf__max_features...",0.984734,0.002703,0.941073,0.013995
32,"{'clf__learning_rate': 0.1, 'clf__max_features...",0.984627,0.003363,0.940537,0.01272


In [None]:
# The previous parameters were copied from the random-forest cell and included
# `clf__class_weight`, which GradientBoostingClassifier does not accept (set_params
# rejects unknown names). Use the top-ranked row of this section's grid instead:
# index 39 of the 2 x 1 x 4 x 5 grid, i.e. the last value of every parameter.
clf.set_params(**{'clf__learning_rate': 0.1, 'clf__max_features': 'log2', 'clf__min_samples_split': 10, 'clf__n_estimators': 250})
clf.fit(x_train,y_train)

# Column 1 of predict_proba is the probability of the second class in clf.classes_.
model_results = pd.DataFrame({"email_id": df_test.email_id, "predicted" : clf.predict_proba(x_test)[:,1]})
# Export is disabled. The commented-out line used to name the random-forest file.
# model_results.to_csv("gbm_tfidf_minmax_full.csv", index=False)

  self._final_estimator.fit(Xt, y, **fit_params_last_step)


# CHI

## LR

### MinMax

In [None]:
# Chi-squared feature selection on the raw features, then [-1, 1] rescaling and
# logistic regression.
clf = Pipeline([
    ('selector', SelectKBest(score_func=chi2, k=40)),
    ("preprocessor", MinMaxScaler(feature_range=(-1, 1))),
    ("clf", LogisticRegression(max_iter=2000)),
])

# Candidate numbers of kept features, regularisation strengths and penalties.
params = dict(
    selector__k=[35, 40, 45, 50],
    clf__C=[80, 100, 200, 500, 800, 1000],
    clf__penalty=['l1', 'l2'],
    clf__class_weight=['balanced'],
    clf__solver=['saga'],
)

# 5-fold grid search scored on ROC-AUC and balanced accuracy, without a final refit.
GS = GridSearchCV(
    estimator=clf,
    param_grid=params,
    scoring=['roc_auc', 'balanced_accuracy'],
    cv=5,
    n_jobs=-1,
    refit=False,
)

In [None]:
# Run the search and collect the per-configuration cross-validation results.
GS.fit(x_train, y_train)
df_results = pd.DataFrame(data=GS.cv_results_)

# Ten configurations with the highest mean ROC-AUC.
(
    df_results
    .loc[:, ['params', 'mean_test_roc_auc', 'std_test_roc_auc',
             'mean_test_balanced_accuracy', 'std_test_balanced_accuracy']]
    .sort_values(by='mean_test_roc_auc', ascending=False)
    .iloc[:10]
)

KeyboardInterrupt: ignored

In [None]:
# The search uses several scorers with refit=False, so GridSearchCV does not
# define `best_params_`; pick the configuration with the best mean ROC-AUC
# straight from cv_results_ instead.
best_params = GS.cv_results_['params'][np.nanargmax(GS.cv_results_['mean_test_roc_auc'])]
clf.set_params(**best_params)
clf.fit(x_train,y_train)

# Column 1 of predict_proba is the probability of the second class in clf.classes_.
model_results = pd.DataFrame({"email_id": df_test.email_id, "predicted" : clf.predict_proba(x_test)[:,1]})
# Previously written to "lr_scaler_full.csv", which overwrote the full-feature
# logistic-regression predictions; this model uses chi-squared feature selection.
model_results.to_csv("lr_minmax_chi.csv", index=False)

### StandardScaler

In [None]:
# Chi-squared feature selection on the raw features, then standardisation and
# logistic regression.
clf = Pipeline(
    steps=[('selector', SelectKBest(chi2, k=40)),("preprocessor", StandardScaler()), ("clf", LogisticRegression(max_iter=2000))]
)
# `clf__C` was missing from this grid, so only the estimator's default C was ever
# evaluated, unlike every other logistic-regression search in this notebook. The
# values mirror the full-feature StandardScaler grid.
params={'selector__k':[35,40,45,50],
        'clf__C':[80,100,200],
        'clf__penalty':['l1', 'l2'],
        'clf__class_weight':['balanced'],
        'clf__solver': ['saga'],
        }
# 5-fold grid search scored on ROC-AUC and balanced accuracy, without a final refit.
GS = GridSearchCV(clf,param_grid = params, scoring = ['roc_auc','balanced_accuracy'], n_jobs=-1, cv=5, refit=False)

In [None]:
# Run the search and collect the per-configuration cross-validation results.
GS.fit(x_train, y_train)
df_results = pd.DataFrame(data=GS.cv_results_)

# Ten configurations with the highest mean ROC-AUC.
(
    df_results
    .loc[:, ['params', 'mean_test_roc_auc', 'std_test_roc_auc',
             'mean_test_balanced_accuracy', 'std_test_balanced_accuracy']]
    .sort_values(by='mean_test_roc_auc', ascending=False)
    .iloc[:10]
)

Unnamed: 0,params,mean_test_roc_auc,std_test_roc_auc,mean_test_balanced_accuracy,std_test_balanced_accuracy
0,"{'clf__C': 80, 'clf__class_weight': 'balanced'...",0.968908,0.005938,0.920154,0.01391
2,"{'clf__C': 100, 'clf__class_weight': 'balanced...",0.968906,0.005934,0.920154,0.01391
4,"{'clf__C': 200, 'clf__class_weight': 'balanced...",0.968906,0.005927,0.920154,0.01391
5,"{'clf__C': 200, 'clf__class_weight': 'balanced...",0.968896,0.00594,0.920154,0.01391
3,"{'clf__C': 100, 'clf__class_weight': 'balanced...",0.968891,0.005953,0.920154,0.01391
1,"{'clf__C': 80, 'clf__class_weight': 'balanced'...",0.968881,0.005966,0.920154,0.01391


In [None]:
# The search uses several scorers with refit=False, so GridSearchCV does not
# define `best_params_`; pick the configuration with the best mean ROC-AUC
# straight from cv_results_ instead.
best_params = GS.cv_results_['params'][np.nanargmax(GS.cv_results_['mean_test_roc_auc'])]
clf.set_params(**best_params)
clf.fit(x_train,y_train)

# Column 1 of predict_proba is the probability of the second class in clf.classes_.
model_results = pd.DataFrame({"email_id": df_test.email_id, "predicted" : clf.predict_proba(x_test)[:,1]})
# Previously written to "lr_minmax_full.csv", a name shared with other cells; this
# is the chi-squared-selected, StandardScaler logistic regression.
model_results.to_csv("lr_standard_chi.csv", index=False)

## SVM

### MinMax

In [None]:
# Chi-squared feature selection on the raw features, then [-1, 1] rescaling and an
# RBF support-vector classifier.
clf = Pipeline([
    ('selector', SelectKBest(score_func=chi2, k=40)),
    ("preprocessor", MinMaxScaler(feature_range=(-1, 1))),
    ("clf", SVC()),
])

# Candidate numbers of kept features, C values and gamma heuristics.
params = dict(
    selector__k=[35, 40, 45, 50],
    clf__C=[5, 10, 20, 50, 80],
    clf__kernel=['rbf'],
    clf__class_weight=['balanced'],
    clf__gamma=['auto', 'scale'],
)

# 5-fold grid search scored on ROC-AUC and balanced accuracy, without a final refit.
GS = GridSearchCV(
    estimator=clf,
    param_grid=params,
    scoring=['roc_auc', 'balanced_accuracy'],
    cv=5,
    n_jobs=-1,
    refit=False,
)

In [None]:
# Run the search and collect the per-configuration cross-validation results.
GS.fit(x_train, y_train)
df_results = pd.DataFrame(data=GS.cv_results_)

# Ten configurations with the highest mean ROC-AUC.
(
    df_results
    .loc[:, ['params', 'mean_test_roc_auc', 'std_test_roc_auc',
             'mean_test_balanced_accuracy', 'std_test_balanced_accuracy']]
    .sort_values(by='mean_test_roc_auc', ascending=False)
    .iloc[:10]
)

Unnamed: 0,params,mean_test_roc_auc,std_test_roc_auc,mean_test_balanced_accuracy,std_test_balanced_accuracy
21,"{'clf__C': 20, 'clf__class_weight': 'balanced'...",0.97377,0.004577,0.926039,0.007427
13,"{'clf__C': 10, 'clf__class_weight': 'balanced'...",0.973615,0.004893,0.924992,0.006903
23,"{'clf__C': 20, 'clf__class_weight': 'balanced'...",0.973463,0.004664,0.923437,0.002812
22,"{'clf__C': 20, 'clf__class_weight': 'balanced'...",0.973108,0.00367,0.919255,0.005203
14,"{'clf__C': 10, 'clf__class_weight': 'balanced'...",0.972869,0.003792,0.92331,0.008814
15,"{'clf__C': 10, 'clf__class_weight': 'balanced'...",0.972593,0.00515,0.924745,0.006524
5,"{'clf__C': 5, 'clf__class_weight': 'balanced',...",0.972036,0.005258,0.926941,0.009883
6,"{'clf__C': 5, 'clf__class_weight': 'balanced',...",0.971669,0.004445,0.923187,0.008921
7,"{'clf__C': 5, 'clf__class_weight': 'balanced',...",0.971583,0.005906,0.923835,0.008347
12,"{'clf__C': 10, 'clf__class_weight': 'balanced'...",0.971159,0.004557,0.923455,0.009344


In [None]:
# The search uses several scorers with refit=False, so GridSearchCV does not
# define `best_params_`; pick the configuration with the best mean ROC-AUC
# straight from cv_results_ instead.
best_params = GS.cv_results_['params'][np.nanargmax(GS.cv_results_['mean_test_roc_auc'])]
# SVC exposes predict_proba only when constructed with probability=True, so enable
# it for the final fit.
clf.set_params(**best_params, clf__probability=True)
clf.fit(x_train,y_train)

# Column 1 of predict_proba is the probability of the second class in clf.classes_.
model_results = pd.DataFrame({"email_id": df_test.email_id, "predicted" : clf.predict_proba(x_test)[:,1]})
# Previously written to "svm_minmax_full.csv", which overwrote the full-feature SVM
# predictions; this model uses chi-squared feature selection.
model_results.to_csv("svm_minmax_chi.csv", index=False)

### StandardScaler

In [None]:
# Chi-squared feature selection on the raw features, then standardisation and an
# RBF support-vector classifier.
clf = Pipeline([
    ('selector', SelectKBest(score_func=chi2, k=40)),
    ("preprocessor", StandardScaler()),
    ("clf", SVC()),
])

# Candidate numbers of kept features, C values and gamma heuristics.
params = dict(
    selector__k=[35, 40, 45, 50],
    clf__C=[5, 10, 20, 50, 80],
    clf__kernel=['rbf'],
    clf__class_weight=['balanced'],
    clf__gamma=['auto', 'scale'],
)

# 5-fold grid search scored on ROC-AUC and balanced accuracy, without a final refit.
GS = GridSearchCV(
    estimator=clf,
    param_grid=params,
    scoring=['roc_auc', 'balanced_accuracy'],
    cv=5,
    n_jobs=-1,
    refit=False,
)

In [None]:
# Run the search and collect the per-configuration cross-validation results.
GS.fit(x_train, y_train)
df_results = pd.DataFrame(data=GS.cv_results_)

# Ten configurations with the highest mean ROC-AUC.
(
    df_results
    .loc[:, ['params', 'mean_test_roc_auc', 'std_test_roc_auc',
             'mean_test_balanced_accuracy', 'std_test_balanced_accuracy']]
    .sort_values(by='mean_test_roc_auc', ascending=False)
    .iloc[:10]
)

Unnamed: 0,params,mean_test_roc_auc,std_test_roc_auc,mean_test_balanced_accuracy,std_test_balanced_accuracy
10,"{'clf__C': 10, 'clf__class_weight': 'balanced'...",0.97561,0.004338,0.924868,0.010002
14,"{'clf__C': 10, 'clf__class_weight': 'balanced'...",0.97561,0.004338,0.924868,0.010002
9,"{'clf__C': 10, 'clf__class_weight': 'balanced'...",0.975564,0.005841,0.925523,0.012239
13,"{'clf__C': 10, 'clf__class_weight': 'balanced'...",0.975564,0.005841,0.925523,0.012239
11,"{'clf__C': 10, 'clf__class_weight': 'balanced'...",0.975308,0.00477,0.924738,0.010644
15,"{'clf__C': 10, 'clf__class_weight': 'balanced'...",0.975308,0.00477,0.924738,0.010644
19,"{'clf__C': 20, 'clf__class_weight': 'balanced'...",0.974872,0.004271,0.923307,0.00703
23,"{'clf__C': 20, 'clf__class_weight': 'balanced'...",0.974872,0.004271,0.923307,0.00703
18,"{'clf__C': 20, 'clf__class_weight': 'balanced'...",0.974648,0.003635,0.924601,0.006438
22,"{'clf__C': 20, 'clf__class_weight': 'balanced'...",0.974648,0.003635,0.924601,0.006438


In [None]:
# The search uses several scorers with refit=False, so GridSearchCV does not
# define `best_params_`; pick the configuration with the best mean ROC-AUC
# straight from cv_results_ instead.
best_params = GS.cv_results_['params'][np.nanargmax(GS.cv_results_['mean_test_roc_auc'])]
# SVC exposes predict_proba only when constructed with probability=True, so enable
# it for the final fit.
clf.set_params(**best_params, clf__probability=True)
clf.fit(x_train,y_train)

# Column 1 of predict_proba is the probability of the second class in clf.classes_.
model_results = pd.DataFrame({"email_id": df_test.email_id, "predicted" : clf.predict_proba(x_test)[:,1]})
# Previously written to "lr_minmax_full.csv", which overwrote logistic-regression
# predictions; this is the chi-squared-selected, StandardScaler SVM.
model_results.to_csv("svm_standard_chi.csv", index=False)

## RF

### MinMax

In [None]:
# Chi-squared feature selection on the raw features, then [-1, 1] rescaling and a
# random forest.
# random_state is fixed so that the cross-validation ranking (and the final model
# fitted from this pipeline) is reproducible from one run to the next.
clf = Pipeline(
    steps=[('selector', SelectKBest(chi2, k=40)), ("preprocessor", MinMaxScaler((-1,1))),("clf", RandomForestClassifier(random_state=42))]
)
# Candidate numbers of kept features, forest sizes and split thresholds.
params={'selector__k':[35,40,45,50],
        'clf__n_estimators':[100,150,200,250],
        'clf__class_weight':['balanced'],
        'clf__max_features': ['log2'],
        'clf__min_samples_split': [3,5,8,10,15]
        }
# 5-fold grid search scored on ROC-AUC and balanced accuracy, without a final refit.
GS = GridSearchCV(clf,param_grid = params, scoring = ['roc_auc','balanced_accuracy'], n_jobs=-1, cv=5, refit=False)

In [None]:
# Fit every candidate of the search set up in the previous cell, then
# tabulate the cross-validation results.
GS.fit(x_train, y_train)
df_results = pd.DataFrame(GS.cv_results_)

# Show the ten best candidates ordered by mean ROC AUC (descending).
(
    df_results[[
        'params',
        'mean_test_roc_auc',
        'std_test_roc_auc',
        'mean_test_balanced_accuracy',
        'std_test_balanced_accuracy',
    ]]
    .sort_values(by='mean_test_roc_auc', ascending=False)
    .head(10)
)

Unnamed: 0,params,mean_test_roc_auc,std_test_roc_auc,mean_test_balanced_accuracy,std_test_balanced_accuracy
11,"{'clf__class_weight': 'balanced', 'clf__max_fe...",0.985954,0.004422,0.94469,0.008953
31,"{'clf__class_weight': 'balanced', 'clf__max_fe...",0.985845,0.004417,0.942764,0.010062
23,"{'clf__class_weight': 'balanced', 'clf__max_fe...",0.985789,0.004224,0.943926,0.011298
27,"{'clf__class_weight': 'balanced', 'clf__max_fe...",0.985784,0.004816,0.941982,0.010462
7,"{'clf__class_weight': 'balanced', 'clf__max_fe...",0.985737,0.004745,0.944576,0.00856
10,"{'clf__class_weight': 'balanced', 'clf__max_fe...",0.985701,0.004173,0.944959,0.010628
26,"{'clf__class_weight': 'balanced', 'clf__max_fe...",0.98569,0.004203,0.941724,0.009508
63,"{'clf__class_weight': 'balanced', 'clf__max_fe...",0.985658,0.004457,0.940055,0.014246
47,"{'clf__class_weight': 'balanced', 'clf__max_fe...",0.985646,0.004466,0.942115,0.011425
43,"{'clf__class_weight': 'balanced', 'clf__max_fe...",0.985469,0.004546,0.941725,0.012062


In [None]:
# With refit=False and multi-metric scoring GridSearchCV does not expose
# `best_params_`, so select the candidate ranked first on ROC AUC by hand.
best_idx = int(np.argmin(GS.cv_results_['rank_test_roc_auc']))
clf.set_params(**GS.cv_results_['params'][best_idx])
# Flatten the single-column target DataFrame to the 1-D shape estimators expect.
clf.fit(x_train, y_train.values.ravel())

# Probability of the second class (presumably spam=1 — confirm) per test email.
model_results = pd.DataFrame({"email_id": df_test.email_id, "predicted": clf.predict_proba(x_test)[:, 1]})
# Export left disabled; file name corrected from the copy-pasted "svm_minmax_full.csv".
# model_results.to_csv("rf_minmax_full.csv", index=False)

### StandardScaler

In [None]:
# Random forest on chi2-selected features, standardized after selection.
# chi2 needs non-negative input, so the selector must stay ahead of the scaler.
clf = Pipeline(
    steps=[
        ('selector', SelectKBest(chi2, k=40)),
        ("preprocessor", StandardScaler()),
        # Fixed seed (same value the file uses for TruncatedSVD) so the
        # search is repeatable across runs.
        ("clf", RandomForestClassifier(random_state=42)),
    ]
)
# Grid over the number of selected features and the forest size / split threshold.
params={'selector__k':[35,40,45,50],
        'clf__n_estimators':[100,150,200,250],
        'clf__class_weight':['balanced'],
        'clf__max_features': ['log2'],
        'clf__min_samples_split': [3,5,8,10,15]
        }
# refit=False: only cv_results_ is produced; no best_estimator_/best_params_.
GS = GridSearchCV(clf,param_grid = params, scoring = ['roc_auc','balanced_accuracy'], n_jobs=-1, cv=5, refit=False)

In [None]:
# Execute the grid search from the previous cell and keep its CV results
# as a DataFrame for inspection.
GS.fit(x_train, y_train)
df_results = pd.DataFrame(GS.cv_results_)

# Leading ten candidates by mean ROC AUC, with the spread of both metrics.
(
    df_results[[
        'params',
        'mean_test_roc_auc',
        'std_test_roc_auc',
        'mean_test_balanced_accuracy',
        'std_test_balanced_accuracy',
    ]]
    .sort_values(by='mean_test_roc_auc', ascending=False)
    .head(10)
)

Unnamed: 0,params,mean_test_roc_auc,std_test_roc_auc,mean_test_balanced_accuracy,std_test_balanced_accuracy
3,"{'clf__class_weight': 'balanced', 'clf__max_fe...",0.986067,0.00494,0.943538,0.010816
27,"{'clf__class_weight': 'balanced', 'clf__max_fe...",0.985813,0.00449,0.946406,0.012692
15,"{'clf__class_weight': 'balanced', 'clf__max_fe...",0.985812,0.004313,0.942747,0.007597
14,"{'clf__class_weight': 'balanced', 'clf__max_fe...",0.985782,0.004394,0.945217,0.010038
2,"{'clf__class_weight': 'balanced', 'clf__max_fe...",0.985772,0.004751,0.946255,0.009378
7,"{'clf__class_weight': 'balanced', 'clf__max_fe...",0.985765,0.004142,0.94483,0.008838
23,"{'clf__class_weight': 'balanced', 'clf__max_fe...",0.985647,0.004491,0.941856,0.011674
43,"{'clf__class_weight': 'balanced', 'clf__max_fe...",0.985598,0.004658,0.943939,0.012645
6,"{'clf__class_weight': 'balanced', 'clf__max_fe...",0.985595,0.004259,0.94471,0.008644
18,"{'clf__class_weight': 'balanced', 'clf__max_fe...",0.985542,0.005002,0.942745,0.010665


In [None]:
# With refit=False and multi-metric scoring GridSearchCV does not expose
# `best_params_`, so select the candidate ranked first on ROC AUC by hand.
best_idx = int(np.argmin(GS.cv_results_['rank_test_roc_auc']))
clf.set_params(**GS.cv_results_['params'][best_idx])
# Flatten the single-column target DataFrame to the 1-D shape estimators expect.
clf.fit(x_train, y_train.values.ravel())

# Probability of the second class (presumably spam=1 — confirm) per test email.
model_results = pd.DataFrame({"email_id": df_test.email_id, "predicted": clf.predict_proba(x_test)[:, 1]})
# Export left disabled; the previous name "lr_minmax_full.csv" would have
# overwritten the logistic-regression submission if re-enabled.
# model_results.to_csv("rf_standard_full.csv", index=False)

## GBM

### MinMax

In [None]:
# Gradient boosting on chi2-selected features, min-max scaled to [-1, 1].
# chi2 needs non-negative input, so the selector must stay ahead of the scaler.
clf = Pipeline(
    steps=[
        ('selector', SelectKBest(chi2, k=40)),
        ("preprocessor", MinMaxScaler((-1,1))),
        # max_features='log2' makes each split sample features at random;
        # seed it so the search is repeatable across runs.
        ("clf", GradientBoostingClassifier(random_state=42)),
    ]
)
# Grid over the number of selected features, ensemble size, shrinkage and split threshold.
params={'selector__k':[35,40,45,50],
        'clf__n_estimators':[80,100,150,200,250],
        'clf__learning_rate':[0.01,0.1],
        'clf__max_features': ['log2'],
        'clf__min_samples_split': [5,6,8,10]
        }
# refit=False: only cv_results_ is produced; no best_estimator_/best_params_.
GS = GridSearchCV(clf,param_grid = params, scoring = ['roc_auc','balanced_accuracy'], n_jobs=-1, cv=5, refit=False)

In [None]:
# Fit the grid search prepared in the previous cell and load its
# cross-validation summary into a DataFrame.
GS.fit(x_train, y_train)
df_results = pd.DataFrame(GS.cv_results_)

# Ten highest-scoring candidates on mean ROC AUC.
(
    df_results[[
        'params',
        'mean_test_roc_auc',
        'std_test_roc_auc',
        'mean_test_balanced_accuracy',
        'std_test_balanced_accuracy',
    ]]
    .sort_values(by='mean_test_roc_auc', ascending=False)
    .head(10)
)

Unnamed: 0,params,mean_test_roc_auc,std_test_roc_auc,mean_test_balanced_accuracy,std_test_balanced_accuracy
139,"{'clf__learning_rate': 0.1, 'clf__max_features...",0.986218,0.002668,0.946789,0.008597
98,"{'clf__learning_rate': 0.1, 'clf__max_features...",0.986205,0.002492,0.945869,0.007734
138,"{'clf__learning_rate': 0.1, 'clf__max_features...",0.986097,0.002514,0.946401,0.006499
99,"{'clf__learning_rate': 0.1, 'clf__max_features...",0.986093,0.001925,0.948862,0.007382
119,"{'clf__learning_rate': 0.1, 'clf__max_features...",0.986055,0.003226,0.946925,0.010885
118,"{'clf__learning_rate': 0.1, 'clf__max_features...",0.985999,0.003285,0.94276,0.011473
157,"{'clf__learning_rate': 0.1, 'clf__max_features...",0.985987,0.003131,0.948214,0.008488
158,"{'clf__learning_rate': 0.1, 'clf__max_features...",0.985846,0.002611,0.943026,0.007532
115,"{'clf__learning_rate': 0.1, 'clf__max_features...",0.985812,0.002902,0.947946,0.009701
159,"{'clf__learning_rate': 0.1, 'clf__max_features...",0.985748,0.00242,0.946543,0.008033


Error: Runtime no longer has a reference to this dataframe, please re-run this cell and try again.
Error: Runtime no longer has a reference to this dataframe, please re-run this cell and try again.
Error: Runtime no longer has a reference to this dataframe, please re-run this cell and try again.
Error: Runtime no longer has a reference to this dataframe, please re-run this cell and try again.


In [None]:
# With refit=False and multi-metric scoring GridSearchCV does not expose
# `best_params_`, so select the candidate ranked first on ROC AUC by hand.
best_idx = int(np.argmin(GS.cv_results_['rank_test_roc_auc']))
clf.set_params(**GS.cv_results_['params'][best_idx])
# Flatten the single-column target DataFrame to the 1-D shape estimators expect.
clf.fit(x_train, y_train.values.ravel())

# Probability of the second class (presumably spam=1 — confirm) per test email.
model_results = pd.DataFrame({"email_id": df_test.email_id, "predicted": clf.predict_proba(x_test)[:, 1]})
# Export left disabled; file name corrected from the copy-pasted "svm_minmax_full.csv".
# model_results.to_csv("gbm_minmax_full.csv", index=False)

### StandardScaler

In [None]:
# Gradient boosting on chi2-selected features, standardized after selection.
# chi2 needs non-negative input, so the selector must stay ahead of the scaler.
clf = Pipeline(
    steps=[
        ('selector', SelectKBest(chi2, k=40)),
        ("preprocessor", StandardScaler()),
        # max_features='log2' makes each split sample features at random;
        # seed it so the search is repeatable across runs.
        ("clf", GradientBoostingClassifier(random_state=42)),
    ]
)
# Grid over the number of selected features, ensemble size, shrinkage and split threshold.
params={'selector__k':[35,40,45,50],
        'clf__n_estimators':[80,100,150,200,250],
        'clf__learning_rate':[0.01,0.1],
        'clf__max_features': ['log2'],
        'clf__min_samples_split': [5,6,8,10]
        }
# refit=False: only cv_results_ is produced; no best_estimator_/best_params_.
GS = GridSearchCV(clf,param_grid = params, scoring = ['roc_auc','balanced_accuracy'], n_jobs=-1, cv=5, refit=False)

In [None]:
# Run the search from the previous cell; cv_results_ becomes a DataFrame
# with one row per parameter combination.
GS.fit(x_train, y_train)
df_results = pd.DataFrame(GS.cv_results_)

# Best ten combinations by mean ROC AUC, highest first.
(
    df_results[[
        'params',
        'mean_test_roc_auc',
        'std_test_roc_auc',
        'mean_test_balanced_accuracy',
        'std_test_balanced_accuracy',
    ]]
    .sort_values(by='mean_test_roc_auc', ascending=False)
    .head(10)
)

Unnamed: 0,params,mean_test_roc_auc,std_test_roc_auc,mean_test_balanced_accuracy,std_test_balanced_accuracy
139,"{'clf__learning_rate': 0.1, 'clf__max_features...",0.986182,0.002801,0.946917,0.009918
157,"{'clf__learning_rate': 0.1, 'clf__max_features...",0.986167,0.002912,0.945371,0.009818
158,"{'clf__learning_rate': 0.1, 'clf__max_features...",0.985983,0.002662,0.946126,0.007074
94,"{'clf__learning_rate': 0.1, 'clf__max_features...",0.985931,0.003024,0.945245,0.008646
113,"{'clf__learning_rate': 0.1, 'clf__max_features...",0.985903,0.002927,0.945488,0.010187
119,"{'clf__learning_rate': 0.1, 'clf__max_features...",0.985822,0.003564,0.946917,0.011621
97,"{'clf__learning_rate': 0.1, 'clf__max_features...",0.985781,0.002403,0.944579,0.010497
98,"{'clf__learning_rate': 0.1, 'clf__max_features...",0.985779,0.002991,0.945874,0.009087
153,"{'clf__learning_rate': 0.1, 'clf__max_features...",0.985737,0.003157,0.944201,0.009835
118,"{'clf__learning_rate': 0.1, 'clf__max_features...",0.985715,0.002369,0.944703,0.007891


In [None]:
# Hand-picked settings for the final GBM fit. `selector__k` is not set here, so
# the selector keeps whatever k the pipeline was constructed with (k=40 in the
# definition cell above).
# NOTE(review): in the saved search output this combination looks like
# candidate 157 (second by ROC AUC), not the top row — confirm it was chosen
# deliberately.
clf.set_params(**{
    'clf__learning_rate': 0.1,
    'clf__max_features': 'log2',
    'clf__min_samples_split': 10,
    'clf__n_estimators': 250,
})
# y_train is a single-column DataFrame; pass a 1-D array to avoid the
# DataConversionWarning raised for column-vector targets.
clf.fit(x_train, y_train.values.ravel())

# Probability of the second class (presumably spam=1 — confirm) per test email.
model_results = pd.DataFrame({"email_id": df_test.email_id, "predicted": clf.predict_proba(x_test)[:, 1]})
# model_results.to_csv("gbm_standard_full.csv", index=False)

  y = column_or_1d(y, warn=True)


# Just because: TF-IDF + TruncatedSVD experiment

In [None]:
from sklearn.decomposition import TruncatedSVD,PCA

In [None]:
# Three feature branches concatenated side by side: standardized non-frequency
# columns, TF-IDF-weighted word frequencies, and a TruncatedSVD projection of
# the same raw word-frequency columns (so those columns feed two branches).
# The SVD step keeps the name 'pca' because the grid key below depends on it.
ct = ColumnTransformer([
    ("scaler", StandardScaler(), columns_no_freq),
    ('tfidf', TfidfTransformer(), columns_freq),
    ('pca', TruncatedSVD(n_iter=7, random_state=42), columns_freq),
])

clf = Pipeline(
    steps=[
        ("pre", ct),
        # Seeded like the SVD step so the search is repeatable across runs.
        ("clf", RandomForestClassifier(random_state=42)),
    ]
)

# Grid over the SVD rank and the forest size / split threshold.
params={'pre__pca__n_components':[25,30,40],
        'clf__n_estimators':[100,150,200,250],
        'clf__class_weight':['balanced'],
        'clf__max_features': ['log2'],
        'clf__min_samples_split': [3,5,8,10,15]
        }
# refit=False: only cv_results_ is produced; no best_estimator_/best_params_.
GS = GridSearchCV(clf,param_grid = params, scoring = ['roc_auc','balanced_accuracy'], n_jobs=-1, cv=5, refit=False)

In [None]:
# Fit the search defined in the previous cell and capture the
# cross-validation scores for every candidate.
GS.fit(x_train, y_train)
df_results = pd.DataFrame(GS.cv_results_)

# Top ten candidates by mean ROC AUC, with balanced accuracy for comparison.
(
    df_results[[
        'params',
        'mean_test_roc_auc',
        'std_test_roc_auc',
        'mean_test_balanced_accuracy',
        'std_test_balanced_accuracy',
    ]]
    .sort_values(by='mean_test_roc_auc', ascending=False)
    .head(10)
)

Unnamed: 0,params,mean_test_roc_auc,std_test_roc_auc,mean_test_balanced_accuracy,std_test_balanced_accuracy
9,"{'clf__class_weight': 'balanced', 'clf__max_fe...",0.986824,0.00346,0.94364,0.006358
11,"{'clf__class_weight': 'balanced', 'clf__max_fe...",0.986706,0.003175,0.945085,0.005199
13,"{'clf__class_weight': 'balanced', 'clf__max_fe...",0.986567,0.002729,0.944189,0.006791
21,"{'clf__class_weight': 'balanced', 'clf__max_fe...",0.98649,0.002874,0.943804,0.007114
19,"{'clf__class_weight': 'balanced', 'clf__max_fe...",0.986488,0.003063,0.944693,0.007147
10,"{'clf__class_weight': 'balanced', 'clf__max_fe...",0.986464,0.003244,0.940794,0.00835
17,"{'clf__class_weight': 'balanced', 'clf__max_fe...",0.986448,0.002894,0.944568,0.007197
16,"{'clf__class_weight': 'balanced', 'clf__max_fe...",0.986383,0.002781,0.944056,0.008218
6,"{'clf__class_weight': 'balanced', 'clf__max_fe...",0.98625,0.003273,0.94171,0.007525
46,"{'clf__class_weight': 'balanced', 'clf__max_fe...",0.986246,0.002863,0.944833,0.008959


In [None]:
# Hand-picked settings for the final TF-IDF + SVD random-forest fit.
# NOTE(review): in the saved search output this combination looks like
# candidate 11 (second by ROC AUC), not the top row — confirm it was chosen
# deliberately.
clf.set_params(**{
    'clf__class_weight': 'balanced',
    'clf__max_features': 'log2',
    'clf__min_samples_split': 3,
    'clf__n_estimators': 250,
    'pre__pca__n_components': 40,
})
# y_train is a single-column DataFrame; pass a 1-D array to avoid the
# DataConversionWarning raised for column-vector targets.
clf.fit(x_train, y_train.values.ravel())

# Probability of the second class (presumably spam=1 — confirm) per test email.
model_results = pd.DataFrame({"email_id": df_test.email_id, "predicted": clf.predict_proba(x_test)[:, 1]})
# The "tifdf" spelling is kept so the output path stays unchanged.
model_results.to_csv("rf_standard_tifdf_pca_full.csv", index=False)

  self._final_estimator.fit(Xt, y, **fit_params_last_step)
