In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.decomposition import TruncatedSVD,PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import train_test_split 
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler,StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import RocCurveDisplay,roc_auc_score
from sklearn.model_selection import GridSearchCV

from sklearn.feature_extraction.text import TfidfTransformer

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.svm import SVC

In [3]:
path = "/content/drive/MyDrive/Materias/MLEA_M/Coursework/"

df_train = pd.read_csv(path + "spam_dataset.csv")
df_test = pd.read_csv(path + "spam_dataset_test.csv")

In [4]:
x_test=df_test.drop("email_id",axis=1)

In [5]:
df_train2 = df_train.drop('email_id', axis=1)
x = df_train2.drop('spam',axis=1)
y = df_train2[['spam']]
x_train, x_val, y_train, y_val = train_test_split(df_train2.drop('spam',axis=1), df_train2[['spam']], test_size=0.3, random_state=12) 

In [6]:
columns_all= list(df_train.drop(['email_id','spam'],axis=1))
columns_freq = list(filter(lambda x: 'word_freq' in x, list(df_train.drop(['email_id','spam'], axis=1).columns)))
columns_no_freq = list(filter(lambda x: 'word_freq' not in x, list(df_train.drop(['email_id','spam'], axis=1).columns)))

In [None]:
y_train.shape, x_test.shape

((2254, 1), (1381, 57))

# PCA


In [15]:
ct = ColumnTransformer([("scaler", StandardScaler(), columns_all),
                        ('pca',TruncatedSVD(n_iter=7, random_state=42),columns_freq),
                        ('pca2',PCA(n_components=3, random_state=42),columns_no_freq[0:6]),
                        ('pca3',PCA(n_components=2, random_state=42),columns_no_freq[6:])])

## LR

In [8]:
clf = Pipeline(
    steps=[("pre", ct),("clf", LogisticRegression(max_iter=500))]
)
params={'pre__pca__n_components':[25,30,40],
        'clf__C':[0.01,0.1,1,10,80,100,300],
        'clf__penalty':['l1', 'l2'],
        'clf__class_weight':['balanced'],
        'clf__solver': ['liblinear'],
        }

GS = GridSearchCV(clf,param_grid = params, scoring = ['roc_auc','balanced_accuracy'], n_jobs=-1, cv=5, refit=False)

In [None]:
GS.fit(x_train,y_train)
df_results = pd.DataFrame(GS.cv_results_)
df_results[['params','mean_test_roc_auc', 'std_test_roc_auc','mean_test_balanced_accuracy','std_test_balanced_accuracy']].sort_values(by='mean_test_roc_auc',ascending=False).head(10)

Unnamed: 0,params,mean_test_roc_auc,std_test_roc_auc,mean_test_balanced_accuracy,std_test_balanced_accuracy
17,"{'clf__C': 1, 'clf__class_weight': 'balanced',...",0.968533,0.005543,0.920046,0.011068
15,"{'clf__C': 1, 'clf__class_weight': 'balanced',...",0.968406,0.005782,0.920223,0.010874
16,"{'clf__C': 1, 'clf__class_weight': 'balanced',...",0.968398,0.005751,0.920046,0.011068
12,"{'clf__C': 1, 'clf__class_weight': 'balanced',...",0.96835,0.005471,0.920987,0.009162
13,"{'clf__C': 1, 'clf__class_weight': 'balanced',...",0.968309,0.005503,0.920987,0.009162
14,"{'clf__C': 1, 'clf__class_weight': 'balanced',...",0.968136,0.005539,0.920796,0.009789
22,"{'clf__C': 10, 'clf__class_weight': 'balanced'...",0.96709,0.006458,0.917294,0.012084
11,"{'clf__C': 0.1, 'clf__class_weight': 'balanced...",0.967049,0.004223,0.916332,0.00551
10,"{'clf__C': 0.1, 'clf__class_weight': 'balanced...",0.966971,0.004236,0.916699,0.005641
9,"{'clf__C': 0.1, 'clf__class_weight': 'balanced...",0.96695,0.004191,0.916699,0.005641


## SVM

In [None]:
clf = Pipeline(
    steps=[("pre", ct),("clf", SVC())]
)
params={'pre__pca__n_components':[25,30,40],
        'clf__C':[0.1,1,5,10,20,50,80],
        'clf__kernel':['rbf'],
        'clf__class_weight':['balanced'],
        'clf__gamma': ['auto','scale'],
        }
GS = GridSearchCV(clf,param_grid = params, scoring = ['roc_auc','balanced_accuracy'], n_jobs=-1, cv=5, refit=False)

In [None]:
GS.fit(x_train,y_train)
df_results = pd.DataFrame(GS.cv_results_)
df_results[['params','mean_test_roc_auc', 'std_test_roc_auc','mean_test_balanced_accuracy','std_test_balanced_accuracy']].sort_values(by='mean_test_roc_auc',ascending=False).head(10)

Unnamed: 0,params,mean_test_roc_auc,std_test_roc_auc,mean_test_balanced_accuracy,std_test_balanced_accuracy
41,"{'clf__C': 80, 'clf__class_weight': 'balanced'...",0.922036,0.010698,0.848081,0.007423
40,"{'clf__C': 80, 'clf__class_weight': 'balanced'...",0.921897,0.01081,0.848081,0.007423
39,"{'clf__C': 80, 'clf__class_weight': 'balanced'...",0.92172,0.010864,0.848268,0.007237
14,"{'clf__C': 5, 'clf__class_weight': 'balanced',...",0.909762,0.009632,0.832969,0.015681
20,"{'clf__C': 10, 'clf__class_weight': 'balanced'...",0.908303,0.010303,0.832412,0.014629
13,"{'clf__C': 5, 'clf__class_weight': 'balanced',...",0.908274,0.010054,0.832409,0.014707
12,"{'clf__C': 5, 'clf__class_weight': 'balanced',...",0.907932,0.010309,0.832222,0.013609
35,"{'clf__C': 50, 'clf__class_weight': 'balanced'...",0.906602,0.011436,0.8335,0.007642
34,"{'clf__C': 50, 'clf__class_weight': 'balanced'...",0.906602,0.011354,0.8335,0.007642
33,"{'clf__C': 50, 'clf__class_weight': 'balanced'...",0.906368,0.011394,0.832573,0.00859


## RF

In [None]:
clf = Pipeline(
    steps=[("pre", ct),("clf", RandomForestClassifier())]
)

params={'pre__pca__n_components':[25,30,40],
        'clf__n_estimators':[50,100,150,200,250],
        'clf__class_weight':['balanced'],
        'clf__max_features': ['sqrt','log2'],
        'clf__min_samples_split': [3,5,8,10]
        }
GS = GridSearchCV(clf,param_grid = params, scoring = ['roc_auc','balanced_accuracy'], n_jobs=-1, cv=5, refit=False)

In [None]:
GS.fit(x_train,y_train)
df_results = pd.DataFrame(GS.cv_results_)
df_results[['params','mean_test_roc_auc', 'std_test_roc_auc','mean_test_balanced_accuracy','std_test_balanced_accuracy']].sort_values(by='mean_test_roc_auc',ascending=False).head(10)

Unnamed: 0,params,mean_test_roc_auc,std_test_roc_auc,mean_test_balanced_accuracy,std_test_balanced_accuracy
4,"{'clf__class_weight': 'balanced', 'clf__max_fe...",0.983071,0.001861,0.941063,0.00725
73,"{'clf__class_weight': 'balanced', 'clf__max_fe...",0.982721,0.001913,0.93807,0.0063
67,"{'clf__class_weight': 'balanced', 'clf__max_fe...",0.982666,0.002094,0.939738,0.006587
85,"{'clf__class_weight': 'balanced', 'clf__max_fe...",0.982592,0.002887,0.939577,0.010425
66,"{'clf__class_weight': 'balanced', 'clf__max_fe...",0.982496,0.002245,0.938072,0.008664
87,"{'clf__class_weight': 'balanced', 'clf__max_fe...",0.982465,0.001909,0.938069,0.007762
89,"{'clf__class_weight': 'balanced', 'clf__max_fe...",0.982391,0.002217,0.937718,0.009962
70,"{'clf__class_weight': 'balanced', 'clf__max_fe...",0.982341,0.002047,0.937704,0.007992
112,"{'clf__class_weight': 'balanced', 'clf__max_fe...",0.982325,0.002028,0.936982,0.005927
72,"{'clf__class_weight': 'balanced', 'clf__max_fe...",0.982226,0.002024,0.939934,0.00951


In [None]:
clf.set_params(**	{'clf__class_weight': 'balanced', 'clf__max_features': 'log2', 'clf__min_samples_split': 3, 'clf__n_estimators': 250, 'pre__pca__n_components': 40})
clf.fit(x_train,y_train)

model_results = pd.DataFrame({"email_id": df_test.email_id, "predicted" : clf.predict_proba(x_test)[:,1]})
# model_results.to_csv("rf_standard_tifdf_pca_full.csv", index=False)

  self._final_estimator.fit(Xt, y, **fit_params_last_step)


## GBM

In [16]:
clf = Pipeline(
    steps=[("pre", ct),("clf", GradientBoostingClassifier())]
)
params={'pre__pca__n_components':[25,30,40],
        'clf__n_estimators':[80,100,150,200,250],
        'clf__learning_rate':[0.01,0.1],
        'clf__max_features': ['log2'],
        'clf__min_samples_split': [5,6,8,10]
        }
GS = GridSearchCV(clf,param_grid = params, scoring = ['roc_auc','balanced_accuracy'], n_jobs=-1, cv=5, refit=False)

In [None]:
GS.fit(x_train,y_train)
df_results = pd.DataFrame(GS.cv_results_)
df_results[['params','mean_test_roc_auc', 'std_test_roc_auc','mean_test_balanced_accuracy','std_test_balanced_accuracy']].sort_values(by='mean_test_roc_auc',ascending=False).head(10)

Unnamed: 0,params,mean_test_roc_auc,std_test_roc_auc,mean_test_balanced_accuracy,std_test_balanced_accuracy
70,"{'clf__learning_rate': 0.1, 'clf__max_features...",0.985709,0.003534,0.947539,0.010069
104,"{'clf__learning_rate': 0.1, 'clf__max_features...",0.985378,0.003551,0.944389,0.004235
87,"{'clf__learning_rate': 0.1, 'clf__max_features...",0.985299,0.003075,0.946246,0.007967
101,"{'clf__learning_rate': 0.1, 'clf__max_features...",0.985139,0.003745,0.944393,0.009193
118,"{'clf__learning_rate': 0.1, 'clf__max_features...",0.985107,0.004313,0.944027,0.003516
119,"{'clf__learning_rate': 0.1, 'clf__max_features...",0.985053,0.004206,0.943645,0.007632
73,"{'clf__learning_rate': 0.1, 'clf__max_features...",0.985009,0.00382,0.945329,0.005624
99,"{'clf__learning_rate': 0.1, 'clf__max_features...",0.984859,0.004088,0.946794,0.008217
72,"{'clf__learning_rate': 0.1, 'clf__max_features...",0.984836,0.005048,0.946449,0.006241
102,"{'clf__learning_rate': 0.1, 'clf__max_features...",0.984826,0.003584,0.944406,0.008293


In [17]:
from sklearn import set_config
set_config(display="diagram")

In [19]:
clf

# PCA a todo


In [None]:
ct = ColumnTransformer([("scaler", StandardScaler(), columns_all),
                        ('pca',TruncatedSVD(n_iter=7, random_state=42),columns_freq)])

## LR

In [None]:
clf = Pipeline(
    steps=[("pre", ct),("clf", LogisticRegression(max_iter=500))]
)
params={'pre__pca__n_components':[25,30,40],
        'clf__C':[0.01,0.1,1,10,80,100,300],
        'clf__penalty':['l1', 'l2'],
        'clf__class_weight':['balanced'],
        'clf__solver': ['liblinear'],
        }

GS = GridSearchCV(clf,param_grid = params, scoring = ['roc_auc','balanced_accuracy'], n_jobs=-1, cv=5, refit=False)

In [None]:
GS.fit(x_train,y_train)
df_results = pd.DataFrame(GS.cv_results_)
df_results[['params','mean_test_roc_auc', 'std_test_roc_auc','mean_test_balanced_accuracy','std_test_balanced_accuracy']].sort_values(by='mean_test_roc_auc',ascending=False).head(10)

Unnamed: 0,params,mean_test_roc_auc,std_test_roc_auc,mean_test_balanced_accuracy,std_test_balanced_accuracy
15,"{'clf__C': 1, 'clf__class_weight': 'balanced',...",0.968266,0.005503,0.920974,0.009764
12,"{'clf__C': 1, 'clf__class_weight': 'balanced',...",0.968256,0.005187,0.920056,0.010094
16,"{'clf__C': 1, 'clf__class_weight': 'balanced',...",0.96825,0.005522,0.920974,0.009764
13,"{'clf__C': 1, 'clf__class_weight': 'balanced',...",0.968243,0.005193,0.920056,0.010094
17,"{'clf__C': 1, 'clf__class_weight': 'balanced',...",0.968229,0.00556,0.920605,0.010015
14,"{'clf__C': 1, 'clf__class_weight': 'balanced',...",0.968223,0.005204,0.920056,0.010094
11,"{'clf__C': 0.1, 'clf__class_weight': 'balanced...",0.966639,0.003619,0.91578,0.007804
9,"{'clf__C': 0.1, 'clf__class_weight': 'balanced...",0.966611,0.00364,0.91578,0.007601
10,"{'clf__C': 0.1, 'clf__class_weight': 'balanced...",0.966582,0.003641,0.91578,0.007601
21,"{'clf__C': 10, 'clf__class_weight': 'balanced'...",0.96656,0.007139,0.919142,0.012411


## SVM

In [None]:
clf = Pipeline(
    steps=[("pre", ct),("clf", SVC())]
)
params={'pre__pca__n_components':[25,30,40],
        'clf__C':[0.1,1,5,10,20,50,80],
        'clf__kernel':['rbf'],
        'clf__class_weight':['balanced'],
        'clf__gamma': ['auto','scale'],
        }
GS = GridSearchCV(clf,param_grid = params, scoring = ['roc_auc','balanced_accuracy'], n_jobs=-1, cv=5, refit=False)

In [None]:
GS.fit(x_train,y_train)
df_results = pd.DataFrame(GS.cv_results_)
df_results[['params','mean_test_roc_auc', 'std_test_roc_auc','mean_test_balanced_accuracy','std_test_balanced_accuracy']].sort_values(by='mean_test_roc_auc',ascending=False).head(10)

Unnamed: 0,params,mean_test_roc_auc,std_test_roc_auc,mean_test_balanced_accuracy,std_test_balanced_accuracy
12,"{'clf__C': 5, 'clf__class_weight': 'balanced',...",0.972467,0.010375,0.926428,0.007349
14,"{'clf__C': 5, 'clf__class_weight': 'balanced',...",0.972258,0.009893,0.929033,0.00575
15,"{'clf__C': 5, 'clf__class_weight': 'balanced',...",0.972254,0.010023,0.928664,0.006276
16,"{'clf__C': 5, 'clf__class_weight': 'balanced',...",0.972189,0.009837,0.928296,0.005688
17,"{'clf__C': 5, 'clf__class_weight': 'balanced',...",0.972172,0.009872,0.928295,0.005863
13,"{'clf__C': 5, 'clf__class_weight': 'balanced',...",0.972168,0.010057,0.927355,0.005827
21,"{'clf__C': 10, 'clf__class_weight': 'balanced'...",0.972099,0.010647,0.924016,0.007973
20,"{'clf__C': 10, 'clf__class_weight': 'balanced'...",0.972079,0.010504,0.923647,0.008304
22,"{'clf__C': 10, 'clf__class_weight': 'balanced'...",0.972022,0.010561,0.923647,0.008304
23,"{'clf__C': 10, 'clf__class_weight': 'balanced'...",0.971972,0.010507,0.924015,0.00793


## RF

In [None]:
clf = Pipeline(
    steps=[("pre", ct),("clf", RandomForestClassifier())]
)

params={'pre__pca__n_components':[25,30,40],
        'clf__n_estimators':[50,100,150,200,250],
        'clf__class_weight':['balanced'],
        'clf__max_features': ['log2'],
        'clf__min_samples_split': [3,5,8,10,15]
        }
GS = GridSearchCV(clf,param_grid = params, scoring = ['roc_auc','balanced_accuracy'], n_jobs=-1, cv=5, refit=False)

In [None]:
GS.fit(x_train,y_train)
df_results = pd.DataFrame(GS.cv_results_)
df_results[['params','mean_test_roc_auc', 'std_test_roc_auc','mean_test_balanced_accuracy','std_test_balanced_accuracy']].sort_values(by='mean_test_roc_auc',ascending=False).head(10)

Unnamed: 0,params,mean_test_roc_auc,std_test_roc_auc,mean_test_balanced_accuracy,std_test_balanced_accuracy
24,"{'clf__class_weight': 'balanced', 'clf__max_fe...",0.983988,0.002931,0.939577,0.01049
10,"{'clf__class_weight': 'balanced', 'clf__max_fe...",0.983909,0.003216,0.939184,0.007703
26,"{'clf__class_weight': 'balanced', 'clf__max_fe...",0.983883,0.003806,0.939403,0.009511
27,"{'clf__class_weight': 'balanced', 'clf__max_fe...",0.98384,0.003326,0.941426,0.008478
23,"{'clf__class_weight': 'balanced', 'clf__max_fe...",0.9838,0.003204,0.939576,0.010128
13,"{'clf__class_weight': 'balanced', 'clf__max_fe...",0.983791,0.003267,0.941985,0.005337
42,"{'clf__class_weight': 'balanced', 'clf__max_fe...",0.983774,0.002786,0.943284,0.01276
12,"{'clf__class_weight': 'balanced', 'clf__max_fe...",0.983739,0.003273,0.939397,0.009921
14,"{'clf__class_weight': 'balanced', 'clf__max_fe...",0.983672,0.003308,0.939203,0.006684
9,"{'clf__class_weight': 'balanced', 'clf__max_fe...",0.983665,0.003074,0.940328,0.009266


In [None]:
clf.set_params(**	{'clf__class_weight': 'balanced', 'clf__max_features': 'log2', 'clf__min_samples_split': 3, 'clf__n_estimators': 250, 'pre__pca__n_components': 40})
clf.fit(x_train,y_train)

model_results = pd.DataFrame({"email_id": df_test.email_id, "predicted" : clf.predict_proba(x_test)[:,1]})
# model_results.to_csv("rf_standard_tifdf_pca_full.csv", index=False)

  self._final_estimator.fit(Xt, y, **fit_params_last_step)


## GBM

In [13]:
clf = Pipeline(
    steps=[("pre", ct),("clf", GradientBoostingClassifier())]
)
params={'pre__pca__n_components':[25,30,40],
        'clf__n_estimators':[80,100,150,200,250],
        'clf__learning_rate':[0.01,0.1],
        'clf__max_features': ['log2'],
        'clf__min_samples_split': [5,6,8,10]
        }
GS = GridSearchCV(clf,param_grid = params, scoring = ['roc_auc','balanced_accuracy'], n_jobs=-1, cv=5, refit=False)

In [None]:
GS.fit(x_train,y_train)
df_results = pd.DataFrame(GS.cv_results_)
df_results[['params','mean_test_roc_auc', 'std_test_roc_auc','mean_test_balanced_accuracy','std_test_balanced_accuracy']].sort_values(by='mean_test_roc_auc',ascending=False).head(10)

Unnamed: 0,params,mean_test_roc_auc,std_test_roc_auc,mean_test_balanced_accuracy,std_test_balanced_accuracy
74,"{'clf__learning_rate': 0.1, 'clf__max_features...",0.985315,0.004122,0.944225,0.006696
119,"{'clf__learning_rate': 0.1, 'clf__max_features...",0.985288,0.003003,0.944035,0.004437
87,"{'clf__learning_rate': 0.1, 'clf__max_features...",0.98517,0.004549,0.945705,0.006786
118,"{'clf__learning_rate': 0.1, 'clf__max_features...",0.985071,0.004892,0.945331,0.00862
102,"{'clf__learning_rate': 0.1, 'clf__max_features...",0.985059,0.004045,0.948659,0.007912
70,"{'clf__learning_rate': 0.1, 'clf__max_features...",0.985037,0.004019,0.942182,0.004402
116,"{'clf__learning_rate': 0.1, 'clf__max_features...",0.984948,0.004054,0.94588,0.011129
104,"{'clf__learning_rate': 0.1, 'clf__max_features...",0.984933,0.00364,0.945328,0.007416
84,"{'clf__learning_rate': 0.1, 'clf__max_features...",0.984837,0.004064,0.942364,0.004184
99,"{'clf__learning_rate': 0.1, 'clf__max_features...",0.984818,0.004941,0.943476,0.005987


# TF-IDF PCA


In [None]:
ct = ColumnTransformer([("scaler", StandardScaler(), columns_no_freq),
                        ('tfidf',TfidfTransformer(),columns_freq),
                        ('pca',TruncatedSVD(n_iter=7, random_state=42),columns_freq)])

## LR

In [None]:
clf = Pipeline(
    steps=[("pre", ct),("clf", LogisticRegression(max_iter=500))]
)
params={'pre__pca__n_components':[40],
        'clf__C':[0.01,0.1,1,80,100],
        'clf__penalty':['l1', 'l2'],
        'clf__class_weight':['balanced'],
        'clf__solver': ['liblinear'],
        }

GS = GridSearchCV(clf,param_grid = params, scoring = ['roc_auc','balanced_accuracy'], n_jobs=-1, cv=5, refit=False)

In [None]:
GS.fit(x_train,y_train)
df_results = pd.DataFrame(GS.cv_results_) 
df_results[['params','mean_test_roc_auc', 'std_test_roc_auc','mean_test_balanced_accuracy','std_test_balanced_accuracy']].sort_values(by='mean_test_roc_auc',ascending=False).head(10)

Unnamed: 0,params,mean_test_roc_auc,std_test_roc_auc,mean_test_balanced_accuracy,std_test_balanced_accuracy
5,"{'clf__C': 1, 'clf__class_weight': 'balanced',...",0.971517,0.005692,0.924139,0.009491
4,"{'clf__C': 1, 'clf__class_weight': 'balanced',...",0.970256,0.006358,0.921372,0.01018
7,"{'clf__C': 80, 'clf__class_weight': 'balanced'...",0.968602,0.009014,0.920982,0.01133
9,"{'clf__C': 100, 'clf__class_weight': 'balanced...",0.968171,0.009249,0.920982,0.01133
3,"{'clf__C': 0.1, 'clf__class_weight': 'balanced...",0.967567,0.00407,0.923777,0.007915
6,"{'clf__C': 80, 'clf__class_weight': 'balanced'...",0.964879,0.011185,0.919312,0.012921
8,"{'clf__C': 100, 'clf__class_weight': 'balanced...",0.964689,0.011286,0.919312,0.012921
2,"{'clf__C': 0.1, 'clf__class_weight': 'balanced...",0.959843,0.00596,0.914901,0.008117
1,"{'clf__C': 0.01, 'clf__class_weight': 'balance...",0.959209,0.004811,0.915143,0.005451
0,"{'clf__C': 0.01, 'clf__class_weight': 'balance...",0.92668,0.013045,0.854362,0.015056


## SVM

In [None]:
clf = Pipeline(
    steps=[("pre", ct),("clf", SVC())]
)
params={'pre__pca__n_components':[25,30,40],
        'clf__C':[0.1,1,5,10,20,50,80],
        'clf__kernel':['rbf'],
        'clf__class_weight':['balanced'],
        'clf__gamma': ['auto','scale'],
        }
GS = GridSearchCV(clf,param_grid = params, scoring = ['roc_auc','balanced_accuracy'], n_jobs=-1, cv=5, refit=False)

In [None]:
GS.fit(x_train,y_train)
df_results = pd.DataFrame(GS.cv_results_)
df_results[['params','mean_test_roc_auc', 'std_test_roc_auc','mean_test_balanced_accuracy','std_test_balanced_accuracy']].sort_values(by='mean_test_roc_auc',ascending=False).head(10)

Unnamed: 0,params,mean_test_roc_auc,std_test_roc_auc,mean_test_balanced_accuracy,std_test_balanced_accuracy
17,"{'clf__C': 5, 'clf__class_weight': 'balanced',...",0.974961,0.010233,0.930355,0.008097
23,"{'clf__C': 10, 'clf__class_weight': 'balanced'...",0.974198,0.010156,0.929417,0.005856
15,"{'clf__C': 5, 'clf__class_weight': 'balanced',...",0.974105,0.011683,0.93127,0.006981
16,"{'clf__C': 5, 'clf__class_weight': 'balanced',...",0.973715,0.01136,0.930718,0.007358
21,"{'clf__C': 10, 'clf__class_weight': 'balanced'...",0.973275,0.012118,0.932013,0.006672
22,"{'clf__C': 10, 'clf__class_weight': 'balanced'...",0.973247,0.011601,0.929227,0.005561
11,"{'clf__C': 1, 'clf__class_weight': 'balanced',...",0.973234,0.00902,0.926471,0.007185
14,"{'clf__C': 5, 'clf__class_weight': 'balanced',...",0.973083,0.010449,0.929455,0.009566
20,"{'clf__C': 10, 'clf__class_weight': 'balanced'...",0.972993,0.009925,0.930378,0.008175
12,"{'clf__C': 5, 'clf__class_weight': 'balanced',...",0.972707,0.011688,0.928343,0.006619


## RF

In [None]:
ct = ColumnTransformer([("scaler", StandardScaler(), columns_no_freq),
                        ('tfidf',TfidfTransformer(),columns_freq),
                        ('pca',TruncatedSVD(n_iter=7, random_state=42),columns_freq)])

clf = Pipeline(
    steps=[("pre", ct),("clf", RandomForestClassifier())]
)

params={'pre__pca__n_components':[25,30,40],
        'clf__n_estimators':[100,150,200,250],
        'clf__class_weight':['balanced'],
        'clf__max_features': ['log2'],
        'clf__min_samples_split': [3,5,8,10,15]
        }
GS = GridSearchCV(clf,param_grid = params, scoring = ['roc_auc','balanced_accuracy'], n_jobs=-1, cv=5, refit=False)

In [None]:
GS.fit(x_train,y_train)
df_results = pd.DataFrame(GS.cv_results_)
df_results[['params','mean_test_roc_auc', 'std_test_roc_auc','mean_test_balanced_accuracy','std_test_balanced_accuracy']].sort_values(by='mean_test_roc_auc',ascending=False).head(10)

Unnamed: 0,params,mean_test_roc_auc,std_test_roc_auc,mean_test_balanced_accuracy,std_test_balanced_accuracy
21,"{'clf__class_weight': 'balanced', 'clf__max_fe...",0.984554,0.003373,0.939949,0.008354
11,"{'clf__class_weight': 'balanced', 'clf__max_fe...",0.984511,0.00382,0.942541,0.01032
9,"{'clf__class_weight': 'balanced', 'clf__max_fe...",0.984419,0.003606,0.945143,0.007353
28,"{'clf__class_weight': 'balanced', 'clf__max_fe...",0.984374,0.003588,0.942555,0.007542
22,"{'clf__class_weight': 'balanced', 'clf__max_fe...",0.984349,0.003442,0.939209,0.009904
18,"{'clf__class_weight': 'balanced', 'clf__max_fe...",0.984284,0.003717,0.939581,0.008393
4,"{'clf__class_weight': 'balanced', 'clf__max_fe...",0.984267,0.002896,0.939007,0.005619
6,"{'clf__class_weight': 'balanced', 'clf__max_fe...",0.984251,0.003835,0.940505,0.009596
3,"{'clf__class_weight': 'balanced', 'clf__max_fe...",0.984249,0.00318,0.942555,0.011348
29,"{'clf__class_weight': 'balanced', 'clf__max_fe...",0.984241,0.003612,0.939575,0.006585


In [None]:
clf.set_params(**	{'clf__class_weight': 'balanced', 'clf__max_features': 'log2', 'clf__min_samples_split': 5, 'clf__n_estimators': 250, 'pre__pca__n_components': 25})
clf.fit(x_train,y_train)

model_results = pd.DataFrame({"email_id": df_test.email_id, "predicted" : clf.predict_proba(x_test)[:,1]})
model_results.to_csv("rf_standard_tifdf_pca_full.csv", index=False)

  self._final_estimator.fit(Xt, y, **fit_params_last_step)


## GBM

In [None]:
ct = ColumnTransformer([("scaler", StandardScaler(), columns_no_freq),
                        ('tfidf',TfidfTransformer(),columns_freq),
                        ('pca',TruncatedSVD(n_iter=7, random_state=42),columns_freq)])
clf = Pipeline(
    steps=[("pre", ct),("clf", GradientBoostingClassifier())]
)
params={'pre__pca__n_components':[25,30,40],
        'clf__n_estimators':[80,100,150,200,250],
        'clf__learning_rate':[0.01,0.1],
        'clf__max_features': ['log2'],
        'clf__min_samples_split': [5,6,8,10]
        }
GS = GridSearchCV(clf,param_grid = params, scoring = ['roc_auc','balanced_accuracy'], n_jobs=-1, cv=5, refit=False)

In [None]:
GS.fit(x_train,y_train)
df_results = pd.DataFrame(GS.cv_results_)
df_results[['params','mean_test_roc_auc', 'std_test_roc_auc','mean_test_balanced_accuracy','std_test_balanced_accuracy']].sort_values(by='mean_test_roc_auc',ascending=False).head(10)

Unnamed: 0,params,mean_test_roc_auc,std_test_roc_auc,mean_test_balanced_accuracy,std_test_balanced_accuracy
118,"{'clf__learning_rate': 0.1, 'clf__max_features...",0.98558,0.004317,0.944034,0.007253
71,"{'clf__learning_rate': 0.1, 'clf__max_features...",0.985466,0.003077,0.945888,0.006639
74,"{'clf__learning_rate': 0.1, 'clf__max_features...",0.985204,0.00391,0.946439,0.007452
117,"{'clf__learning_rate': 0.1, 'clf__max_features...",0.985079,0.004543,0.947924,0.004294
102,"{'clf__learning_rate': 0.1, 'clf__max_features...",0.984918,0.004992,0.946827,0.01083
101,"{'clf__learning_rate': 0.1, 'clf__max_features...",0.984826,0.004093,0.949047,0.008171
119,"{'clf__learning_rate': 0.1, 'clf__max_features...",0.984808,0.00405,0.944774,0.00811
68,"{'clf__learning_rate': 0.1, 'clf__max_features...",0.984569,0.003926,0.942555,0.007211
70,"{'clf__learning_rate': 0.1, 'clf__max_features...",0.984569,0.003795,0.943862,0.011196
103,"{'clf__learning_rate': 0.1, 'clf__max_features...",0.984505,0.004301,0.941448,0.007203


Error: Runtime no longer has a reference to this dataframe, please re-run this cell and try again.
Error: Runtime no longer has a reference to this dataframe, please re-run this cell and try again.


In [None]:
clf.set_params(**{'clf__learning_rate': 0.1, 'clf__max_features': 'log2', 'clf__min_samples_split': 5, 'clf__n_estimators': 200, 'pre__pca__n_components': 40})
clf.fit(x_train,y_train)

model_results = pd.DataFrame({"email_id": df_test.email_id, "predicted" : clf.predict_proba(x_test)[:,1]})
model_results.to_csv("gbm_standard_tifdf_pca_full.csv", index=False)

  y = column_or_1d(y, warn=True)


# Variance selection

In [None]:
selector = Pipeline(
    steps=[("preprocessor", MinMaxScaler((-1,1)))]
)
selector.fit(x_train)

Pipeline(steps=[('preprocessor', MinMaxScaler(feature_range=(-1, 1)))])

In [None]:
x_train

Unnamed: 0,word_freq_make,word_freq_address,word_freq_all,word_freq_3d,word_freq_our,word_freq_over,word_freq_remove,word_freq_internet,word_freq_order,word_freq_mail,...,word_freq_conference,char_freq_semicolon,char_freq_bracket,char_freq_sqbracket,char_freq_excl,char_freq_dollar,char_freq_hash,capital_run_length_average,capital_run_length_longest,capital_run_length_total
2257,0.00,0.00,0.00,0.0,0.00,0.00,0.00,0.00,0.00,1.01,...,0.0,0.000,0.000,0.312,0.000,0.000,0.000,4.030,28,133
2269,0.00,0.00,0.00,0.0,0.00,0.00,0.00,0.00,1.81,0.00,...,0.0,0.000,0.316,0.000,0.000,0.000,0.000,1.125,2,9
105,0.00,0.00,0.00,0.0,0.51,0.00,0.00,0.00,0.00,0.00,...,0.0,0.000,0.000,0.000,0.281,0.000,0.000,1.363,5,45
2309,0.00,0.00,0.20,0.0,0.40,0.20,0.00,0.00,0.00,0.00,...,0.0,0.000,0.000,0.000,0.382,0.223,0.478,7.538,55,490
1215,0.71,0.00,0.11,0.0,0.47,0.11,0.00,0.59,0.71,2.86,...,0.0,0.072,0.127,0.000,0.418,0.254,0.018,9.705,148,1514
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1987,0.05,0.05,0.40,0.0,0.34,0.00,0.00,0.00,0.57,0.05,...,0.0,0.019,0.099,0.000,0.099,0.079,0.009,4.881,95,1313
1283,0.73,0.00,0.36,0.0,0.36,0.36,1.10,0.00,0.00,0.00,...,0.0,0.000,0.186,0.000,0.435,0.062,0.000,4.411,190,300
1414,0.25,0.00,0.51,0.0,0.25,1.28,0.00,0.00,0.77,0.51,...,0.0,0.000,0.086,0.000,0.260,0.173,0.000,3.298,16,287
1691,0.00,0.17,0.00,0.0,0.00,0.00,0.17,0.17,0.00,0.17,...,0.0,0.108,0.216,0.061,0.046,0.030,0.000,4.259,85,3318


In [None]:
np.percentile(np.var(selector.transform(x_train),axis=0),q=[25,50,75])

array([0.00552808, 0.01183875, 0.024645  ])

In [None]:
clf = Pipeline(
    steps=[("preprocessor", MinMaxScaler((-1,1))), ('selector', VarianceThreshold()), ("clf", RandomForestClassifier())]
)

In [None]:
params={'selector__threshold':[0.0118,0.0055],
        'clf__n_estimators':[80,100,150,200,250],
        'clf__class_weight':['balanced'],
        'clf__max_features': ['log2'],
        'clf__min_samples_split': [3,5,6,8,10]
        }
GS = GridSearchCV(clf,param_grid = params, scoring = 'roc_auc', n_jobs=-1, cv=5)

In [None]:
GS.fit(x_train,y_train)
df_results = pd.DataFrame(GS.cv_results_)
df_results[['params','mean_test_score','std_test_score']].sort_values(by='mean_test_score',ascending=False)

  self._final_estimator.fit(Xt, y, **fit_params_last_step)


Unnamed: 0,params,mean_test_score,std_test_score
27,"{'clf__class_weight': 'balanced', 'clf__max_fe...",0.981846,0.00488
29,"{'clf__class_weight': 'balanced', 'clf__max_fe...",0.981537,0.004692
19,"{'clf__class_weight': 'balanced', 'clf__max_fe...",0.98132,0.004781
15,"{'clf__class_weight': 'balanced', 'clf__max_fe...",0.981279,0.004354
39,"{'clf__class_weight': 'balanced', 'clf__max_fe...",0.98127,0.004501
23,"{'clf__class_weight': 'balanced', 'clf__max_fe...",0.981263,0.00504
45,"{'clf__class_weight': 'balanced', 'clf__max_fe...",0.981246,0.004471
7,"{'clf__class_weight': 'balanced', 'clf__max_fe...",0.981204,0.005038
21,"{'clf__class_weight': 'balanced', 'clf__max_fe...",0.981158,0.004949
17,"{'clf__class_weight': 'balanced', 'clf__max_fe...",0.981148,0.004616
