In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.decomposition import TruncatedSVD,PCA
from sklearn.model_selection import train_test_split 
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler,StandardScaler
#from sklearn.pipeline import Pipeline
from sklearn.metrics import RocCurveDisplay,roc_auc_score
from sklearn.model_selection import GridSearchCV

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.svm import SVC

from sklearn.feature_extraction.text import TfidfTransformer

from sklearn.feature_selection import chi2
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import VarianceThreshold

In [None]:
# Base directory that holds the coursework CSV files (a /content/drive path,
# i.e. presumably a Colab Google Drive mount).
path = "/content/drive/MyDrive/Materias/MLEA_M/Coursework/"

# Read the training frame and the test frame from that directory, in this order.
df_train, df_test = (
    pd.read_csv(f"{path}{csv_name}")
    for csv_name in ("spam_dataset.csv", "spam_dataset_test.csv")
)

In [None]:
# Test feature matrix: everything in the test frame except the identifier column.
x_test = df_test.drop(columns="email_id")

In [None]:
# Drop the identifier column, then separate the label from the features.
df_train2 = df_train.drop(columns='email_id')
# Double brackets keep the target as a one-column DataFrame rather than a Series.
y_train = df_train2[['spam']]
x_train = df_train2.drop(columns='spam')

In [None]:
# Feature names (every training column except the id and the label), plus two
# complementary subsets split on whether the name contains 'word_freq'.
columns_all = df_train.drop(columns=['email_id', 'spam']).columns.tolist()
columns_freq = [name for name in columns_all if 'word_freq' in name]
columns_no_freq = [name for name in columns_all if 'word_freq' not in name]

In [None]:
# Sanity check: shape of the training target and of the test feature matrix.
(y_train.shape, x_test.shape)

((3220, 1), (1381, 57))

In [None]:
from imblearn.pipeline import Pipeline 
from imblearn.over_sampling import SMOTE,ADASYN
from collections import Counter

In [None]:
from imblearn.under_sampling import ClusterCentroids

In [None]:
# Oversample the full training set once with ADASYN (fixed seed). In the cells
# visible here the result is only used to compare class counts before/after;
# the models further down resample inside their own pipelines.
ada = ADASYN(random_state=42)
X_res, y_res = ada.fit_resample(X=x_train, y=y_train)

In [None]:
# Class counts in the original training target (before any resampling).
y_train.value_counts()

spam
0       1941
1       1279
dtype: int64

In [None]:
# Class counts after the ADASYN resampling above, for comparison with y_train.
y_res.value_counts()

spam
0       1941
1       1928
dtype: int64

# Full GBM


## SMOTE

In [None]:
# Full-feature model with SMOTE: standardise every feature, oversample with
# SMOTE (imblearn's Pipeline applies the sampler only while fitting), then fit
# a gradient-boosting classifier.
clf = Pipeline(
    steps=[
        ("preprocessor", StandardScaler()),
        ("sampler", SMOTE(random_state=42)),
        # The grid uses max_features='log2', so each split considers a random
        # feature subset. Seed the classifier (like the sampler) so the CV
        # scores are repeatable from run to run.
        ("clf", GradientBoostingClassifier(random_state=42)),
    ]
)

# Hyper-parameter grid; keys address the "clf" step of the pipeline.
params = {
    'clf__n_estimators': [80, 100, 150, 200, 250],
    'clf__learning_rate': [0.01, 0.1],
    'clf__max_features': ['log2'],
    'clf__min_samples_split': [5, 6, 8, 10],
}

# Two metrics are scored and refit=False, so no final estimator is refitted by
# the search; the chosen configuration is fitted explicitly in a later cell.
GS = GridSearchCV(
    clf,
    param_grid=params,
    scoring=['roc_auc', 'balanced_accuracy'],
    n_jobs=-1,
    cv=5,
    refit=False,
)

In [None]:
# Run the cross-validated search, then show the ten parameter sets with the
# highest mean ROC AUC together with their balanced-accuracy statistics.
GS.fit(x_train, y_train)
df_results = pd.DataFrame(GS.cv_results_)
(
    df_results[
        [
            'params',
            'mean_test_roc_auc',
            'std_test_roc_auc',
            'mean_test_balanced_accuracy',
            'std_test_balanced_accuracy',
        ]
    ]
    .sort_values(by='mean_test_roc_auc', ascending=False)
    .head(10)
)

Unnamed: 0,params,mean_test_roc_auc,std_test_roc_auc,mean_test_balanced_accuracy,std_test_balanced_accuracy
34,"{'clf__learning_rate': 0.1, 'clf__max_features...",0.986675,0.003057,0.949303,0.009034
33,"{'clf__learning_rate': 0.1, 'clf__max_features...",0.986621,0.002969,0.945126,0.009063
39,"{'clf__learning_rate': 0.1, 'clf__max_features...",0.986335,0.002564,0.948524,0.010595
28,"{'clf__learning_rate': 0.1, 'clf__max_features...",0.986071,0.003489,0.946682,0.00905
23,"{'clf__learning_rate': 0.1, 'clf__max_features...",0.985879,0.002646,0.946168,0.007249
24,"{'clf__learning_rate': 0.1, 'clf__max_features...",0.985477,0.001974,0.948134,0.009013
29,"{'clf__learning_rate': 0.1, 'clf__max_features...",0.985475,0.001858,0.946311,0.011687
38,"{'clf__learning_rate': 0.1, 'clf__max_features...",0.985472,0.002872,0.94681,0.005622
22,"{'clf__learning_rate': 0.1, 'clf__max_features...",0.985345,0.002974,0.94409,0.007937
37,"{'clf__learning_rate': 0.1, 'clf__max_features...",0.985113,0.003096,0.946804,0.009814


## ADASYN

In [None]:
# Full-feature model with ADASYN: standardise every feature, oversample with
# ADASYN (imblearn's Pipeline applies the sampler only while fitting), then fit
# a gradient-boosting classifier.
clf = Pipeline(
    steps=[
        ("preprocessor", StandardScaler()),
        ("sampler", ADASYN(random_state=42)),
        # The grid uses max_features='log2', so each split considers a random
        # feature subset. Seed the classifier (like the sampler) so the CV
        # scores and the later submission are repeatable.
        ("clf", GradientBoostingClassifier(random_state=42)),
    ]
)

# Hyper-parameter grid; keys address the "clf" step of the pipeline.
params = {
    'clf__n_estimators': [80, 100, 150, 200, 250],
    'clf__learning_rate': [0.01, 0.1],
    'clf__max_features': ['log2'],
    'clf__min_samples_split': [5, 6, 8, 10],
}

# Two metrics are scored and refit=False, so no final estimator is refitted by
# the search; the chosen configuration is fitted explicitly in a later cell.
GS = GridSearchCV(
    clf,
    param_grid=params,
    scoring=['roc_auc', 'balanced_accuracy'],
    n_jobs=-1,
    cv=5,
    refit=False,
)

In [None]:
# Run the cross-validated search, then show the ten parameter sets with the
# highest mean ROC AUC together with their balanced-accuracy statistics.
GS.fit(x_train, y_train)
df_results = pd.DataFrame(GS.cv_results_)
(
    df_results[
        [
            'params',
            'mean_test_roc_auc',
            'std_test_roc_auc',
            'mean_test_balanced_accuracy',
            'std_test_balanced_accuracy',
        ]
    ]
    .sort_values(by='mean_test_roc_auc', ascending=False)
    .head(10)
)

Unnamed: 0,params,mean_test_roc_auc,std_test_roc_auc,mean_test_balanced_accuracy,std_test_balanced_accuracy
39,"{'clf__learning_rate': 0.1, 'clf__max_features...",0.986539,0.002102,0.948444,0.006919
38,"{'clf__learning_rate': 0.1, 'clf__max_features...",0.986429,0.002351,0.947405,0.00885
24,"{'clf__learning_rate': 0.1, 'clf__max_features...",0.986216,0.002077,0.952457,0.005523
29,"{'clf__learning_rate': 0.1, 'clf__max_features...",0.986215,0.001692,0.947546,0.006446
33,"{'clf__learning_rate': 0.1, 'clf__max_features...",0.986207,0.002102,0.947787,0.00482
28,"{'clf__learning_rate': 0.1, 'clf__max_features...",0.98586,0.002132,0.945989,0.006054
27,"{'clf__learning_rate': 0.1, 'clf__max_features...",0.985824,0.002492,0.945833,0.011023
22,"{'clf__learning_rate': 0.1, 'clf__max_features...",0.985783,0.002625,0.944155,0.007529
23,"{'clf__learning_rate': 0.1, 'clf__max_features...",0.985647,0.002367,0.948827,0.005621
32,"{'clf__learning_rate': 0.1, 'clf__max_features...",0.985638,0.002784,0.947645,0.007459


In [None]:
# Configure the pipeline currently bound to `clf` with the selected
# hyper-parameters and refit it on the whole training set.
clf.set_params(**{'clf__learning_rate': 0.1, 'clf__max_features': 'log2', 'clf__min_samples_split': 10, 'clf__n_estimators': 250})
# y_train is a one-column DataFrame; flatten it to shape (n_samples,) so the
# classifier does not emit sklearn's DataConversionWarning.
clf.fit(x_train, y_train.values.ravel())

# Score the test set. Column 1 of predict_proba is the probability of the
# second entry in clf.classes_ (label 1 for the 0/1 target shown above).
model_results = pd.DataFrame({"email_id": df_test["email_id"], "predicted": clf.predict_proba(x_test)[:, 1]})
# NOTE: the file name is misleading; the author flagged that it was saved
# wrongly and should say "gbm" instead of "rf". The name is left unchanged here.
model_results.to_csv("rf_standard_res_full2.csv", index=False)

  y = column_or_1d(y, warn=True)


# CHI GBM

## SMOTE

In [None]:
# Chi-squared feature selection + SMOTE + gradient boosting.
# The selector is the first step and therefore sees the raw feature values;
# chi2 requires non-negative inputs, which standardised data would not satisfy.
clf = Pipeline(
    steps=[
        ("selector", SelectKBest(chi2, k=40)),  # k is overridden by the grid
        ("preprocessor", StandardScaler()),
        ("sampler", SMOTE(random_state=42)),
        # The grid uses max_features='log2', so each split considers a random
        # feature subset. Seed the classifier (like the sampler) so the CV
        # scores are repeatable from run to run.
        ("clf", GradientBoostingClassifier(random_state=42)),
    ]
)

# Hyper-parameter grid: number of selected features plus classifier settings.
params = {
    'selector__k': [35, 40, 45, 50],
    'clf__n_estimators': [80, 100, 150, 200, 250],
    'clf__learning_rate': [0.01, 0.1],
    'clf__max_features': ['log2'],
    'clf__min_samples_split': [5, 6, 8, 10],
}

# Two metrics are scored and refit=False, so the search only produces the
# cv_results_ table; no final estimator is refitted.
GS = GridSearchCV(
    clf,
    param_grid=params,
    scoring=['roc_auc', 'balanced_accuracy'],
    n_jobs=-1,
    cv=5,
    refit=False,
)

In [None]:
# Run the cross-validated search, then show the ten parameter sets with the
# highest mean ROC AUC together with their balanced-accuracy statistics.
GS.fit(x_train, y_train)
df_results = pd.DataFrame(GS.cv_results_)
(
    df_results[
        [
            'params',
            'mean_test_roc_auc',
            'std_test_roc_auc',
            'mean_test_balanced_accuracy',
            'std_test_balanced_accuracy',
        ]
    ]
    .sort_values(by='mean_test_roc_auc', ascending=False)
    .head(10)
)

Unnamed: 0,params,mean_test_roc_auc,std_test_roc_auc,mean_test_balanced_accuracy,std_test_balanced_accuracy
159,"{'clf__learning_rate': 0.1, 'clf__max_features...",0.986269,0.002696,0.948404,0.007374
158,"{'clf__learning_rate': 0.1, 'clf__max_features...",0.986182,0.002511,0.944882,0.009939
139,"{'clf__learning_rate': 0.1, 'clf__max_features...",0.986122,0.003141,0.947234,0.009314
118,"{'clf__learning_rate': 0.1, 'clf__max_features...",0.986122,0.002136,0.949693,0.007339
98,"{'clf__learning_rate': 0.1, 'clf__max_features...",0.986108,0.003307,0.948398,0.006822
135,"{'clf__learning_rate': 0.1, 'clf__max_features...",0.986075,0.003277,0.947604,0.008246
99,"{'clf__learning_rate': 0.1, 'clf__max_features...",0.986052,0.003083,0.94929,0.008266
119,"{'clf__learning_rate': 0.1, 'clf__max_features...",0.986002,0.002847,0.947475,0.008064
95,"{'clf__learning_rate': 0.1, 'clf__max_features...",0.985976,0.003053,0.947214,0.006975
138,"{'clf__learning_rate': 0.1, 'clf__max_features...",0.985835,0.00314,0.947863,0.006738


In [None]:
# {'clf__learning_rate': 0.1, 'clf__max_features': 'log2', 'clf__min_samples_split': 10, 'clf__n_estimators': 250, 'selector__k': 50}
# The selector removes very few features; it keeps 50.

## ADASYN

In [None]:
# Chi-squared feature selection + ADASYN + gradient boosting.
# The selector is the first step and therefore sees the raw feature values;
# chi2 requires non-negative inputs, which standardised data would not satisfy.
clf = Pipeline(
    steps=[
        ("selector", SelectKBest(chi2, k=40)),  # k is overridden by the grid
        ("preprocessor", StandardScaler()),
        ("sampler", ADASYN(random_state=42)),
        # The grid uses max_features='log2', so each split considers a random
        # feature subset. Seed the classifier (like the sampler) so the CV
        # scores are repeatable from run to run.
        ("clf", GradientBoostingClassifier(random_state=42)),
    ]
)

# Hyper-parameter grid: number of selected features plus classifier settings.
params = {
    'selector__k': [35, 40, 45, 50],
    'clf__n_estimators': [80, 100, 150, 200, 250],
    'clf__learning_rate': [0.01, 0.1],
    'clf__max_features': ['log2'],
    'clf__min_samples_split': [5, 6, 8, 10],
}

# Two metrics are scored and refit=False, so the search only produces the
# cv_results_ table; no final estimator is refitted.
GS = GridSearchCV(
    clf,
    param_grid=params,
    scoring=['roc_auc', 'balanced_accuracy'],
    n_jobs=-1,
    cv=5,
    refit=False,
)

In [None]:
# Run the cross-validated search, then show the ten parameter sets with the
# highest mean ROC AUC together with their balanced-accuracy statistics.
GS.fit(x_train, y_train)
df_results = pd.DataFrame(GS.cv_results_)
(
    df_results[
        [
            'params',
            'mean_test_roc_auc',
            'std_test_roc_auc',
            'mean_test_balanced_accuracy',
            'std_test_balanced_accuracy',
        ]
    ]
    .sort_values(by='mean_test_roc_auc', ascending=False)
    .head(10)
)

Unnamed: 0,params,mean_test_roc_auc,std_test_roc_auc,mean_test_balanced_accuracy,std_test_balanced_accuracy
118,"{'clf__learning_rate': 0.1, 'clf__max_features...",0.98627,0.002745,0.949378,0.010261
99,"{'clf__learning_rate': 0.1, 'clf__max_features...",0.986105,0.002117,0.947298,0.00809
95,"{'clf__learning_rate': 0.1, 'clf__max_features...",0.985922,0.002759,0.949109,0.008616
97,"{'clf__learning_rate': 0.1, 'clf__max_features...",0.985886,0.002483,0.946527,0.0047
115,"{'clf__learning_rate': 0.1, 'clf__max_features...",0.985864,0.002782,0.948967,0.009902
155,"{'clf__learning_rate': 0.1, 'clf__max_features...",0.985863,0.003096,0.947286,0.008183
117,"{'clf__learning_rate': 0.1, 'clf__max_features...",0.985749,0.001992,0.943016,0.007276
158,"{'clf__learning_rate': 0.1, 'clf__max_features...",0.985683,0.002129,0.94678,0.009412
139,"{'clf__learning_rate': 0.1, 'clf__max_features...",0.985608,0.002057,0.948471,0.008121
159,"{'clf__learning_rate': 0.1, 'clf__max_features...",0.985605,0.002014,0.945334,0.006447


In [None]:
# {'clf__learning_rate': 0.1, 'clf__max_features': 'log2', 'clf__min_samples_split': 10, 'clf__n_estimators': 250, 'selector__k': 50}
# The selector removes very few features; it keeps 45 or 50.

# PCA GBM

## SMOTE

In [None]:
# Applying PCA piecewise (per column group) seems to work better.
# Feature blocks built side by side from the raw input columns:
#   scaler - all features, standardised
#   pca    - truncated SVD of the word_freq columns (n_components set by the grid)
#   svd2   - 3 principal components of the first six non-word_freq columns
#   svd3   - 2 principal components of the remaining non-word_freq columns
ct = ColumnTransformer([
    ("scaler", StandardScaler(), columns_all),
    ("pca", TruncatedSVD(n_iter=7, random_state=42), columns_freq),
    ("svd2", PCA(n_components=3, random_state=42), columns_no_freq[0:6]),
    ("svd3", PCA(n_components=2, random_state=42), columns_no_freq[6:]),
])

clf = Pipeline(
    steps=[
        ("pre", ct),
        ("sampler", SMOTE(random_state=42)),
        # The grid uses max_features='log2', so each split considers a random
        # feature subset. Seed the classifier (like the sampler and the
        # decompositions) so CV scores and the submission are repeatable.
        ("clf", GradientBoostingClassifier(random_state=42)),
    ]
)

# Hyper-parameter grid: SVD size for the word_freq block plus classifier settings.
params = {
    'pre__pca__n_components': [25, 30, 40],
    'clf__n_estimators': [80, 100, 150, 200, 250],
    'clf__learning_rate': [0.01, 0.1],
    'clf__max_features': ['log2'],
    'clf__min_samples_split': [5, 6, 8, 10],
}

# Two metrics are scored and refit=False, so no final estimator is refitted by
# the search; the chosen configuration is fitted explicitly in a later cell.
GS = GridSearchCV(
    clf,
    param_grid=params,
    scoring=['roc_auc', 'balanced_accuracy'],
    n_jobs=-1,
    cv=5,
    refit=False,
)

In [None]:
# Run the cross-validated search, then show the ten parameter sets with the
# highest mean ROC AUC together with their balanced-accuracy statistics.
GS.fit(x_train, y_train)
df_results = pd.DataFrame(GS.cv_results_)
(
    df_results[
        [
            'params',
            'mean_test_roc_auc',
            'std_test_roc_auc',
            'mean_test_balanced_accuracy',
            'std_test_balanced_accuracy',
        ]
    ]
    .sort_values(by='mean_test_roc_auc', ascending=False)
    .head(10)
)

Unnamed: 0,params,mean_test_roc_auc,std_test_roc_auc,mean_test_balanced_accuracy,std_test_balanced_accuracy
118,"{'clf__learning_rate': 0.1, 'clf__max_features...",0.987587,0.002991,0.947598,0.007992
88,"{'clf__learning_rate': 0.1, 'clf__max_features...",0.987169,0.002905,0.947991,0.007736
117,"{'clf__learning_rate': 0.1, 'clf__max_features...",0.987059,0.002818,0.950052,0.008867
89,"{'clf__learning_rate': 0.1, 'clf__max_features...",0.987018,0.00303,0.952516,0.005078
104,"{'clf__learning_rate': 0.1, 'clf__max_features...",0.986963,0.002533,0.947085,0.007062
72,"{'clf__learning_rate': 0.1, 'clf__max_features...",0.986935,0.002599,0.94824,0.007387
99,"{'clf__learning_rate': 0.1, 'clf__max_features...",0.986932,0.002153,0.946674,0.008915
115,"{'clf__learning_rate': 0.1, 'clf__max_features...",0.986884,0.002875,0.948249,0.009571
119,"{'clf__learning_rate': 0.1, 'clf__max_features...",0.986849,0.002581,0.946673,0.008377
114,"{'clf__learning_rate': 0.1, 'clf__max_features...",0.986832,0.002717,0.948365,0.009115


In [None]:
# Configure the pipeline currently bound to `clf` with the selected
# hyper-parameters and refit it on the whole training set.
clf.set_params(**{'clf__learning_rate': 0.1, 'clf__max_features': 'log2', 'clf__min_samples_split': 10, 'clf__n_estimators': 250, 'pre__pca__n_components': 30})
# y_train is a one-column DataFrame; flatten it to shape (n_samples,) so the
# classifier does not emit sklearn's DataConversionWarning.
clf.fit(x_train, y_train.values.ravel())

# Score the test set. Column 1 of predict_proba is the probability of the
# second entry in clf.classes_ (label 1 for the 0/1 target shown above).
model_results = pd.DataFrame({"email_id": df_test["email_id"], "predicted": clf.predict_proba(x_test)[:, 1]})
model_results.to_csv("gbm_standard_pca_res.csv", index=False)

  y = column_or_1d(y, warn=True)


## ADASYN

In [None]:
# Feature blocks built side by side from the raw input columns:
#   scaler - all features, standardised
#   pca    - truncated SVD of the word_freq columns (n_components set by the grid)
#   svd2   - 3 principal components of the first six non-word_freq columns
#   svd3   - 2 principal components of the remaining non-word_freq columns
ct = ColumnTransformer([
    ("scaler", StandardScaler(), columns_all),
    ("pca", TruncatedSVD(n_iter=7, random_state=42), columns_freq),
    ("svd2", PCA(n_components=3, random_state=42), columns_no_freq[0:6]),
    ("svd3", PCA(n_components=2, random_state=42), columns_no_freq[6:]),
])

clf = Pipeline(
    steps=[
        ("pre", ct),
        ("sampler", ADASYN(random_state=42)),
        # The grid uses max_features='log2', so each split considers a random
        # feature subset. Seed the classifier (like the sampler and the
        # decompositions) so CV scores and the submission are repeatable.
        ("clf", GradientBoostingClassifier(random_state=42)),
    ]
)

# Narrower grid than the SMOTE variant: fixed learning rate, larger ensembles.
params = {
    'pre__pca__n_components': [28, 30, 35, 38],
    'clf__n_estimators': [320, 350, 380, 400],
    'clf__learning_rate': [0.1],
    'clf__max_features': ['log2'],
    'clf__min_samples_split': [5, 8, 10],
}

# Two metrics are scored and refit=False, so no final estimator is refitted by
# the search; the chosen configuration is fitted explicitly in a later cell.
GS = GridSearchCV(
    clf,
    param_grid=params,
    scoring=['roc_auc', 'balanced_accuracy'],
    n_jobs=-1,
    cv=5,
    refit=False,
)

In [None]:
# Run the cross-validated search, then show the ten parameter sets with the
# highest mean ROC AUC together with their balanced-accuracy statistics.
GS.fit(x_train, y_train)
df_results = pd.DataFrame(GS.cv_results_)
(
    df_results[
        [
            'params',
            'mean_test_roc_auc',
            'std_test_roc_auc',
            'mean_test_balanced_accuracy',
            'std_test_balanced_accuracy',
        ]
    ]
    .sort_values(by='mean_test_roc_auc', ascending=False)
    .head(10)
)

Unnamed: 0,params,mean_test_roc_auc,std_test_roc_auc,mean_test_balanced_accuracy,std_test_balanced_accuracy
29,"{'clf__learning_rate': 0.1, 'clf__max_features...",0.98765,0.002511,0.948399,0.009209
6,"{'clf__learning_rate': 0.1, 'clf__max_features...",0.987562,0.003205,0.948399,0.009977
9,"{'clf__learning_rate': 0.1, 'clf__max_features...",0.987559,0.001814,0.949438,0.00741
0,"{'clf__learning_rate': 0.1, 'clf__max_features...",0.987422,0.002133,0.947104,0.00843
5,"{'clf__learning_rate': 0.1, 'clf__max_features...",0.987233,0.002418,0.948274,0.004051
4,"{'clf__learning_rate': 0.1, 'clf__max_features...",0.987217,0.002331,0.949308,0.006612
38,"{'clf__learning_rate': 0.1, 'clf__max_features...",0.987195,0.002514,0.948275,0.004588
8,"{'clf__learning_rate': 0.1, 'clf__max_features...",0.987192,0.001981,0.950335,0.007007
14,"{'clf__learning_rate': 0.1, 'clf__max_features...",0.987158,0.002055,0.949826,0.004366
46,"{'clf__learning_rate': 0.1, 'clf__max_features...",0.987129,0.002862,0.94762,0.00753


In [None]:
# Earlier parameter set (disabled):
#clf.set_params(**{'clf__learning_rate': 0.1, 'clf__max_features': 'log2', 'clf__min_samples_split': 10, 'clf__n_estimators': 250, 'pre__pca__n_components': 25})
# NOTE(review): going by ParameterGrid's ordering this looks like grid row 9
# (third by mean ROC AUC in the table above), not the top-ranked row 29.
# Presumably a deliberate choice; confirm.
clf.set_params(**{'clf__learning_rate': 0.1, 'clf__max_features': 'log2', 'clf__min_samples_split': 5, 'clf__n_estimators': 380, 'pre__pca__n_components': 30})

# y_train is a one-column DataFrame; flatten it to shape (n_samples,) so the
# classifier does not emit sklearn's DataConversionWarning.
clf.fit(x_train, y_train.values.ravel())

# Score the test set. Column 1 of predict_proba is the probability of the
# second entry in clf.classes_ (label 1 for the 0/1 target shown above).
model_results = pd.DataFrame({"email_id": df_test["email_id"], "predicted": clf.predict_proba(x_test)[:, 1]})
model_results.to_csv("gbm_standard_pca_res6.csv", index=False)

  y = column_or_1d(y, warn=True)


# TFIDF PCA GBM

## SMOTE

In [None]:
# Feature blocks built side by side from the raw input columns:
#   scaler - non-word_freq columns, standardised
#   tfidf  - TF-IDF re-weighting of the word_freq columns
#   pca    - truncated SVD of the word_freq columns (n_components set by the grid)
# Note: 'tfidf' and 'pca' both read the raw word_freq columns in parallel; the
# SVD is not applied on top of the TF-IDF output.
ct = ColumnTransformer([
    ("scaler", StandardScaler(), columns_no_freq),
    ("tfidf", TfidfTransformer(), columns_freq),
    ("pca", TruncatedSVD(n_iter=7, random_state=42), columns_freq),
])

clf = Pipeline(
    steps=[
        ("pre", ct),
        ("sampler", SMOTE(random_state=42)),
        # The grid uses max_features='log2', so each split considers a random
        # feature subset. Seed the classifier (like the sampler and the SVD)
        # so CV scores and the submission are repeatable.
        ("clf", GradientBoostingClassifier(random_state=42)),
    ]
)

# Hyper-parameter grid: SVD size for the word_freq block plus classifier settings.
params = {
    'pre__pca__n_components': [25, 30, 40],
    'clf__n_estimators': [80, 100, 150, 200, 250],
    'clf__learning_rate': [0.01, 0.1],
    'clf__max_features': ['log2'],
    'clf__min_samples_split': [5, 6, 8, 10],
}

# Two metrics are scored and refit=False, so no final estimator is refitted by
# the search; the chosen configuration is fitted explicitly in a later cell.
GS = GridSearchCV(
    clf,
    param_grid=params,
    scoring=['roc_auc', 'balanced_accuracy'],
    n_jobs=-1,
    cv=5,
    refit=False,
)

In [None]:
# Run the cross-validated search, then show the ten parameter sets with the
# highest mean ROC AUC together with their balanced-accuracy statistics.
GS.fit(x_train, y_train)
df_results = pd.DataFrame(GS.cv_results_)
(
    df_results[
        [
            'params',
            'mean_test_roc_auc',
            'std_test_roc_auc',
            'mean_test_balanced_accuracy',
            'std_test_balanced_accuracy',
        ]
    ]
    .sort_values(by='mean_test_roc_auc', ascending=False)
    .head(10)
)

Unnamed: 0,params,mean_test_roc_auc,std_test_roc_auc,mean_test_balanced_accuracy,std_test_balanced_accuracy
73,"{'clf__learning_rate': 0.1, 'clf__max_features...",0.986635,0.002761,0.952148,0.008612
119,"{'clf__learning_rate': 0.1, 'clf__max_features...",0.986581,0.003246,0.947218,0.008552
99,"{'clf__learning_rate': 0.1, 'clf__max_features...",0.986579,0.002841,0.952137,0.007733
101,"{'clf__learning_rate': 0.1, 'clf__max_features...",0.986501,0.003668,0.949155,0.011243
74,"{'clf__learning_rate': 0.1, 'clf__max_features...",0.986429,0.003343,0.948904,0.010669
113,"{'clf__learning_rate': 0.1, 'clf__max_features...",0.986397,0.003522,0.948383,0.013563
100,"{'clf__learning_rate': 0.1, 'clf__max_features...",0.986397,0.002778,0.950468,0.010828
84,"{'clf__learning_rate': 0.1, 'clf__max_features...",0.986355,0.002595,0.949804,0.008742
88,"{'clf__learning_rate': 0.1, 'clf__max_features...",0.986292,0.003045,0.946956,0.009608
87,"{'clf__learning_rate': 0.1, 'clf__max_features...",0.986169,0.002817,0.951507,0.00816


In [None]:
# Configure the pipeline currently bound to `clf` with the selected
# hyper-parameters and refit it on the whole training set.
clf.set_params(**{'clf__learning_rate': 0.1, 'clf__max_features': 'log2', 'clf__min_samples_split': 5, 'clf__n_estimators': 250, 'pre__pca__n_components': 30})
# y_train is a one-column DataFrame; flatten it to shape (n_samples,) so the
# classifier does not emit sklearn's DataConversionWarning.
clf.fit(x_train, y_train.values.ravel())

# Score the test set. Column 1 of predict_proba is the probability of the
# second entry in clf.classes_ (label 1 for the 0/1 target shown above).
model_results = pd.DataFrame({"email_id": df_test["email_id"], "predicted": clf.predict_proba(x_test)[:, 1]})
model_results.to_csv("gbm_standard_pca_tfidf_res.csv", index=False)

  y = column_or_1d(y, warn=True)


## ADASYN

In [None]:
# Feature blocks built side by side from the raw input columns:
#   scaler - non-word_freq columns, standardised
#   tfidf  - TF-IDF re-weighting of the word_freq columns
#   pca    - truncated SVD of the word_freq columns (n_components set by the grid)
# Note: 'tfidf' and 'pca' both read the raw word_freq columns in parallel; the
# SVD is not applied on top of the TF-IDF output.
ct = ColumnTransformer([
    ("scaler", StandardScaler(), columns_no_freq),
    ("tfidf", TfidfTransformer(), columns_freq),
    ("pca", TruncatedSVD(n_iter=7, random_state=42), columns_freq),
])

clf = Pipeline(
    steps=[
        ("pre", ct),
        ("sampler", ADASYN(random_state=42)),
        # The grid uses max_features='log2', so each split considers a random
        # feature subset. Seed the classifier (like the sampler and the SVD)
        # so CV scores and the submission are repeatable.
        ("clf", GradientBoostingClassifier(random_state=42)),
    ]
)

# Hyper-parameter grid: SVD size for the word_freq block plus classifier settings.
params = {
    'pre__pca__n_components': [25, 30, 40],
    'clf__n_estimators': [80, 100, 150, 200, 250],
    'clf__learning_rate': [0.01, 0.1],
    'clf__max_features': ['log2'],
    'clf__min_samples_split': [5, 6, 8, 10],
}

# Two metrics are scored and refit=False, so no final estimator is refitted by
# the search; the chosen configuration is fitted explicitly in a later cell.
GS = GridSearchCV(
    clf,
    param_grid=params,
    scoring=['roc_auc', 'balanced_accuracy'],
    n_jobs=-1,
    cv=5,
    refit=False,
)

In [None]:
# Run the cross-validated search, then show the ten parameter sets with the
# highest mean ROC AUC together with their balanced-accuracy statistics.
GS.fit(x_train, y_train)
df_results = pd.DataFrame(GS.cv_results_)
(
    df_results[
        [
            'params',
            'mean_test_roc_auc',
            'std_test_roc_auc',
            'mean_test_balanced_accuracy',
            'std_test_balanced_accuracy',
        ]
    ]
    .sort_values(by='mean_test_roc_auc', ascending=False)
    .head(10)
)

Unnamed: 0,params,mean_test_roc_auc,std_test_roc_auc,mean_test_balanced_accuracy,std_test_balanced_accuracy
118,"{'clf__learning_rate': 0.1, 'clf__max_features...",0.986708,0.002861,0.949478,0.004581
99,"{'clf__learning_rate': 0.1, 'clf__max_features...",0.986601,0.002464,0.948426,0.008626
119,"{'clf__learning_rate': 0.1, 'clf__max_features...",0.986448,0.00254,0.948416,0.005787
83,"{'clf__learning_rate': 0.1, 'clf__max_features...",0.986359,0.002406,0.947006,0.00807
115,"{'clf__learning_rate': 0.1, 'clf__max_features...",0.986319,0.002707,0.949207,0.004491
72,"{'clf__learning_rate': 0.1, 'clf__max_features...",0.98624,0.002986,0.950124,0.008717
102,"{'clf__learning_rate': 0.1, 'clf__max_features...",0.986191,0.0022,0.949085,0.00762
73,"{'clf__learning_rate': 0.1, 'clf__max_features...",0.986079,0.002817,0.949348,0.007068
85,"{'clf__learning_rate': 0.1, 'clf__max_features...",0.986051,0.00222,0.95001,0.006089
98,"{'clf__learning_rate': 0.1, 'clf__max_features...",0.985947,0.002776,0.94299,0.008731


In [None]:
# Configure the pipeline currently bound to `clf` with the selected
# hyper-parameters and refit it on the whole training set.
clf.set_params(**{'clf__learning_rate': 0.1, 'clf__max_features': 'log2', 'clf__min_samples_split': 10, 'clf__n_estimators': 250, 'pre__pca__n_components': 30})
# y_train is a one-column DataFrame; flatten it to shape (n_samples,) so the
# classifier does not emit sklearn's DataConversionWarning.
clf.fit(x_train, y_train.values.ravel())

# Score the test set. Column 1 of predict_proba is the probability of the
# second entry in clf.classes_ (label 1 for the 0/1 target shown above).
model_results = pd.DataFrame({"email_id": df_test["email_id"], "predicted": clf.predict_proba(x_test)[:, 1]})
model_results.to_csv("gbm_standard_pca_tfidf_res2.csv", index=False)

  y = column_or_1d(y, warn=True)
