## Preprocessing

In [69]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords

# Fetch the NLTK resources this notebook relies on: the stopword lists
# (stopwords.words) and the tokenizer data used by nltk.word_tokenize.
nltk.download('stopwords')
nltk.download('punkt')
# NOTE(review): newer NLTK releases appear to need 'punkt_tab' for word_tokenize,
# so this may not actually be optional — confirm against the installed version.
nltk.download('punkt_tab')  # optional, for new versions


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\aly98\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\aly98\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\aly98\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [70]:
# Load the raw labelled messages; the path is relative to the current working directory.
df = pd.read_csv('spam.csv')
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [71]:
# List the raw column names to decide which ones to keep (v1 holds the label, v2 the message text).
df.columns

Index(['v1', 'v2', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], dtype='object')

In [72]:
# Drop the three unnamed columns; only v1 (label) and v2 (message text) are used.
# errors='ignore' keeps this cell re-runnable: without it a second run raises
# KeyError because the columns have already been removed from df.
df.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], inplace=True, errors='ignore')

In [73]:
# Give the two remaining columns descriptive names: v1 -> target (label), v2 -> text (message).
df = df.rename(columns={'v1': 'target', 'v2': 'text'})

In [74]:
# Confirm the cleanup: only the renamed target/text columns should remain.
df.head()

Unnamed: 0,target,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [75]:
# Count rows that exactly repeat an earlier row (same target and same text).
df.duplicated().sum()

403

In [76]:
# (rows, columns) before de-duplication, for comparison with the shape after it.
df.shape

(5572, 2)

In [77]:
# Remove duplicate rows in place (the first occurrence of each is kept by default),
# then show the new (rows, columns) to verify how many rows were dropped.
df.drop_duplicates(inplace=True)
df.shape

(5169, 2)

In [78]:
# Missing-value count per column; both columns are expected to be complete.
df.isnull().sum()

target    0
text      0
dtype: int64

In [79]:
from sklearn.preprocessing import LabelEncoder

# Encode the string labels as integers. LabelEncoder numbers the classes in
# sorted order, so 'ham' becomes 0 and 'spam' becomes 1.
encoder = LabelEncoder()
df['target'] = encoder.fit_transform(df['target'])

df.head()

Unnamed: 0,target,text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [80]:
from nltk.stem.porter import PorterStemmer
import string

# Shared Porter stemmer instance; transform_text calls ps.stem on every kept token.
ps = PorterStemmer()

In [81]:
# Lowercase transformation and text preprocessing function

_STOPWORDS_CACHE = {}


def _english_stopwords():
    """Return NLTK's English stopwords as a frozenset, reading the corpus only once."""
    if 'english' not in _STOPWORDS_CACHE:
        _STOPWORDS_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOPWORDS_CACHE['english']


def transform_text(text):
    """Normalise one message into a space-separated string of stemmed tokens.

    Pipeline: lowercase -> ``nltk.word_tokenize`` -> keep alphanumeric tokens
    only -> drop English stopwords -> Porter-stem each remaining token with
    the shared ``ps`` stemmer.

    Args:
        text: Raw message as a ``str``.

    Returns:
        The surviving stems joined by single spaces; an empty string when no
        token survives the filters.
    """
    # Looked up once per call as a set: the stopword list used to be re-read
    # and scanned linearly for every single token.
    stop_words = _english_stopwords()

    # ``isalnum`` already rejects punctuation-only tokens, so no separate
    # ``string.punctuation`` check is needed after it.
    kept_tokens = [
        token
        for token in nltk.word_tokenize(text.lower())
        if token.isalnum() and token not in stop_words
    ]

    # Reduce every kept token to its stem and join into a single string.
    return " ".join(ps.stem(token) for token in kept_tokens)


In [82]:

# Smoke-test transform_text on one sample message before applying it to the whole column.
transform_text('Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...')


'go jurong point crazi avail bugi n great world la e buffet cine got amor wat'

In [83]:
# Replace every raw message with its cleaned, stemmed form.
# Caveat: the raw text is overwritten, so re-running this cell feeds
# already-stemmed text through the stemmer again.
df['text'] = df['text'].map(transform_text)
df.head()

Unnamed: 0,target,text
0,0,go jurong point crazi avail bugi n great world...
1,0,ok lar joke wif u oni
2,1,free entri 2 wkli comp win fa cup final tkt 21...
3,0,u dun say earli hor u c alreadi say
4,0,nah think goe usf live around though


## Feature Engineering

In [84]:
# Convert the cleaned text into TF-IDF features: unigrams and bigrams, with the
# vocabulary capped at the 500 most frequent terms.
from sklearn.feature_extraction.text import TfidfVectorizer
tfid = TfidfVectorizer(max_features = 500, ngram_range=(1, 2))

# NOTE(review): the vectorizer is fitted on the full dataset before the
# train/test split, so the vocabulary and IDF weights are learned from rows
# that later land in the test set. Consider fitting on the training split only.
# .toarray() turns the sparse TF-IDF matrix into a dense NumPy array.
X = tfid.fit_transform(df['text']).toarray() 
y = df.target.values


## Model Selection

In [85]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=2)

In [86]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier

In [87]:
# Candidate classifiers for the cross-validated comparison.
# Every estimator that accepts a seed and draws random numbers gets random_state=2
# so the comparison is repeatable from run to run.
svc = SVC(kernel="sigmoid", gamma=1.0)
knc = KNeighborsClassifier()
mnb = MultinomialNB()
# Seed added: the tree's split search is randomised, so unseeded runs can differ.
dtc = DecisionTreeClassifier(max_depth=5, random_state=2)
# Seed added: the liblinear solver shuffles the data internally.
lrc = LogisticRegression(solver='liblinear', penalty='l1', random_state=2)
rfc = RandomForestClassifier(n_estimators=50, random_state=2)
abc = AdaBoostClassifier(n_estimators=50, random_state=2)
bc = BaggingClassifier(n_estimators=50, random_state=2)
etc = ExtraTreesClassifier(n_estimators=50, random_state=2)
gbdt = GradientBoostingClassifier(n_estimators=50, random_state=2)
xgb = XGBClassifier(n_estimators=50, random_state=2)

In [88]:
# Display name -> estimator. The keys end up in the 'model' column of the
# report built by metrics_report.
clfs = {
    "SVC": svc,
    "KNN": knc,
    "Naive Baye": mnb,
    "Decision Tree": dtc,
    "Logistic Reg": lrc,
    "Random Forest": rfc,
    "AdaBoost Clf": abc,
    "Bagging Clf": bc,
    "Extra Tree Clf": etc,
    "Gradient Boosting": gbdt,
    "XGBoost": xgb,
}

In [89]:
def train_classifier(clf, X_train, y_train):
    """Fit ``clf`` on the given training data and return the result of ``clf.fit``."""
    return clf.fit(X_train, y_train)

In [90]:
from sklearn.model_selection import cross_val_score,StratifiedKFold

def metrics_report(clfs: dict, X_train: np.ndarray, y_train: np.ndarray) -> pd.DataFrame:
    """Cross-validate every classifier and tabulate its mean scores.

    Each estimator is evaluated with 5-fold stratified cross-validation. All
    four metrics come from a single ``cross_validate`` run, so a model is
    fitted once per fold rather than once per fold for every metric.

    Args:
        clfs: Mapping of display name -> unfitted scikit-learn classifier.
        X_train: Training feature matrix.
        y_train: Training labels. The 'precision', 'recall' and 'f1' scorers
            are the binary variants, so the labels must be binary.

    Returns:
        DataFrame with one row per classifier and the columns ``model``,
        ``accuracy``, ``precision``, ``recall`` and ``f1``; each metric is
        the mean over the five folds.
    """
    # Imported locally because the notebook's import line for this cell only
    # brings in cross_val_score and StratifiedKFold.
    from sklearn.model_selection import cross_validate

    metric_names = ['accuracy', 'precision', 'recall', 'f1']
    stf = StratifiedKFold(n_splits=5)
    results = {'model': [], 'accuracy': [], 'precision': [], 'recall': [], 'f1': []}

    for name, clf in clfs.items():
        # One pass per model; results are keyed as 'test_<metric>'.
        scores = cross_validate(clf, X_train, y_train, cv=stf, scoring=metric_names, n_jobs=-1)
        results['model'].append(name)
        for metric in metric_names:
            results[metric].append(scores[f'test_{metric}'].mean())

    return pd.DataFrame(results)



In [91]:
# Cross-validate all candidate classifiers on the training split only; the test split is not touched here.
df_reports = metrics_report(clfs,X_train, y_train)

In [92]:
# Rank the models by mean CV accuracy, using precision to break ties.
df_reports.sort_values(by=['accuracy','precision'], ascending=False)

Unnamed: 0,model,accuracy,precision,recall,f1
8,Extra Tree Clf,0.972672,0.936402,0.838835,0.883592
5,Random Forest,0.971705,0.950377,0.817476,0.87659
2,Naive Baye,0.970979,0.951499,0.807767,0.873118
0,SVC,0.969287,0.940704,0.803883,0.866556
10,XGBoost,0.962999,0.926897,0.763107,0.835077
4,Logistic Reg,0.960822,0.935054,0.735922,0.823197
7,Bagging Clf,0.957678,0.850402,0.807767,0.82608
9,Gradient Boosting,0.9526,0.952394,0.652427,0.772519
3,Decision Tree,0.935913,0.832402,0.61165,0.700037
6,AdaBoost Clf,0.923821,0.806673,0.51068,0.623685


### Select best model and evaluate on test data

In [93]:
from sklearn.model_selection import train_test_split,cross_val_score, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score,precision_score, recall_score, f1_score



# Re-create the 80/20 split with the same random_state used during model selection.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2)

# Baseline random forest evaluated on the held-out test set.
# random_state is pinned so the printed metrics are reproducible; the forest
# was unseeded before, so every run produced slightly different numbers.
# NOTE(review): the CV table ranks Extra Trees first on accuracy — confirm
# that Random Forest is the intended "best model".
clf = RandomForestClassifier(n_estimators=200, random_state=2)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
print(f' accuracy: {accuracy} \n precision: {precision} \n recall: {recall} \n f1: {f1}')


 accuracy: 0.9700193423597679 
 precision: 0.9349593495934959 
 recall: 0.8333333333333334 
 f1: 0.8812260536398467


## Hyperparameter tuning

In [98]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

# Search space: 2 x 2 x 2 x 2 = 16 random-forest configurations.
param_grid = {
    'n_estimators': [200, 500],
    'criterion': ['gini', 'entropy'],
    'max_features': ['sqrt', 'log2'],
    'bootstrap': [True, False],
}

# Seeded base estimator so every candidate is compared under the same randomness.
rf = RandomForestClassifier(random_state=42)

# 5-fold cross-validation per candidate, ranked by F1, run on all available cores.
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    scoring='f1',
    cv=5,
    n_jobs=-1,
    verbose=2,
)

grid_search.fit(X_train, y_train)
print("Best parameters:", grid_search.best_params_)
print("Best F1 score:", grid_search.best_score_)


Fitting 5 folds for each of 16 candidates, totalling 80 fits
Best parameters: {'bootstrap': False, 'criterion': 'gini', 'max_features': 'log2', 'n_estimators': 500}
Best F1 score: 0.8887960224834386


### Output of the hyperparameter tuning above

Fitting 5 folds for each of 16 candidates, totalling 80 fits

Best parameters: {  
    'bootstrap': False,  
     'criterion': 'gini',  
      'max_features': 'log2',  
       'n_estimators': 500}

Best F1 score: 0.8887960224834386

In [99]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score,precision_score, recall_score, f1_score



# Same 80/20 split as before (same random_state), so the numbers are comparable
# with the baseline random-forest evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2)

# Random forest with the best parameters reported by the grid search.
# random_state matches the seed of the grid-search estimator; the forest was
# unseeded before, so the printed test metrics changed from run to run.
clf = RandomForestClassifier(
    bootstrap=False,
    criterion='gini',
    max_features='log2',
    n_estimators=500,
    random_state=42,
)

clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
print(f' accuracy: {accuracy} \n precision: {precision} \n recall: {recall} \n f1: {f1}')


 accuracy: 0.9758220502901354 
 precision: 0.9747899159663865 
 recall: 0.8405797101449275 
 f1: 0.9027237354085603


In [117]:
# Load a pre-computed TF-IDF training table from disk for inspection
# (presumably written by a separate pipeline step — it is not produced in this notebook).
# NOTE(review): absolute, machine-specific path; this cell fails on any other
# machine. Consider a path relative to a configurable data directory.
# NOTE(review): this rebinds `df`, replacing the cleaned message frame used by the cells above.
df = pd.read_csv(r'C:\Users\aly98\Desktop\Mlops\Practice\data\processed\train_tfidf.csv')
df.head()

Unnamed: 0,target,08000930705,10,12hr,150p,150ppm,16,18,1st,2nd,...,ya,ye,yeah,year,yesterday,yet,yo,yr,yup,ìï
0,0.0,,,,,,,,,,...,,,,,,,,,,
1,0.0,,,,,,,,,,...,,,,,,,,,,
2,0.0,,,,,,,,,,...,,,,,,,,,,
3,0.0,,,,,,,,,,...,,,,,,,,,,
4,0.0,,,,,,,,,,...,,,,,,,,,,


In [119]:
# Missing-value count per column of the loaded TF-IDF table.
# NOTE(review): the recorded output reports 4152 missing values in every
# column, target included — verify how the CSV was written before using it.
df.isnull().sum()

target         4152
08000930705    4152
10             4152
12hr           4152
150p           4152
               ... 
yet            4152
yo             4152
yr             4152
yup            4152
ìï             4152
Length: 600, dtype: int64