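# Notebook for Pipeline Experiments

Spam-vs-ham text classification: clean `spam.csv`, build TF-IDF features from the stemmed text, and compare several classifiers by accuracy and precision.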

In [None]:
import numpy as np        # For numerical operations
import pandas as pd       # For data manipulation and analysis
import matplotlib.pyplot as plt  # For data visualization
%matplotlib inline

# Importing WordCloud for text visualization
from wordcloud import WordCloud

# Importing NLTK for natural language processing
import nltk
from nltk.corpus import stopwords    # For stopwords


# Download the NLTK data packages this notebook relies on
nltk.download('stopwords')   # stop-word lists, read later through nltk.corpus.stopwords
nltk.download('punkt')       # 'punkt' tokenizer package

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/adityamishra/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/adityamishra/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [7]:
import string
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder  # For encoding categorical labels
# Importing the Porter Stemmer for text stemming
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, precision_score

### 1. Data Cleaning

In [3]:
# Load the raw dataset from the working directory
csv_path = 'spam.csv'
df = pd.read_csv(csv_path)
df.head()   # preview the first rows

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [4]:
# Remove the three trailing 'Unnamed' columns.
# The result is assigned back (no inplace=True) and errors='ignore' is passed so
# that re-running this cell does not raise KeyError once the columns are gone.
df = df.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], errors='ignore')
df.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
# Give the label and message columns descriptive names
column_names = {'v1': 'target', 'v2': 'text'}
df.rename(columns=column_names, inplace=True)
df.head()

Unnamed: 0,target,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


### 2. Data Preprocessing

In [8]:
# Encode the string labels as integers: 'ham' becomes 0 and 'spam' becomes 1
encoder = LabelEncoder()
encoder.fit(df['target'])
df['target'] = encoder.transform(df['target'])
df.head()   # preview the frame after encoding

Unnamed: 0,target,text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [9]:
# Count rows that repeat an earlier row (the first occurrence is not counted)
df.duplicated(keep='first').sum()

np.int64(403)

In [10]:
# Number of rows before duplicates are removed
df.shape[0]

5572

In [12]:
# Remove repeated rows, retaining the first occurrence of each
df = df.drop_duplicates(keep='first')
# Number of rows left after de-duplication
df.shape[0]

5169

### 3. Feature Engineering

In [13]:
# Shared Porter stemmer instance; transform_text uses it to reduce each token to its stem
ps = PorterStemmer()

In [14]:
_STOP_WORDS = None  # lazily filled by _english_stop_words()


def _english_stop_words():
    """Return the English stop words as a frozenset, loading the corpus only once."""
    global _STOP_WORDS
    if _STOP_WORDS is None:
        _STOP_WORDS = frozenset(stopwords.words('english'))
    return _STOP_WORDS


def transform_text(text):
    """Normalize a message into a space-separated string of stemmed tokens.

    Steps, in order:
      1. lowercase the text,
      2. tokenize it with ``nltk.word_tokenize``,
      3. keep only tokens for which ``str.isalnum()`` is true,
      4. drop English stop words,
      5. stem every remaining token with the module-level stemmer ``ps``.

    Args:
        text: The raw message as a ``str``.

    Returns:
        The processed tokens joined by single spaces, e.g.
        ``'Go until jurong point, crazy'`` -> ``'go jurong point crazi'``.
    """
    # Set membership replaces re-reading and scanning the stop-word list for every token.
    stop_words = _english_stop_words()
    tokens = nltk.word_tokenize(text.lower())

    # No separate punctuation test is needed: a token that passes isalnum()
    # contains no punctuation characters.
    return " ".join(
        ps.stem(token)
        for token in tokens
        if token.isalnum() and token not in stop_words
    )

In [18]:
# Fetch the 'punkt_tab' tokenizer data before transform_text is first called.
# NOTE(review): presumably required by nltk.word_tokenize in the installed NLTK version — confirm.
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/adityamishra/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [19]:
# Quick check of transform_text on a short sample sentence
transform_text('Go until jurong point, crazy')

'go jurong point crazi'

In [20]:
# Run every message through transform_text and store the result in a new column
df['transformed_text'] = df['text'].map(transform_text)
df.head()   # preview the frame with the new column

Unnamed: 0,target,text,transformed_text
0,0,"Go until jurong point, crazy.. Available only ...",go jurong point crazi avail bugi n great world...
1,0,Ok lar... Joking wif u oni...,ok lar joke wif u oni
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,free entri 2 wkli comp win fa cup final tkt 21...
3,0,U dun say so early hor... U c already then say...,u dun say earli hor u c alreadi say
4,0,"Nah I don't think he goes to usf, he lives aro...",nah think goe usf live around though


In [21]:
# Represent each processed message as a TF-IDF vector limited to 500 features.
# NOTE(review): the vectorizer is fit on all rows before the train/test split,
# so its vocabulary and IDF weights are learned from the test rows as well.
tfid = TfidfVectorizer(max_features=500)
tfidf_matrix = tfid.fit_transform(df['transformed_text'])
X = tfidf_matrix.toarray()     # dense feature matrix
y = df['target'].to_numpy()    # encoded labels

### 4. Train-Test-Split

In [22]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=2,
)

In [24]:
# Candidate models for the comparison.
# The decision tree and the liblinear logistic regression are seeded with the
# same random_state=2 as the ensemble models so that reruns reproduce the scores.

# Support vector classifier with a sigmoid kernel
svc = SVC(kernel="sigmoid", gamma=1.0)
# K-nearest neighbours with library defaults
knc = KNeighborsClassifier()
# Multinomial naive Bayes with library defaults
mnb = MultinomialNB()
# Decision tree limited to depth 5
dtc = DecisionTreeClassifier(max_depth=5, random_state=2)
# Logistic regression with an L1 penalty (liblinear solver)
lrc = LogisticRegression(solver='liblinear', penalty='l1', random_state=2)

# Ensemble and boosting models, 50 estimators each
rfc = RandomForestClassifier(n_estimators=50, random_state=2)
abc = AdaBoostClassifier(n_estimators=50, random_state=2)
bc = BaggingClassifier(n_estimators=50, random_state=2)
etc = ExtraTreesClassifier(n_estimators=50, random_state=2)
gbdt = GradientBoostingClassifier(n_estimators=50, random_state=2)
xgb = XGBClassifier(n_estimators=50, random_state=2)

In [None]:
# Display name -> estimator, in the order the models are evaluated
clfs = dict([
    ('SVC', svc),
    ('KNN', knc),
    ('NB', mnb),
    ('DT', dtc),
    ('LR', lrc),
    ('RF', rfc),
    ('Adaboost', abc),
    ('Bgc', bc),
    ('ETC', etc),
    ('GBDT', gbdt),
    ('xgb', xgb),
])

### 5. Model Evaluation

In [26]:
def train_classifier(clfs, X_train, y_train, X_test, y_test):
    """Fit one estimator on the training split and score it on the test split.

    Args:
        clfs: A single estimator exposing ``fit`` and ``predict``.
        X_train: Training features.
        y_train: Training labels.
        X_test: Test features.
        y_test: Test labels.

    Returns:
        A ``(accuracy, precision)`` tuple computed with ``accuracy_score`` and
        ``precision_score`` on the test-split predictions.
    """
    clfs.fit(X_train, y_train)
    predictions = clfs.predict(X_test)
    return (
        accuracy_score(y_test, predictions),
        precision_score(y_test, predictions),
    )

In [27]:
# Train and score every classifier, collecting the metrics in dictionary order.
# The loop variable is `clf`, not `clfs`: reusing the dictionary's name would
# overwrite `clfs` with the last estimator and make this cell fail when re-run.
accuracy_scores = []
precision_scores = []
for name, clf in clfs.items():
    current_accuracy, current_precision = train_classifier(
        clf, X_train, y_train, X_test, y_test)
    print()
    print("For: ", name)
    print("Accuracy: ", current_accuracy)
    print("Precision: ", current_precision)

    accuracy_scores.append(current_accuracy)
    precision_scores.append(current_precision)


For:  SVC
Accuracy:  0.9661508704061895
Precision:  0.9327731092436975

For:  KNN
Accuracy:  0.9274661508704062
Precision:  1.0

For:  NB
Accuracy:  0.9709864603481625
Precision:  0.9655172413793104

For:  DT
Accuracy:  0.9361702127659575
Precision:  0.9

For:  LR
Accuracy:  0.9622823984526112
Precision:  0.9541284403669725

For:  RF
Accuracy:  0.971953578336557
Precision:  0.943089430894309

For:  Adaboost
Accuracy:  0.9235976789168279
Precision:  0.8734177215189873

For:  Bgc
Accuracy:  0.965183752417795
Precision:  0.9180327868852459

For:  ETC
Accuracy:  0.9729206963249516
Precision:  0.9296875

For:  GBDT
Accuracy:  0.9506769825918762
Precision:  0.9393939393939394

For:  xgb
Accuracy:  0.9700193423597679
Precision:  0.9572649572649573
