In [None]:
import numpy as np
import pandas as pd
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
# Fetch NLTK's stop-word corpus; stopwords.words('english') is used later on.
nltk.download('stopwords')

In [None]:
# Mount Google Drive at /content/drive (Colab only); the CSV is read from there.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Load the SMS spam CSV from the Drive mount point, decoding it with the "latin" codec.
data = pd.read_csv(
    '/content/drive/My Drive/Datasets/SMS_Spam/spam.csv',
    encoding="latin",
)

In [None]:
# Preview the first rows of the freshly loaded frame.
data.head()

In [None]:
# (rows, columns) of the loaded frame.
data.shape

In [None]:
# Column dtypes and non-null counts.
data.info()

In [None]:
# Discard the three 'Unnamed' columns. errors='ignore' keeps this cell
# re-runnable: without it a second run raises KeyError because the columns
# are already gone.
data = data.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], errors='ignore')

In [None]:
# Check the frame again after dropping the 'Unnamed' columns.
data.head()

In [None]:
# Replace the terse v1/v2 headers with descriptive names, then preview the result.
data = data.rename(columns=dict(v1='label', v2='message'))
data.head()

In [None]:
# Descriptive statistics of the remaining columns, computed separately per label.
data.groupby('label').describe()

In [None]:
# Work on an independent copy of the message column so `data['message']` keeps the raw text.
data_copy = data['message'].copy()

In [None]:
def text_preprocess(text, stop_words=None):
    """Strip punctuation and stop words from a single message.

    Punctuation is removed first, so a word such as "don't" is compared
    against the stop-word list as "dont".

    Args:
        text: The raw message string.
        stop_words: Optional iterable of lower-case words to discard. When
            omitted, NLTK's English stop-word list is used.

    Returns:
        The surviving words, in their original case and order, joined by
        single spaces.
    """
    if stop_words is None:
        # Load the list once per call rather than once per word.
        stop_words = stopwords.words('english')
    # A set gives constant-time membership tests.
    stop_words = set(stop_words)
    text = text.translate(str.maketrans('', '', string.punctuation))
    return " ".join(word for word in text.split() if word.lower() not in stop_words)

In [None]:
# Clean every message: punctuation and English stop words are removed by text_preprocess.
data_copy = data_copy.apply(text_preprocess)

In [None]:
# Inspect the cleaned messages.
data_copy

In [None]:
# The first positional parameter of TfidfVectorizer is `input`, not
# `stop_words`, and recent scikit-learn releases accept keyword arguments
# only, so the stop-word list has to be named explicitly.
vectorizer = TfidfVectorizer(stop_words="english")

In [None]:
# Fit the vectorizer on the cleaned messages and build the TF-IDF feature matrix.
features = vectorizer.fit_transform(data_copy)

In [None]:
# Hold out 25% of the rows for evaluation; random_state pins the shuffle.
feature_train, feature_test, label_train, label_test = train_test_split(
    features,
    data['label'],
    random_state=111,
    test_size=0.25,
)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import accuracy_score

In [None]:
# Train an L1-penalised logistic regression (liblinear solver) on the
# training split, then report its accuracy on the held-out split.
lr_model = LogisticRegression(penalty='l1', solver='liblinear').fit(feature_train, label_train)
pred_lr = lr_model.predict(feature_test)
print("Accuracy Score for logistic regression", accuracy_score(label_test, pred_lr))

In [None]:
# Train a Bernoulli naive Bayes classifier on the training split, then
# report its accuracy on the held-out split.
bnb_model = BernoulliNB().fit(feature_train, label_train)
pred_bnb = bnb_model.predict(feature_test)
print("Accuracy Score for Bernoulli naive bayes", accuracy_score(label_test, pred_bnb))

Next, let's see whether stemming the words before vectorizing improves accuracy.

In [None]:
def stemmer(text, stem_fn=None):
    """Reduce every whitespace-separated word of ``text`` to its stem.

    Args:
        text: The string whose words should be stemmed.
        stem_fn: Optional callable mapping one word to its stem. When
            omitted, an English ``SnowballStemmer`` is created and its
            ``stem`` method is used.

    Returns:
        The stemmed words joined by single spaces, with no trailing space.
    """
    if stem_fn is None:
        # Build the stemmer once per call instead of once per word.
        stem_fn = SnowballStemmer("english").stem
    return " ".join(stem_fn(word) for word in text.split())

In [None]:
# Stem into a new Series instead of overwriting data_copy, so re-running this
# cell never stems text that has already been stemmed.
data_stemmed = data_copy.apply(stemmer)
# `stop_words` must be passed by keyword: the first positional parameter of
# TfidfVectorizer is `input`, and recent scikit-learn releases reject
# positional arguments altogether.
vectorizer = TfidfVectorizer(stop_words="english")
features = vectorizer.fit_transform(data_stemmed)

In [None]:
# Re-split with the same 25% hold-out and seed, now on the stemmed features.
feature_train, feature_test, label_train, label_test = train_test_split(
    features,
    data['label'],
    random_state=111,
    test_size=0.25,
)

In [None]:
# Refit the L1-penalised logistic regression on the current split and
# report its held-out accuracy.
lr_model = LogisticRegression(penalty='l1', solver='liblinear').fit(feature_train, label_train)
pred_lr = lr_model.predict(feature_test)
print("Accuracy Score for logistic regression", accuracy_score(label_test, pred_lr))

In [None]:
# Refit the Bernoulli naive Bayes classifier on the current split and
# report its held-out accuracy.
bnb_model = BernoulliNB().fit(feature_train, label_train)
pred_bnb = bnb_model.predict(feature_test)
print("Accuracy Score for Bernoulli naive bayes", accuracy_score(label_test, pred_bnb))

Let's try adding the message length as an extra feature.

In [None]:
# Store each message's character count in a new 'length' column, then preview the frame.
data['length'] = data['message'].apply(len)
data.head()

In [None]:
# Append the raw (unscaled) message length as one extra column to the right
# of the TF-IDF matrix. `.toarray()` yields a plain ndarray, whereas
# `.todense()` yields a np.matrix, which current scikit-learn estimators
# refuse to accept.
data_with_len = data['length'].to_numpy()
new_features = np.hstack((features.toarray(), data_with_len[:, None]))

In [None]:
# Split the length-augmented matrix (new_features), not the plain TF-IDF
# matrix, so the models trained next actually see the length column.
# np.asarray turns a np.matrix into the ndarray scikit-learn expects and is a
# no-op for an ndarray.
feature_train, feature_test, label_train, label_test = train_test_split(
    np.asarray(new_features), data['label'], test_size=0.25, random_state=111
)

In [None]:
# Fit the L1-penalised logistic regression on the latest training split and
# print its accuracy on the matching test split.
lr_model = LogisticRegression(penalty='l1', solver='liblinear').fit(feature_train, label_train)
pred_lr = lr_model.predict(feature_test)
print("Accuracy Score for logistic regression", accuracy_score(label_test, pred_lr))

In [None]:
# Fit the Bernoulli naive Bayes classifier on the latest training split and
# print its accuracy on the matching test split.
bnb_model = BernoulliNB().fit(feature_train, label_train)
pred_bnb = bnb_model.predict(feature_test)
print("Accuracy Score for Bernoulli naive bayes", accuracy_score(label_test, pred_bnb))