In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("../Data/spam.csv")
df.head()



In [3]:
df.shape



In [4]:
## Data Cleaning
## EDA
## Text Preprocessing

## 1. Data Cleaning

In [5]:
df.info()



In [6]:
# Remove the three 'Unnamed' columns. Reassigning (instead of inplace=True)
# with errors='ignore' keeps this cell safe to re-run after the columns are gone.
df = df.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], errors='ignore')

In [7]:
df.head()



In [8]:
df.isnull().sum()



In [9]:
# Rename the raw 'v1'/'v2' columns to the names used in the rest of the notebook
df = df.rename(columns={"v1": "target", "v2": "text"})

In [10]:
df.head()



In [11]:
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()

In [12]:
df['target'] = encoder.fit_transform(df['target'])

In [13]:
df.head()



In [14]:
# Check for duplicate values
df.duplicated().sum()



In [15]:
# remove duplicate
df = df.drop_duplicates(keep='first')

In [16]:
df.duplicated().sum()



In [17]:
df.shape



In [18]:
df.head()



## 2. EDA

In [19]:
df['target'].value_counts()



In [20]:
import matplotlib.pyplot as plt

# Take the slice labels from the counted class values (0 = ham, 1 = spam)
# rather than assuming the order in which value_counts() returns them.
class_counts = df['target'].value_counts()
class_labels = class_counts.index.map({0: "ham", 1: "spam"})
plt.pie(class_counts, labels=class_labels, autopct="%0.2f")
plt.show()



*Data is Imbalanced*

In [21]:
import nltk

In [22]:
nltk.download('punkt')





In [23]:
df["num_char"] = df["text"].apply(len)

In [24]:
df.head()



In [25]:
## num of words
df["num_words"] = df["text"].apply(lambda x: len(nltk.word_tokenize(x)))

In [26]:
df.head()



In [27]:
df["num_sentences"] = df["text"].apply(lambda x: len(nltk.sent_tokenize(x)))

In [28]:
df.head()



In [29]:
df.describe()



In [30]:
# ham
df[df['target'] == 0].describe()



In [31]:
#spam
df[df['target'] == 1].describe()



In [32]:
import seaborn as sns

In [33]:
# Overlay the character-count distributions of both classes; the labels and
# legend say which colour is which.
plt.figure(figsize=(20,10))
sns.histplot(df[df['target'] == 0]['num_char'], label='ham')
sns.histplot(df[df['target'] == 1]['num_char'], color='red', label='spam')
plt.legend()
plt.show()





In [34]:
# Overlay the word-count distributions of both classes; the labels and
# legend say which colour is which.
plt.figure(figsize=(20,10))
sns.histplot(df[df['target'] == 0]['num_words'], label='ham')
sns.histplot(df[df['target'] == 1]['num_words'], color='red', label='spam')
plt.legend()
plt.show()





In [35]:

sns.pairplot(data=df, hue='target')
plt.show()



In [36]:
# numeric_only expects a bool; True restricts the correlation to numeric
# columns so the text columns are left out.
sns.heatmap(df.corr(numeric_only=True), annot=True)





## 3. Text Preprocessing

- Lower case
- Tokenization
- Removing special characters
- Removing stop words and punctuation
- Stemming

In [37]:
import regex as re
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

In [38]:
df.head()



In [38]:
ss = SnowballStemmer("english")

In [40]:
# English stop words as a set, built once. stopwords.words() returns a list,
# and the previous version rebuilt and linearly scanned it for every token.
_STOP_WORDS = frozenset(stopwords.words('english'))


def text_transformation(text):
    """Normalise a single message before vectorisation.

    Steps: lower-case ``text``, replace every character outside ``[a-z0-9]``
    with a space, split on whitespace, drop English stop words and stem the
    remaining tokens with the notebook-level Snowball stemmer ``ss``.

    Parameters
    ----------
    text : str
        Raw message text.

    Returns
    -------
    str
        The stemmed tokens joined by single spaces.
    """
    tokens = re.sub('[^a-z0-9]', ' ', text.lower()).split()
    return " ".join(ss.stem(word) for word in tokens if word not in _STOP_WORDS)

In [41]:
text_transformation(df['text'][100])



In [42]:
df["transformed_text"] = df['text'].apply(text_transformation)

In [43]:
df.head()



## 4. Model Building

In [49]:
from sklearn.model_selection import train_test_split

In [50]:
x = df["transformed_text"]
y = df['target']

In [51]:
x



In [52]:
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.20, random_state=42)

In [48]:
x_train.shape, x_test.shape



In [49]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
cv = CountVectorizer()
tf = TfidfVectorizer()

In [50]:
x_train



In [51]:
x_train_cv = cv.fit_transform(x_train).toarray()
x_train_tf = tf.fit_transform(x_train).toarray()

In [52]:
x_test_cv = cv.transform(x_test).toarray()
x_test_tf = tf.transform(x_test).toarray()

In [53]:
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, precision_score

In [54]:
gnb_cv = GaussianNB()
gnb_tf = GaussianNB()
mnb_cv = MultinomialNB()
mnb_tf = MultinomialNB()
bnb_cv = BernoulliNB()
bnb_tf = BernoulliNB()

In [55]:
# Gaussian NB on CountVectorizer features: fit, predict, then report
# accuracy, confusion matrix and precision on the test split.
y_pred_gnb = gnb_cv.fit(x_train_cv, y_train).predict(x_test_cv)
print(
    accuracy_score(y_test, y_pred_gnb),
    confusion_matrix(y_test, y_pred_gnb),
    precision_score(y_test, y_pred_gnb),
    sep="\n",
)



In [56]:
# Gaussian NB on TF-IDF features: fit, predict, then report accuracy,
# confusion matrix and precision on the test split.
y_pred_gnb = gnb_tf.fit(x_train_tf, y_train).predict(x_test_tf)
print(
    accuracy_score(y_test, y_pred_gnb),
    confusion_matrix(y_test, y_pred_gnb),
    precision_score(y_test, y_pred_gnb),
    sep="\n",
)



In [57]:
# Multinomial NB on CountVectorizer features: fit, predict, then report
# accuracy, confusion matrix and precision on the test split.
y_pred_mnb = mnb_cv.fit(x_train_cv, y_train).predict(x_test_cv)
print(
    accuracy_score(y_test, y_pred_mnb),
    confusion_matrix(y_test, y_pred_mnb),
    precision_score(y_test, y_pred_mnb),
    sep="\n",
)



In [58]:
# Multinomial NB on TF-IDF features. The predictions are stored in y_pred_mnb
# so the metrics below describe this model, not the earlier CountVectorizer run.
mnb_tf.fit(x_train_tf, y_train)
y_pred_mnb = mnb_tf.predict(x_test_tf)
print(accuracy_score(y_test, y_pred_mnb))
print(confusion_matrix(y_test, y_pred_mnb))
print(precision_score(y_test, y_pred_mnb))



In [59]:
# Bernoulli NB on CountVectorizer features: fit, predict, then report
# accuracy, confusion matrix and precision on the test split.
y_pred_bnb = bnb_cv.fit(x_train_cv, y_train).predict(x_test_cv)
print(
    accuracy_score(y_test, y_pred_bnb),
    confusion_matrix(y_test, y_pred_bnb),
    precision_score(y_test, y_pred_bnb),
    sep="\n",
)



In [60]:
# Bernoulli NB on TF-IDF features: fit, predict, then report accuracy,
# confusion matrix and precision on the test split.
y_pred_bnb = bnb_tf.fit(x_train_tf, y_train).predict(x_test_tf)
print(
    accuracy_score(y_test, y_pred_bnb),
    confusion_matrix(y_test, y_pred_bnb),
    precision_score(y_test, y_pred_bnb),
    sep="\n",
)



In [61]:
def model_build(x_train, x_test, y_train, y_test):
    """Train each Naive Bayes variant on each vectoriser and print its scores.

    GaussianNB, MultinomialNB and BernoulliNB are each fitted on
    CountVectorizer and on TfidfVectorizer features. Every vectoriser is
    fitted on ``x_train`` only and then applied to ``x_test``. For each
    pairing the accuracy, confusion matrix and precision on the test split
    are printed.

    Parameters
    ----------
    x_train, x_test : iterable of str
        Training and test documents.
    y_train, y_test : array-like
        Labels matching ``x_train`` and ``x_test``.

    Returns
    -------
    None
        Results are only printed.
    """
    classifiers = [GaussianNB(), MultinomialNB(), BernoulliNB()]
    vectorizers = [CountVectorizer(), TfidfVectorizer()]

    # The vectorised features do not depend on the classifier, so build them
    # once per vectoriser instead of once per (classifier, vectoriser) pair.
    vectorised = []
    for vect in vectorizers:
        x_train_vectr = vect.fit_transform(x_train).toarray()
        x_test_vect = vect.transform(x_test).toarray()
        vectorised.append((vect, x_train_vectr, x_test_vect))

    for model in classifiers:
        for vect, x_train_vectr, x_test_vect in vectorised:
            # model training (the same estimator is re-fitted for each vectoriser)
            model_train = model.fit(x_train_vectr, y_train)
            y_pred = model_train.predict(x_test_vect)

            # evaluation on the test split
            accuracy = accuracy_score(y_test, y_pred)
            confusion_matrx = confusion_matrix(y_test, y_pred)
            precision_scr = precision_score(y_test, y_pred)

            # str(estimator)[:-2] strips the trailing "()" from the default repr
            print("""
                    'model' :               {},
                    'vector_algo':          {},
                    'accuracy' :            {},
                    'confusion_matrix' :    {},
                    'precision_score' :     {}
                """.format( (str(model)[:-2]), str(vect)[:-2], accuracy, confusion_matrx, precision_scr))






In [62]:
model_build(x_train, x_test, y_train, y_test)



In [63]:
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, precision_score

In [64]:
gnb = GaussianNB()
mnb = MultinomialNB()
bnb = BernoulliNB()

In [65]:
clfs = {
    'gnb': gnb,
    'mnb': mnb,
    'bnb': bnb
}

In [66]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
cv = CountVectorizer(max_features=5000, ngram_range=(2,3))
tf = TfidfVectorizer(max_features=5000, ngram_range=(1,2))

In [67]:
vectrs = {
    'cv': cv,
    'tfidf': tf
}

In [68]:
sd = pd.concat([x_train, x_test])
sd.shape



In [69]:
def train_model(clf, vectr, x_train, x_test, y_train, y_test):
    """Vectorise the text, fit ``clf`` and score it on the test split.

    ``vectr`` is fitted on ``x_train`` and then applied to ``x_test``; both
    results are converted to dense arrays. The confusion matrix is printed
    and ``(accuracy, precision)`` is returned.
    """
    # Fit the vectoriser on the training documents only
    train_features = vectr.fit_transform(x_train).toarray()
    test_features = vectr.transform(x_test).toarray()

    clf.fit(train_features, y_train)
    predictions = clf.predict(test_features)

    scores = (
        accuracy_score(y_test, predictions),
        precision_score(y_test, predictions),
    )
    print(confusion_matrix(y_test, predictions))

    return scores

In [70]:
train_model(mnb, tf, x_train, x_test, y_train, y_test)





In [71]:
# Score every classifier/vectoriser pairing and collect the results in
# parallel lists (one entry per pairing).
algorithms, vectorizer_alg = [], []
accuracy_scores, precision_scores = [], []

for name, clf in clfs.items():
    for vect_name, vector in vectrs.items():
        current_accuracy, current_precision = train_model(
            clf, vector, x_train, x_test, y_train, y_test
        )

        print("Alg - ", name)
        print("Vector - ", vect_name)
        print("Accuracy - ", current_accuracy)
        print("Precision - ", current_precision)

        algorithms += [name]
        vectorizer_alg += [vect_name]
        accuracy_scores += [current_accuracy]
        precision_scores += [current_precision]



In [72]:
performance_df = pd.DataFrame({'Algorithms' : algorithms, 'Vectorization_technique': vectorizer_alg, 'Accuracy': accuracy_scores, 'Precision' : precision_scores})
performance_df



In [73]:
## we can use bnb and tfidf

In [74]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import BernoulliNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier, GradientBoostingClassifier

In [75]:
lrc = LogisticRegression()
svc = SVC()
bnb = BernoulliNB()
dtc = DecisionTreeClassifier(max_depth=5)
knc = KNeighborsClassifier(n_neighbors=2)
rfc = RandomForestClassifier(n_estimators=50)
abc = AdaBoostClassifier(n_estimators=50)
bc = BaggingClassifier(n_estimators=50)
etc = ExtraTreesClassifier(n_estimators=20, random_state=2)
gbdt = GradientBoostingClassifier(n_estimators=50)


In [47]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
cv = CountVectorizer(max_features=5000, ngram_range=(1,2))
tf = TfidfVectorizer(max_features=5000)

In [77]:
clfs = {
    # 'LR': lrc,
    'SVC': svc,
    'NB': bnb,
    # 'DT' : dtc,
    'KNC' : knc,
    # 'RF': rfc,
    # 'Ada' : abc,
    # 'BC' : bc,
    'ETC' : etc,
    # 'GBDT' : gbdt
    }

In [78]:
vectrs = {
    # 'cv': cv,
    'tfidf': tf
}

In [79]:
x_train.shape



In [80]:
x_train_vect = tf.fit_transform(x_train).toarray()

In [81]:
bnb.fit(x_train_vect, y_train)



In [82]:
# Vectorise the test split with the already-fitted TF-IDF vectoriser before
# predicting, and score the predictions produced in this cell.
x_test_vect = tf.transform(x_test).toarray()
pred = bnb.predict(x_test_vect)
print(pred)
accuracy_score(y_test, pred)



In [None]:
svc.fit(x_train_vect, y_train)



In [None]:
x_test_vect = tf.transform(x_test).toarray()

In [None]:
# Only the last expression of a cell is displayed, so return both metrics
# together as (accuracy, precision) instead of discarding the accuracy.
pred = bnb.predict(x_test_vect)
accuracy_score(y_test, pred), precision_score(y_test, pred)



In [None]:
# Precision must compare the labels with the predictions; scoring y_test
# against itself always gives 1.0. Both metrics are shown as
# (accuracy, precision) because only the last expression is displayed.
y_pred = svc.predict(x_test_vect)
accuracy_score(y_test, y_pred), precision_score(y_test, y_pred)



In [None]:
def train_model_1(clf, vectr, x_train, x_test, y_train, y_test):
    """Vectorise the text, fit ``clf`` and score it on the test split.

    This was a line-for-line copy of ``train_model``; it now delegates to it
    so the two cannot drift apart. The name is kept for existing calls.

    Returns
    -------
    tuple
        ``(accuracy, precision)`` on the test split. The confusion matrix is
        printed as a side effect.
    """
    return train_model(clf, vectr, x_train, x_test, y_train, y_test)

In [None]:
# Evaluate each selected classifier with each selected vectoriser and keep
# the scores in parallel lists for the summary table.
algorithms = list()
vectorizer_alg = list()
accuracy_scores = list()
precision_scores = list()

for name, clf in clfs.items():
    for vect_name, vector in vectrs.items():
        current_accuracy, current_precision = train_model_1(
            clf, vector, x_train, x_test, y_train, y_test
        )

        print("Alg - ", name)
        print("Vector - ", vect_name)
        print("Accuracy - ", current_accuracy)
        print("Precision - ", current_precision)

        algorithms.extend([name])
        vectorizer_alg.extend([vect_name])
        accuracy_scores.extend([current_accuracy])
        precision_scores.extend([current_precision])



In [None]:
pred_df = pd.DataFrame({'Algorithms' : algorithms, 'Vectorization_technique': vectorizer_alg, 'Accuracy': accuracy_scores, 'Precision' : precision_scores})

In [None]:
pred_df.sort_values(by = ['Precision'], ascending=False)



# svc with tfidf

In [None]:
import pickle

In [None]:
# Persist the fitted vectoriser and the SVC. The 'with' blocks close each
# file, so the pickles are flushed to disk and no handles are leaked.
with open('../Models/vectorizer.pkl', 'wb') as vectorizer_file:
    pickle.dump(tf, vectorizer_file)
with open('../Models/model.pkl', 'wb') as model_file:
    pickle.dump(svc, model_file)

In [None]:
# Persist the Bernoulli NB model; the 'with' block closes the file afterwards.
with open('../Models/bnbmodel.pkl', 'wb') as bnb_file:
    pickle.dump(bnb, bnb_file)

In [53]:
# English stop words as a set, built once. stopwords.words() returns a list,
# and the previous version rebuilt and linearly scanned it for every token.
_STOP_WORDS = frozenset(stopwords.words('english'))


def text_transformation(text):
    """Normalise a single message before vectorisation (no character filtering).

    Steps: lower-case ``text``, split on whitespace, drop English stop words
    and stem the remaining tokens with the notebook-level Snowball stemmer
    ``ss``. Unlike the earlier definition of this function, punctuation and
    other special characters are left in place, so a token such as "free!"
    is kept as-is before stemming.

    Parameters
    ----------
    text : str
        Raw message text.

    Returns
    -------
    str
        The stemmed tokens joined by single spaces.
    """
    tokens = text.lower().split()
    return " ".join(ss.stem(word) for word in tokens if word not in _STOP_WORDS)

In [54]:
df['transformed_text'] = df['text'].apply(text_transformation)
df.head()



In [57]:
x = df["transformed_text"]
y = df['target']

In [58]:
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.30, random_state=42)

In [59]:
x_train.shape, x_test.shape, y_train.shape, y_test.shape



In [60]:
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, precision_score

In [75]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
cv = CountVectorizer(max_features=5000, ngram_range=(2,3))
tf = TfidfVectorizer(ngram_range=(1,2))

In [76]:
x_train_tf = tf.fit_transform(x_train).toarray()
x_test_tf = tf.transform(x_test).toarray()

In [77]:
gnb = GaussianNB()
mnb = MultinomialNB()
bnb = BernoulliNB(alpha=2)
svc = SVC(random_state=2)

In [78]:
# Gaussian NB on the TF-IDF features: accuracy, confusion matrix, precision
y_pred = gnb.fit(x_train_tf, y_train).predict(x_test_tf)
print(
    accuracy_score(y_test, y_pred),
    confusion_matrix(y_test, y_pred),
    precision_score(y_test, y_pred),
    sep="\n",
)



In [79]:
# Bernoulli NB on the TF-IDF features: accuracy, confusion matrix, precision
y_pred = bnb.fit(x_train_tf, y_train).predict(x_test_tf)
print(
    accuracy_score(y_test, y_pred),
    confusion_matrix(y_test, y_pred),
    precision_score(y_test, y_pred),
    sep="\n",
)





In [80]:
# Multinomial NB on the TF-IDF features: accuracy, confusion matrix, precision
y_pred = mnb.fit(x_train_tf, y_train).predict(x_test_tf)
print(
    accuracy_score(y_test, y_pred),
    confusion_matrix(y_test, y_pred),
    precision_score(y_test, y_pred),
    sep="\n",
)



In [81]:
# SVC on the TF-IDF features: accuracy, confusion matrix, precision
y_pred = svc.fit(x_train_tf, y_train).predict(x_test_tf)
print(
    accuracy_score(y_test, y_pred),
    confusion_matrix(y_test, y_pred),
    precision_score(y_test, y_pred),
    sep="\n",
)

