<a href="https://colab.research.google.com/github/adityapatil4141/Sms-Spam-Classifier/blob/main/Sms_Spam_Classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Importing necessary libraries
import pandas as pd # Data manipulation.
import numpy as np # Mathematical operations.
import matplotlib.pyplot as plt # Visualization
import seaborn as sns # Visualization

In [2]:
# Checking scikit-learn (sklearn)'s version.
# We need to avoid the 'InconsistentVersionWarning'.
# In later steps, when building the website, the sklearn versions on Colab and Streamlit should be the same.
!pip show scikit-learn

Name: scikit-learn
Version: 1.2.2
Summary: A set of python modules for machine learning and data mining
Home-page: http://scikit-learn.org
Author: 
Author-email: 
License: new BSD
Location: /usr/local/lib/python3.10/dist-packages
Requires: joblib, numpy, scipy, threadpoolctl
Required-by: bigframes, fastai, imbalanced-learn, librosa, mlxtend, qudida, sklearn-pandas, yellowbrick


In [None]:
# Mounting google drive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Reading DataFrame with encoding as "ISO-8859-1".
df = pd.read_csv("/content/drive/MyDrive/work files /sms spam classifier/spam.csv",encoding = "ISO-8859-1")

In [None]:
# Printing first 3 rows of dataset.
df.head(3)

In [None]:
# Checking for shape of dataset.
df.shape

In [None]:
# Counting corresponding values of categories.
df['v1'].value_counts()

# Data Cleaning

In [None]:
# Getting dataframe information.
df.info()

In [None]:
# Dropping unnecessary columns from the dataframe.
df.drop(columns=['Unnamed: 2','Unnamed: 3','Unnamed: 4'],inplace=True)

In [None]:
# Renaming remaining columns for easy understanding.
df.rename(columns={'v1':'target','v2':'text'},inplace=True)

In [None]:
# Printing first 3 rows of dataframe.
df.head(3)

In [None]:
# Applying LabelEncoder.
# LabelEncoder converts non-numerical values to numerical values.
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

df['target'] =le.fit_transform(df['target'])


In [None]:
# Printing first 3 rows of dataset.
df.head(3)

In [None]:
# Finding null values from the dataset.
df.isnull().sum()

In [None]:
# Checking for duplicates from the dataset.
df.duplicated().sum()

In [None]:
# Dropping duplicate rows from the dataset.
# (keep='first'): drop duplicates except for the first occurrence.
df = df.drop_duplicates(keep='first')

In [None]:
# Checking for duplicates from the dataset.
df.duplicated().sum()

In [None]:
# Checking dataframe shape.
df.shape

# EDA

In [None]:
# Plotting a pie chart to check which target category has the highest percentage of value counts.
# value_counts() orders the classes by frequency, so the wedge labels are looked up
# from its index instead of being hard-coded in an assumed order.
target_counts = df['target'].value_counts()
target_names = {0: 'hams', 1: 'spams'}  # encoded target: 0 = ham, 1 = spam
plt.pie(target_counts,labels=[target_names[code] for code in target_counts.index],autopct='%0.2f')
plt.show()

# Data is imbalanced

In [None]:
# Importing nltk.
import nltk

In [None]:
# Punkt - divides a text into a list of sentences.
nltk.download('punkt')

In [None]:
# Finding number of characters in text column.
df['num_characters'] = df['text'].apply(len)

In [None]:
#num of word

df['num_words'] = df['text'].apply(lambda x:len(nltk.word_tokenize(x)))

In [None]:
# number of sentences:

df['num_sentences'] = df['text'].apply(lambda x: len(nltk.sent_tokenize(x)))

In [None]:
# Printing first 3 rows of dataset.
df.head(3)

In [None]:
# Describing specific columns.
df[['num_characters','num_words','num_sentences']].describe()

In [None]:
# Describing specific columns with target==0(non-spam messages/texts).
df[df['target']==0][['num_characters','num_words','num_sentences']].describe()

In [None]:
# Describing specific columns with target==1 (spam messages/texts).
df[df['target']==1][['num_characters','num_words','num_sentences']].describe()

In [None]:
# We can clearly see that the average character length of spam messages is larger than that of ham messages.

In [None]:
plt.figure(figsize=(12,5))
sns.histplot(df[df['target']==0]['num_characters'])
sns.histplot(df[df['target']==1]['num_characters'],color='red')

In [None]:
plt.figure(figsize=(12,5))
sns.histplot(df[df['target']==0]['num_words'])
sns.histplot(df[df['target']==1]['num_words'],color='red')

In [None]:
# Correlate only the numeric columns: the dataframe also holds the raw message text,
# which has no correlation and makes DataFrame.corr() fail on recent pandas versions.
sns.heatmap(df.select_dtypes(include='number').corr(),annot=True)

# Data Preprocessing

In [None]:
# stopwords - words which adds no meaning to the sentence (eg - is, are, to, as, etc).
nltk.download('stopwords')

In [None]:
# Importing stop words.
from nltk.corpus import stopwords
import string

In [None]:
# Importing PorterStemmer.
# Stemming removes the suffixes from an English word to obtain its stem.
# Some examples of words that stem to the root word "like" include:
# "likes"
# "liked"
# "likely"
# "liking"

from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()


In [None]:
# Creating a function which will do the following things:
# 1. Convert the text to lowercase.
# 2. Keep only the alphanumeric (alnum) tokens and create a new list from them.
# 3. Remove any stopwords or punctuation from the values in the list.
# 4. Apply stemming.
# 5. join() - takes all the elements of an iterable and joins them into a single string.
def text_transformer(text):
  """Normalise one message into a space-separated string of stemmed tokens.

  Steps: lower-case the text, tokenize it with ``nltk.word_tokenize``, keep only
  alphanumeric tokens, drop English stop words, stem each remaining token with
  the Porter stemmer ``ps`` and join the stems with single spaces.

  Args:
    text: the raw message string.

  Returns:
    The transformed message as a single string (empty if no token survives).
  """
  # Load the stop-word list once per call and keep it in a set; the original
  # re-read the list for every token and searched it linearly.
  stop_words = set(stopwords.words('english'))

  tokens = nltk.word_tokenize(text.lower())

  # An alphanumeric token can never be a piece of string.punctuation, so the
  # isalnum() filter already removes punctuation and no separate check is needed.
  kept = [tok for tok in tokens if tok.isalnum() and tok not in stop_words]

  return " ".join(ps.stem(tok) for tok in kept)




In [None]:
# Applying function on the dataframe.
df['transformed_text'] = df['text'].apply(text_transformer)

In [None]:
# Printing first 3 rows of the dataframe.
df.head(3)

In [None]:
# WordCloud - a data visualization technique used for representing text data in which the size of each word indicates its frequency or importance.

In [None]:
from wordcloud import WordCloud
wc = WordCloud(width=500, height=500,min_font_size=10,background_color='white')

In [None]:
span_wc =wc.generate(df[df['target']==1]['transformed_text'].str.cat(sep="  "))

In [None]:
plt.imshow(span_wc)

In [None]:
ham_wc = wc.generate(df[df['target']==0]['transformed_text'].str.cat(sep=" ")) # getting str and concatenate on space to 'transformed_text'

In [None]:
plt.imshow(ham_wc)

In [None]:
# Collecting every word of the transformed spam messages (used to find the most used words in spam):
spam_messages = df[df['target']==1]['transformed_text'].tolist() # list of message strings
# Flatten: split each message on whitespace and gather all of its words in order.
spam_corpus = [word for message in spam_messages for word in message.split()]


In [None]:
len(spam_corpus)

In [None]:
from collections import Counter # creates a dictionary-like count of the occurrences of each word
# Plotting a barplot of the 30 most common words.
# The counts are computed once and the columns are named so the axes get readable labels.
spam_top_words = pd.DataFrame(Counter(spam_corpus).most_common(30),columns=['word','count'])
sns.barplot(data=spam_top_words,x='word',y='count')
plt.xticks(rotation='vertical')
plt.show()



In [None]:
# Collecting every word of the transformed ham messages, in message order.
ham_messages = df[df['target']==0]['transformed_text'].tolist()
ham_corpus = [word for message in ham_messages for word in message.split()]

In [None]:
ham_counter = Counter(ham_corpus).most_common(30)

In [None]:
# Build the (word, count) frame once and name its columns so the axes get readable labels.
ham_top_words = pd.DataFrame(ham_counter,columns=['word','count'])
sns.barplot(data=ham_top_words,x='word',y='count')
plt.xticks(rotation='vertical')
plt.show()

# Model Building

In [None]:
# We know that the Naive Bayes algorithm works best on textual data.
# Naive Bayes needs numerical data,
# so we have to convert the text to numerical data/vectors,
# using bag of words (word frequencies) or TF-IDF.

In [None]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
tfidf = TfidfVectorizer(max_features=3000)
cv = CountVectorizer()

In [None]:
# Bag-of-words baseline: this first round of experiments uses the CountVectorizer (`cv`),
# which was otherwise created but never used. The TF-IDF features are built separately
# in the later "WITH TFIDF VECTORIZER" section, so the two rounds are no longer identical.
x = cv.fit_transform(df['transformed_text']).toarray()

In [None]:
x.shape

In [None]:
y = df['target'].values

In [None]:
y

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=2)

In [None]:
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.metrics import accuracy_score, confusion_matrix,precision_score


In [None]:
gnb =GaussianNB()
mnb = MultinomialNB()
bnb = BernoulliNB()

In [None]:
gnb.fit(x_train,y_train)
y_pred1 = gnb.predict(x_test)
print(accuracy_score(y_test,y_pred1))
print(confusion_matrix(y_test,y_pred1))
print(precision_score(y_test,y_pred1 ))

In [None]:
mnb.fit(x_train,y_train)
y_pred2 = mnb.predict(x_test)
print(accuracy_score(y_test,y_pred2))
print(confusion_matrix(y_test,y_pred2))
print(precision_score(y_test,y_pred2))

In [None]:
# Till now, Bernoulli is performing well.

bnb.fit(x_train,y_train)
y_pred3 = bnb.predict(x_test)
print(accuracy_score(y_test,y_pred3))
print(confusion_matrix(y_test,y_pred3))
print(precision_score(y_test,y_pred3 ))

WITH TFIDF VECTORIZER:


In [None]:
# WITH TFIDF VECTORIZER:

x = tfidf.fit_transform(df['transformed_text']).toarray()

In [None]:
y = df['target'].values

In [None]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=2)

In [None]:
y_test

In [None]:
gnb.fit(x_train,y_train)
y_pred4 = gnb.predict(x_test)
print(accuracy_score(y_test,y_pred4))
print(confusion_matrix(y_test,y_pred4))
print(precision_score(y_test,y_pred4))

In [None]:
mnb.fit(x_train,y_train)
y_pred5 = mnb.predict(x_test)
print(accuracy_score(y_test,y_pred5))
print(confusion_matrix(y_test,y_pred5))
print(precision_score(y_test,y_pred5))

In [None]:
bnb.fit(x_train,y_train)
y_pred6 = bnb.predict(x_test)
print(accuracy_score(y_test,y_pred6))
print(confusion_matrix(y_test,y_pred6))
print(precision_score(y_test,y_pred6))

In [None]:
# Here we choose TfidfVectorizer --> MultinomialNB

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier

In [None]:
# Candidate classifiers to compare. Every estimator that accepts a seed gets
# random_state = 2 so the comparison is reproducible.
svc = SVC(kernel = 'sigmoid',gamma=1.0) #Support Vector Classification
mlb = MultinomialNB()
knc = KNeighborsClassifier()
dtc = DecisionTreeClassifier(max_depth=5)
lrc = LogisticRegression(solver = 'liblinear',penalty='l1')
rfc = RandomForestClassifier(n_estimators = 50, random_state = 2)
abc = AdaBoostClassifier(n_estimators = 50, random_state = 2)
bc = BaggingClassifier(n_estimators = 50, random_state = 2)
etc = ExtraTreesClassifier(n_estimators = 50 , random_state = 2)
gbc = GradientBoostingClassifier(n_estimators = 50, random_state = 2)
# Fixed the keyword spelling: it was `random_State`, which is not the seed parameter,
# so the seed was never applied to XGBoost.
xgc = XGBClassifier(n_estimators = 50 , random_state = 2)


In [None]:
# NOTE(review): `et` is not referenced anywhere in the visible cells; this looks like
# an accidental editor auto-import - confirm and remove.
from nltk.corpus.reader import et
# Registry of the candidate classifiers, keyed by the short label that is printed
# and tabulated next to each classifier's scores.
clfs = {
    'svc' : svc,
    'Nb' : mlb,
    'knc' : knc,
    'dtc' : dtc,
    'lrc' : lrc,
    'rfc' : rfc,
    'abc' : abc,
    'bc' : bc,
    'etc' : etc,
    'gbc' : gbc,
    'xgc' : xgc
}

In [None]:
def train_classifier(clf, x_train,y_train,x_test,y_test):
  """Fit a classifier on the training split and score it on the test split.

  Args:
    clf: estimator object providing ``fit(x, y)`` and ``predict(x)``.
    x_train: training features passed to ``clf.fit``.
    y_train: training labels passed to ``clf.fit``.
    x_test: test features passed to ``clf.predict``.
    y_test: true test labels the predictions are compared against.

  Returns:
    Tuple ``(accuracy, precision)`` as computed by ``accuracy_score`` and
    ``precision_score`` on the test predictions.
  """
  clf.fit(x_train,y_train)
  # Predict once; the original called predict twice and threw the first result away.
  y_pred = clf.predict(x_test)
  accuracy = accuracy_score(y_test,y_pred)
  precision = precision_score(y_test,y_pred)

  return accuracy , precision

In [None]:
# Train and score every registered classifier, keeping the metrics in
# registration order so they line up with the classifier labels.
accuracy_scores, precision_scores = [], []

for name, clf in clfs.items():
  current_accuracy, current_precision = train_classifier(clf, x_train, y_train, x_test, y_test)

  # Report the scores of the classifier that was just evaluated.
  print('for', name)
  print('accuracy = ', current_accuracy)
  print('precision = ', current_precision)

  # Record the scores for the comparison table.
  accuracy_scores += [current_accuracy]
  precision_scores += [current_precision]







In [None]:
# Tabulate accuracy and precision per classifier, best precision first.
performance_table = pd.DataFrame({
    'Algorithm': clfs.keys(),
    'Accuracy': accuracy_scores,
    'Precision': precision_scores,
})
performance_df = performance_table.sort_values('Precision', ascending=False)

In [None]:
performance_df

In [None]:
performance_df1 = pd.melt(performance_df,id_vars='Algorithm')


In [None]:
performance_df1

In [None]:
sns.catplot(x = 'Algorithm', y = 'value', hue='variable',data=performance_df1, kind='bar',height=5)
plt.ylim(0.5,1.0)
plt.xticks(rotation='vertical')
plt.show()

In [None]:
import pickle
# Persist the fitted TF-IDF vectorizer and the MultinomialNB model.
# The `with` blocks close each file, so the pickles are fully flushed to disk.
with open('vectorizer2.pkl','wb') as vectorizer_file:
  pickle.dump(tfidf,vectorizer_file)
with open('model2.pkl','wb') as model_file:
  pickle.dump(mnb,model_file)