In [3]:
import pandas as pd
# pandas for data operations/manipulation
import nltk
# nltk is a NLP toolkit used for almost everything in NLP tasks
from nltk.stem import PorterStemmer
import string
# here we make use of string to get punctuation list

# The very first step is reading our data from csv file and making a dataframe of it.

In [4]:
df = pd.read_csv("C:/Users/xbhi0/OneDrive/Desktop/NLP/spam.csv",encoding="ISO-8859-1")
df

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
...,...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,,,
5568,ham,Will Ì_ b going to esplanade fr home?,,,
5569,ham,"Pity, * was in mood for that. So...any other s...",,,
5570,ham,The guy did some bitching but I acted like i'd...,,,


# In the above dataframe we have few unnecessary columns ('unnamed:2','unnamed:3','unnamed:4') which are no use to us for the classification task.
so we will drop this columns to remove uncessary data

In [5]:
df = df.drop(columns=['Unnamed: 2','Unnamed: 3','Unnamed: 4'])
df

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [6]:
new_df = df['v2']
new_df

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                Will Ì_ b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: v2, Length: 5572, dtype: object

From here we will start the data preprocessing steps, beginning with tokenization. Here, we use word_tokenize from nltk to tokenize words in our data.

In [7]:
# tokens = nltk.sent_tokenize(new_df)
# tokens

def tokenize(text):
    text = ' '.join([i for i in nltk.word_tokenize(text)])
    return text

new_df = new_df.apply(tokenize)
new_df
    

0       Go until jurong point , crazy .. Available onl...
1                         Ok lar ... Joking wif u oni ...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor ... U c already then sa...
4       Nah I do n't think he goes to usf , he lives a...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568               Will Ì_ b going to esplanade fr home ?
5569    Pity , * was in mood for that . So ... any oth...
5570    The guy did some bitching but I acted like i '...
5571                          Rofl . Its true to its name
Name: v2, Length: 5572, dtype: object

The next step of preprocessing is STEMMING. STEMMING is the process of converting a word to its base form.

In [8]:
def stemmer(text):
    ps = PorterStemmer()
    text = ' '.join([ps.stem(i) for i in text.split()])
    return text

new_df = new_df.apply(stemmer)
new_df

0       go until jurong point , crazi .. avail onli in...
1                           ok lar ... joke wif u oni ...
2       free entri in 2 a wkli comp to win fa cup fina...
3       u dun say so earli hor ... u c alreadi then sa...
4       nah i do n't think he goe to usf , he live aro...
                              ...                        
5567    thi is the 2nd time we have tri 2 contact u. u...
5568                   will ì_ b go to esplanad fr home ?
5569    piti , * wa in mood for that . so ... ani othe...
5570    the guy did some bitch but i act like i 'd be ...
5571                            rofl . it true to it name
Name: v2, Length: 5572, dtype: object

Now, we created a list of punctuations with the help of string library. so that we can remove punctuations from our data(text).

In [9]:
punch = string.punctuation
punch

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

By following this method we have performed the above task of removing punctuations.

In [None]:
punch = string.punctuation

def remove_punch(text):
    text=text.split()
    res=[]
    for word in text:
        temp=''
        for alpha in word:
            if alpha not in punch:
                temp+=alpha
        res.append(temp)
    res=' '.join(res)
    return res

new_df_pf = new_df.apply(remove_punctuation)
new_df_pf

0       go until jurong point , crazi .. avail onli in...
1                           ok lar ... joke wif u oni ...
2       free entri in 2 a wkli comp to win fa cup fina...
3       u dun say so earli hor ... u c alreadi then sa...
4       nah i do n't think he goe to usf , he live aro...
                              ...                        
5567    thi is the 2nd time we have tri 2 contact u. u...
5568                   will ì_ b go to esplanad fr home ?
5569    piti , * wa in mood for that . so ... ani othe...
5570    the guy did some bitch but i act like i 'd be ...
5571                            rofl . it true to it name
Name: v2, Length: 5572, dtype: object

Stopwords consume a huge part of data and doesn't have much significance in the classification task, so we have to remove stopwords also to make our data as precise as possible. we perform this to improve the accuract(performance) of our prediction model.

In [12]:
#nltk.download('stopwords')
from nltk.corpus import stopwords

In [13]:
stop_words = set(stopwords.words('english'))
#stop_words

def stop_words_free(text):
    stopwordsfree = ' '.join([i.strip() for i in new_df_pf if i not in stop_words])
    return text

new_df_pf_sf = new_df_pf.apply(stop_words_free)
new_df_pf_sf

0       go until jurong point , crazi .. avail onli in...
1                           ok lar ... joke wif u oni ...
2       free entri in 2 a wkli comp to win fa cup fina...
3       u dun say so earli hor ... u c alreadi then sa...
4       nah i do n't think he goe to usf , he live aro...
                              ...                        
5567    thi is the 2nd time we have tri 2 contact u. u...
5568                   will ì_ b go to esplanad fr home ?
5569    piti , * wa in mood for that . so ... ani othe...
5570    the guy did some bitch but i act like i 'd be ...
5571                            rofl . it true to it name
Name: v2, Length: 5572, dtype: object

# Here preprocessing steps get completed. although there are many other task in preprocessing but in this case only the above implemented techniques are required.

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer

tf=TfidfVectorizer(min_df=0,max_df=1,use_idf=True,ngram_range=(1,3))

tf_data=tf.fit_transform(new_df_pf_sf)
print('Tfidf_test:',tf_data.shape)

Tfidf_test: (5572, 79004)


To train our model we have to split our data into training data and testing data. for this we take help of sklearn library which have train_test_split function which makes our task easy.

In [16]:
x = tf_data
y = df['v1']

from sklearn.model_selection import train_test_split

x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=42)

# now we perform model training

# 1. LOGISTIC REGRESSION

In [17]:
from sklearn.linear_model import LogisticRegression

obj = LogisticRegression()
obj.fit(X=x_train,y=y_train)
lr_result_predict = obj.predict(x_test)
print(lr_result_predict)

['ham' 'ham' 'ham' ... 'ham' 'ham' 'ham']


In [19]:
from sklearn.metrics import accuracy_score,classification_report
print(classification_report(y_test,lr_result_predict))

              precision    recall  f1-score   support

         ham       0.87      1.00      0.93       965
        spam       0.00      0.00      0.00       150

    accuracy                           0.87      1115
   macro avg       0.43      0.50      0.46      1115
weighted avg       0.75      0.87      0.80      1115



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# 2. NAIVE BAYES

In [20]:
from sklearn.naive_bayes import MultinomialNB

nb=MultinomialNB()
nb.fit(x_train,y_train)
nb_predict = nb.predict(x_test)
print(nb_predict)

['ham' 'ham' 'ham' ... 'ham' 'ham' 'ham']


In [21]:
print(classification_report(y_test,nb_predict))

              precision    recall  f1-score   support

         ham       0.87      1.00      0.93       965
        spam       0.00      0.00      0.00       150

    accuracy                           0.86      1115
   macro avg       0.43      0.50      0.46      1115
weighted avg       0.75      0.86      0.80      1115

