In [1]:
!pip install nltk



In [2]:
#importing necessary packages
import numpy as np
import pandas as pd

# Loading the DataSet

In [3]:
#UTF-8 is a multibyte encoding that can represent any Unicode character. ISO 8859-1 is a single-byte encoding that can 
#represent the first 256 Unicode characters. Both encode ASCII exactly the same way.
dt=pd.read_csv("spamx.csv",encoding='ISO-8859-1')
dt.head()

Unnamed: 0,type,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
#adding a new column to the dataset,to identify which is spam or ham based on that number 
dt['spam']=dt['type'].map({'spam':1,'ham':0})
dt.head()

Unnamed: 0,type,text,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [5]:
len(dt['spam'])

5572

# Preprocessing Dataset

# Tokenization

In [6]:
dt['text'][1]

'Ok lar... Joking wif u oni...'

In [7]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\saive\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [8]:
#tokenization is spilting of sentences into words to classify them easily
def tokenize_data(text):
    nltk_tokens=nltk.word_tokenize(text)
    return nltk_tokens

In [9]:
dt['text']=dt['text'].apply(tokenize_data)

In [10]:
dt['text'][1]

['Ok', 'lar', '...', 'Joking', 'wif', 'u', 'oni', '...']

# Stemming

In [11]:
from nltk.stem.snowball import SnowballStemmer
porter=SnowballStemmer("english")

In [12]:
#Stemming is used to remove the suffix and find the root word
#waiting,waits,waiter->wait
def stem_it(text):
    return [porter.stem(word) for word in text]

In [13]:
dt['text']=dt['text'].apply(stem_it)


In [14]:
dt['text'][1]

['ok', 'lar', '...', 'joke', 'wif', 'u', 'oni', '...']

# Lemmitization

In [15]:
dt['text'][152]

['ok', '...', 'ur', 'typic', 'repli', '...']

In [16]:
from nltk.stem import WordNetLemmatizer
lemmatizer=WordNetLemmatizer()

In [17]:
#lemmitization removes inflectional endings returning the base or dictionary form of a word
#is,are,was,an->be
def lemmatizing_it(text):
    return [lemmatizer.lemmatize(word) for word in text] 

In [18]:
dt['text']=dt['text'].apply(lemmatizing_it)

In [19]:
dt['text'][152]

['ok', '...', 'ur', 'typic', 'repli', '...']

# Stopwords Removal

In [20]:
dt['text'][217]

['easi', 'ah', '?', 'sen', 'got', 'select', 'mean', 'it', 'good', '..']

In [21]:
from nltk.corpus import stopwords
stop=stopwords.words('english')

In [22]:
#stopwords removal is used to remove unneccesary words and very little meaning words  from the list.
def stop_it(text):
    review=[word for word in text if not word in stop]
    return review

In [23]:
dt['text']=dt['text'].apply(stop_it)

In [24]:
dt['text'][217]

['easi', 'ah', '?', 'sen', 'got', 'select', 'mean', 'good', '..']

In [25]:
dt.head()

Unnamed: 0,type,text,spam
0,ham,"[go, jurong, point, ,, crazi, .., avail, onli,...",0
1,ham,"[ok, lar, ..., joke, wif, u, oni, ...]",0
2,spam,"[free, entri, 2, wkli, comp, win, fa, cup, fin...",1
3,ham,"[u, dun, say, earli, hor, ..., u, c, alreadi, ...",0
4,ham,"[nah, n't, think, goe, usf, ,, live, around, t...",0


In [26]:
dt['text']=dt['text'].apply(' '.join)

In [27]:
dt.head()

Unnamed: 0,type,text,spam
0,ham,"go jurong point , crazi .. avail onli bugi n g...",0
1,ham,ok lar ... joke wif u oni ...,0
2,spam,free entri 2 wkli comp win fa cup final tkts 2...,1
3,ham,u dun say earli hor ... u c alreadi say ...,0
4,ham,"nah n't think goe usf , live around though",0


# Transforming text data into TF-IDF vectors

In [None]:
#Vectorization is a technique used to convert textual data to numerical format.
#Using Vectorization,a matrix is created where each column represents a feature and each
#row represents an individual reviews.

In [30]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf=TfidfVectorizer()
y=dt.spam.values
x=tfidf.fit_transform(dt['text'])

In [34]:
#Spliting the data into train and test set
from sklearn.model_selection import train_test_split
X_train,x_test,Y_train,y_test=train_test_split(x,y,random_state=1)

# Classification using Logistic Regression

In [35]:
from sklearn.linear_model import LogisticRegression
clf=LogisticRegression()
clf.fit(X_train,Y_train)
y_pred=clf.predict(x_test)
from sklearn.metrics import accuracy_score
acc=accuracy_score(y_pred,y_test)*100
print(acc)

96.4824120603015
