In [1]:
import nltk
import numpy as np
import pandas as pd

In [2]:
# Read the spam/ham message export from CSV into a DataFrame.
csv_path = 'SPAM-210331-134237.csv'
dt = pd.read_csv(csv_path)

# Preview the first ten rows.
dt.head(n=10)

Unnamed: 0,type,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,ham,Even my brother is not like to speak with me. ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...


In [3]:
# Label encoding: derive an integer target column from 'type'
# (1 for 'spam', 0 for 'ham').
label_codes = {'spam':1, 'ham':0}
dt['spam'] = dt['type'].map(label_codes).astype(int)

# Preview the frame, which now carries the extra 'spam' column.
dt.head()

Unnamed: 0,type,text,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [None]:
# Heading followed by every column name, one per line.
print('Columns in the given data:', *dt.columns, sep='\n')

In [None]:
# Row counts for the two source columns. len() of a Series counts every row,
# so both numbers equal the number of rows in the frame.
# The labels now name the columns that are actually measured ('type' and
# 'text'); the previous wording referred to columns this dataset does not have.
type_len = len(dt['type'])
print('Number of rows in the type column:', type_len)

text_len = len(dt['text'])
print('Number of rows in the text column:', text_len)

## 2. Tokenization

In [None]:
dt.loc[1, 'text']  # row 1 as a raw string, before tokenization

In [None]:
def tokenizer(text):
    """Split a message into whitespace-delimited tokens.

    Parameters
    ----------
    text : str
        Raw message text.

    Returns
    -------
    list of str
        The tokens in their original order. Splitting is on runs of
        whitespace only, so punctuation stays attached to the words.
    """
    tokens = text.split()
    return tokens

In [None]:
# Replace each message string with its list of tokens.
dt['text'] = dt['text'].map(tokenizer)

In [None]:
dt.loc[1, 'text']  # row 1 after tokenization

## 3. Stemming

In [None]:
dt.loc[1, 'text']  # row 1 before stemming

In [None]:
from nltk.stem.snowball import SnowballStemmer

# English Snowball stemmer that also stems stopwords (ignore_stopwords=False).
# Note: despite the variable name, this is a SnowballStemmer instance, not
# NLTK's PorterStemmer; the name is kept because stem_it() refers to it.
porter = SnowballStemmer(language='english', ignore_stopwords=False)

In [None]:
def stem_it(text):
    """Stem every token of a tokenized message.

    Parameters
    ----------
    text : iterable of str
        Tokens of one message.

    Returns
    -------
    list of str
        One stem per input token, in the same order, produced by the
        module-level `porter` stemmer.
    """
    return list(map(porter.stem, text))

In [None]:
# Replace each token list with the list of its stems.
dt['text'] = dt['text'].map(stem_it)

In [None]:
dt.loc[1, 'text']  # row 1 after stemming

## 4. Lemmatization

In [None]:
dt.loc[92, 'text']  # row 92 before lemmatization

In [None]:
from nltk.stem import WordNetLemmatizer
# Single shared lemmatizer instance; lemmatize_it() calls it for every token.
lemmatizer = WordNetLemmatizer()

In [None]:
def lemmatize_it(text):
    """Lemmatize every token of a tokenized message.

    Parameters
    ----------
    text : iterable of str
        Tokens of one message.

    Returns
    -------
    list of str
        One lemma per input token, in the same order.
    """
    lemmas = []
    for token in text:
        # pos='a' asks WordNet to treat each token as an adjective.
        lemmas.append(lemmatizer.lemmatize(token, pos='a'))
    return lemmas

In [None]:
# Fetch the NLTK 'wordnet' resource before the lemmatizer is applied to the data.
nltk.download('wordnet')

In [None]:
# Replace each token list with the list of its lemmas.
dt['text'] = dt['text'].map(lemmatize_it)

In [None]:
dt.loc[92, 'text']  # row 92 after lemmatization

## 5. Stopword Removal

In [None]:
dt.loc[34, 'text']  # row 34 before stopword removal

In [None]:
# Fetch the NLTK 'stopwords' resource that the next cell reads.
nltk.download('stopwords')

In [None]:
from nltk.corpus import stopwords
# English stopword list from NLTK; stop_it() filters tokens against it.
stop_words = stopwords.words('english')

In [None]:
def stop_it(text):
    """Drop stopwords from a tokenized message.

    Parameters
    ----------
    text : iterable of str
        Tokens of one message.

    Returns
    -------
    list of str
        The tokens that are not in the module-level `stop_words`, in their
        original order. Matching is exact, so it is case-sensitive.
    """
    kept = []
    for token in text:
        if token in stop_words:
            continue
        kept.append(token)
    return kept

In [None]:
# Replace each token list with its stopword-free version.
dt['text'] = dt['text'].map(stop_it)

In [None]:
dt.loc[34, 'text']  # row 34 after stopword removal

In [None]:
# First ten rows after all preprocessing steps.
dt.head(n=10)

In [None]:
# 6. Feature extraction (TF-IDF) and train/test split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# After the preprocessing cells each entry of dt['text'] is a list of tokens,
# but TfidfVectorizer's default analyzer lower-cases and tokenizes raw strings
# and fails on a list. Join the tokens back into one string per message first;
# entries that are still plain strings are passed through unchanged.
corpus = dt['text'].apply(
    lambda tokens: tokens if isinstance(tokens, str) else ' '.join(tokens)
)

tfidf = TfidfVectorizer()
y = dt.spam.values
x = tfidf.fit_transform(corpus)

# The classifier cells read x_train / x_test / y_train / y_test, so the
# hold-out split is created here: 20% test, stratified on the label so both
# parts keep the same spam ratio, with a fixed seed for reproducible scores.
# NOTE(review): the IDF weights are fitted on all rows before splitting; fit
# the vectorizer on the training part only if strict train/test isolation is
# required.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=0, stratify=y
)

## 7. Classification using Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Fit a logistic-regression classifier on the training features and predict
# labels for the held-out features. Expects x_train, y_train, x_test and
# y_test to have been created by an earlier cell.
clf = LogisticRegression()
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)

# accuracy_score is documented as accuracy_score(y_true, y_pred). Accuracy is
# symmetric, so the value is unchanged, but the documented order keeps this
# line correct if it is swapped for an asymmetric metric (precision, recall).
acc_log = accuracy_score(y_test, y_pred)*100
print("Accuracy:", acc_log)

## 8. Classification using LinearSVC

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.svm import LinearSVC

# Fit a linear support-vector classifier (fixed seed) on the training features
# and predict labels for the held-out features. Expects x_train, y_train,
# x_test and y_test to have been created by an earlier cell.
linear_svc = LinearSVC(random_state = 0)
linear_svc.fit(x_train, y_train)
y_pred = linear_svc.predict(x_test)

# Arguments follow the documented accuracy_score(y_true, y_pred) order; the
# import above lets this cell run without relying on another cell's import.
acc_linear_svc = accuracy_score(y_test, y_pred)*100
print("Accuracy:", acc_linear_svc)