In [1]:
import nltk
import pandas as pd
import matplotlib.pyplot as plt



In [2]:
# Load the tab-separated SMS Spam Collection file. Column names are supplied
# explicitly, so the first line of the file is read as data, not as a header.
# The path is a raw string so its backslashes are taken literally instead of
# being parsed as (invalid) escape sequences such as "\M" or "\s".
data = pd.read_csv(
    r'D:\ML_workspace\interview_nlp\Datasets\smsspamcollection\SMSSpamCollection',
    sep='\t',
    names=["label", "message"],
)

In [3]:
# Preview the first rows of the loaded frame.
data.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
# Dimensions of the message column (a 1-D Series, so a single-element tuple).
data['message'].shape

(5572,)

In [5]:
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.corpus import stopwords

In [6]:
# Display the message column; pandas abbreviates the middle rows in the output.
data.message

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                 Will ü b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: message, Length: 5572, dtype: object

In [7]:
# Number of rows in the frame.
len(data)

5572

In [8]:
import re
import numpy as np

# Clean every message: keep letters only, lowercase, drop English stop words,
# lemmatize what remains, and re-join the tokens into one string per message.
lemma = WordNetLemmatizer()

# Build the stop-word collection once, as a set. The original evaluated
# stopwords.words('english') for every token of every message, repeating the
# corpus lookup and a linear list scan each time.
english_stopwords = set(stopwords.words('english'))

corpus = []
for message in data['message']:
    # Every character that is not an ASCII letter becomes a space, so digits
    # and punctuation act as token separators.
    tokens = re.sub('[^a-zA-Z]', ' ', message).lower().split()
    kept = [lemma.lemmatize(token) for token in tokens if token not in english_stopwords]
    corpus.append(' '.join(kept))

### Because this is a large dataset, we will vectorize the text with either Word2Vec or TF-IDF.

In [9]:
# Number of cleaned messages collected in the corpus list.
len(corpus)

5572

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [11]:
# TF-IDF vectorizer limited to at most 2500 features.
tfidf = TfidfVectorizer(max_features=2500)

# Learn the vocabulary from the cleaned corpus and transform it in one pass,
# then expand the result into a dense array.
tfidf_matrix = tfidf.fit_transform(corpus)
X = tfidf_matrix.toarray()

### The labels are the strings 'ham' and 'spam', so we convert them into 0/1 indicator columns using pd.get_dummies

In [12]:
# One-hot encode the label column: one indicator column per distinct label.
y=pd.get_dummies(data['label'])
# Display the dimensions of the encoded frame.
y.shape

(5572, 2)

In [22]:
# Binary target for the spam classifier: 1 where the label is 'spam', 0 otherwise.
# It is derived straight from the label column rather than by slicing the
# previous value of `y`, so the cell can be re-run safely (the earlier
# `y = y.iloc[:, 1].values` failed on a second run because `y` was already a
# NumPy array) and it does not depend on the order of the dummy columns.
y = (data['label'] == 'spam').astype('uint8').values

### Make the train/test split and use Multinomial Naive Bayes to classify messages as spam or not.

In [23]:
from sklearn.model_selection import train_test_split

In [24]:
# Hold out 20% of the samples for testing; the fixed random_state keeps the
# shuffle reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=0,
)

In [25]:
# Inspect the training labels.
y_train

array([0, 0, 0, ..., 1, 0, 0], dtype=uint8)

In [26]:
# Multinomial Naive Bayes classifier trained on the TF-IDF features.

from sklearn.naive_bayes import MultinomialNB

# fit() returns the estimator itself, so construction and training can be chained.
model = MultinomialNB().fit(X_train, y_train)

# Predicted labels for the held-out messages.
y_pred = model.predict(X_test)

y_pred

array([0, 1, 0, ..., 0, 1, 0], dtype=uint8)

In [27]:
from sklearn.metrics import accuracy_score, confusion_matrix

In [28]:
# Compare the predictions against the held-out labels: overall accuracy and
# the confusion matrix.
score = accuracy_score(y_true=y_test, y_pred=y_pred)
matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)

In [29]:
# Report both metrics, one per line.
print(
    'Accuracy-score: {}'.format(score),
    'Matrix : {}'.format(matrix),
    sep='\n',
)

Accuracy-score: 0.979372197309417
Matrix : [[954   1]
 [ 22 138]]


## Applying CountVectorizer

In [31]:
from sklearn.feature_extraction.text import CountVectorizer

In [32]:
# Count vectorizer limited to at most 5200 features.
cv = CountVectorizer(max_features=5200)

# Learn the vocabulary from the cleaned corpus and transform it in one pass,
# then expand the result into a dense array.
count_matrix = cv.fit_transform(corpus)
X_cv = count_matrix.toarray()

In [33]:
# Same 80/20 split and seed as before, this time on the count features.
# Note that this rebinds X_train / X_test / y_train / y_test.
X_train, X_test, y_train, y_test = train_test_split(
    X_cv,
    y,
    test_size=0.20,
    random_state=0,
)

In [34]:
from sklearn.naive_bayes import MultinomialNB

# Train a fresh Multinomial Naive Bayes model on the count features; fit()
# returns the estimator, so construction and training are chained.
model = MultinomialNB().fit(X_train, y_train)

# Predicted labels for the held-out messages.
y_pred = model.predict(X_test)

y_pred

array([0, 1, 0, ..., 0, 1, 0], dtype=uint8)

In [35]:
# Evaluate the count-vectorizer model: overall accuracy and the confusion matrix.
score = accuracy_score(y_true=y_test, y_pred=y_pred)
matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)

# Report both metrics, one per line.
print(
    'Accuracy-score: {}'.format(score),
    'Matrix : {}'.format(matrix),
    sep='\n',
)

Accuracy-score: 0.9820627802690582
Matrix : [[944  11]
 [  9 151]]


In [36]:
# Start from an empty frame; later cells add the actual and predicted labels to it.
final_compare = pd.DataFrame()

In [41]:
# Put the true and predicted test labels side by side for comparison.
final_compare = final_compare.assign(
    y_actual=y_test,
    y_predicted=y_pred,
)

In [45]:
# Remove a stray 'y_Actual' column if one exists. No cell in this notebook
# creates it (only 'y_actual' is assigned), so on a fresh Restart & Run All the
# column is absent and a strict drop raises KeyError; errors='ignore' makes the
# cell a harmless no-op in that case. Reassigning instead of inplace=True keeps
# the cell safe to re-run.
final_compare = final_compare.drop(columns=['y_Actual'], errors='ignore')

In [46]:
# Show up to the first 20 rows of actual vs. predicted labels.
final_compare.head(20)

Unnamed: 0,y_actual,y_predicted
0,0,0
1,1,1
2,0,0
3,0,0
4,0,0
5,0,0
6,1,1
7,0,0
8,0,0
9,0,0


# Make a CSV File

In [49]:
# Write the comparison table to disk without the index column.
# The path is a raw string so its backslashes are taken literally instead of
# being parsed as (invalid) escape sequences such as "\M" or "\S".
final_compare.to_csv(r'D:\ML_workspace\interview_nlp\Spam_classsify.csv', index=False)