In [1]:
# importing basic libraries

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# importing our dataset
# NOTE(review): the preview below shows an "Unnamed: 0" column, which suggests the
# CSV was written with its index; read_csv(..., index_col=0) would likely avoid
# that extra column — confirm against the file before changing.

messages=pd.read_csv('SMSSpamCollection.csv')
# Last expression of the cell: renders a preview of the first rows.
messages.head()

Unnamed: 0,Label,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


### Data Cleaning and Preprocessing

In [3]:
import re
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

In [4]:
# One WordNet lemmatizer instance, reused for every token in the preprocessing loop.
lemmatizer=WordNetLemmatizer()

In [6]:
# Build the cleaned corpus: one normalised, space-joined string per SMS message.
#
# The stop-word collection is built once, as a set. The original evaluated
# stopwords.words() inside the comprehension, i.e. once per token of every
# message, and then did a linear scan of the resulting list for each lookup.
# NOTE(review): stopwords.words() with no argument returns the stop words of
# every language NLTK ships. That is kept unchanged so the recorded outputs stay
# valid, but stopwords.words('english') is probably what was intended — confirm.
stop_words = set(stopwords.words())

corpus = []
# Iterate over the column values directly instead of label-indexing with
# messages['Message'][i], which only works while the index is exactly 0..n-1.
for message in messages['Message']:
    # Replace everything that is not an ASCII letter with a space, then
    # lowercase and split on whitespace.
    tokens = re.sub('[^a-zA-Z]', ' ', message).lower().split()
    # Drop stop words and lemmatize what is left.
    review = ' '.join(
        lemmatizer.lemmatize(word) for word in tokens if word not in stop_words
    )
    corpus.append(review)

In [9]:
# Bag-of-words model: a count vectorizer whose vocabulary is capped at 100 features.
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(max_features=100)

In [10]:
# Fit the count vectorizer on the cleaned corpus and keep the result as a dense array.
X=cv.fit_transform(corpus).toarray()

In [11]:
# Inspect the fitted vocabulary (term -> integer feature index, as shown in the output).
cv.vocabulary_

{'great': np.int64(25),
 'free': np.int64(19),
 'win': np.int64(94),
 'st': np.int64(76),
 'text': np.int64(78),
 'txt': np.int64(86),
 'dun': np.int64(15),
 'life': np.int64(39),
 'hey': np.int64(33),
 'week': np.int64(93),
 'word': np.int64(96),
 'back': np.int64(2),
 'send': np.int64(68),
 'friend': np.int64(20),
 'customer': np.int64(12),
 'prize': np.int64(66),
 'claim': np.int64(8),
 'call': np.int64(5),
 'hour': np.int64(36),
 'mobile': np.int64(49),
 'month': np.int64(51),
 'co': np.int64(9),
 'gonna': np.int64(24),
 'home': np.int64(34),
 'tonight': np.int64(85),
 'today': np.int64(82),
 'cash': np.int64(6),
 'day': np.int64(13),
 'reply': np.int64(67),
 'urgent': np.int64(89),
 'www': np.int64(98),
 'time': np.int64(81),
 'message': np.int64(45),
 'make': np.int64(43),
 'fine': np.int64(18),
 'feel': np.int64(16),
 'miss': np.int64(48),
 'ur': np.int64(88),
 'meet': np.int64(44),
 'love': np.int64(41),
 'amp': np.int64(0),
 'work': np.int64(97),
 'wait': np.int64(90),
 'uk': 

In [14]:
# Unigrams + bigrams: ngram_range=(1, 2) keeps single words as well as word
# pairs, with the vocabulary capped at 200 features.
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(ngram_range=(1, 2), max_features=200)
X = cv.fit_transform(corpus)
cv.vocabulary_

{'point': np.int64(132),
 'great': np.int64(58),
 'world': np.int64(195),
 'lar': np.int64(82),
 'free': np.int64(50),
 'win': np.int64(190),
 'st': np.int64(161),
 'text': np.int64(167),
 'receive': np.int64(143),
 'question': np.int64(138),
 'txt': np.int64(176),
 'rate': np.int64(139),
 'apply': np.int64(4),
 'dun': np.int64(40),
 'early': np.int64(42),
 'life': np.int64(88),
 'hey': np.int64(69),
 'week': np.int64(188),
 'word': np.int64(193),
 'back': np.int64(8),
 'fun': np.int64(52),
 'xxx': np.int64(198),
 'send': np.int64(149),
 'speak': np.int64(159),
 'friend': np.int64(51),
 'network': np.int64(112),
 'customer': np.int64(34),
 'prize': np.int64(135),
 'claim': np.int64(26),
 'call': np.int64(18),
 'hour': np.int64(73),
 'mobile': np.int64(107),
 'month': np.int64(109),
 'latest': np.int64(84),
 'camera': np.int64(20),
 'co': np.int64(28),
 'gonna': np.int64(57),
 'home': np.int64(71),
 'talk': np.int64(166),
 'stuff': np.int64(164),
 'tonight': np.int64(174),
 'today': np.

In [15]:
# Bigrams + trigrams only: ngram_range=(2, 3) excludes single words; the
# vocabulary is capped at 500 features.
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(ngram_range=(2, 3), max_features=500)
X = cv.fit_transform(corpus)
cv.vocabulary_

{'free entry': np.int64(154),
 'rate apply': np.int64(330),
 'claim call': np.int64(77),
 'call claim': np.int64(33),
 'claim code': np.int64(78),
 'call claim code': np.int64(34),
 'entitled update': np.int64(138),
 'update latest': np.int64(431),
 'latest colour': np.int64(217),
 'free call': np.int64(150),
 'call mobile': np.int64(46),
 'mobile update': np.int64(267),
 'entitled update latest': np.int64(139),
 'update latest colour': np.int64(432),
 'free call mobile': np.int64(151),
 'call mobile update': np.int64(47),
 'chance win': np.int64(75),
 'win cash': np.int64(479),
 'reply hl': np.int64(336),
 'hl info': np.int64(191),
 'chance win cash': np.int64(76),
 'reply hl info': np.int64(337),
 'txt word': np.int64(425),
 'mobile charged': np.int64(262),
 'call reply': np.int64(53),
 'nokia mobile': np.int64(280),
 'mobile free': np.int64(263),
 'free camcorder': np.int64(152),
 'delivery tomorrow': np.int64(126),
 'lt gt': np.int64(234),
 'missed call': np.int64(259),
 'sm ac': n

### Creating our TF-IDF Model

In [17]:
# TF-IDF model over the cleaned corpus, capped at 100 features; the result is
# converted to a dense array.
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(max_features=100)
X = tfidf.fit_transform(corpus).toarray()

In [20]:
# Compact array display: 30 items at each edge before eliding, a very wide line
# limit so rows are not wrapped, and floats printed with three significant digits.
import numpy as np

np.set_printoptions(
    edgeitems=30,
    linewidth=100000,
    formatter={'float': lambda value: "%.3g" % value},
)

In [21]:
# Display the dense TF-IDF array using the print options configured above.
X

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.387, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0.529, 0, 0.401, 0, 0, 0, 0, 0, 0, 0, 0.418, 0, 0, 0, 0, 0, 0, 0, 0.485, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0,

In [22]:
# TF-IDF over bigrams only (ngram_range=(2, 2)), capped at 100 features; the
# result is converted to a dense array.
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(ngram_range=(2, 2), max_features=100)
X = tfidf.fit_transform(corpus).toarray()

In [23]:
# Inspect the fitted bigram vocabulary (term -> integer feature index, as shown in the output).
tfidf.vocabulary_

{'free entry': np.int64(33),
 'claim call': np.int64(18),
 'call claim': np.int64(5),
 'claim code': np.int64(19),
 'free call': np.int64(32),
 'chance win': np.int64(17),
 'txt word': np.int64(86),
 'mobile free': np.int64(54),
 'lt gt': np.int64(53),
 'sm ac': np.int64(77),
 'sorry call': np.int64(78),
 'ur awarded': np.int64(87),
 'call free': np.int64(7),
 'call customer': np.int64(6),
 'customer service': np.int64(25),
 'guaranteed cash': np.int64(40),
 'cash prize': np.int64(16),
 'draw show': np.int64(29),
 'show prize': np.int64(75),
 'prize guaranteed': np.int64(66),
 'guaranteed call': np.int64(39),
 'valid hr': np.int64(97),
 'selected receive': np.int64(72),
 'private account': np.int64(64),
 'account statement': np.int64(0),
 'statement show': np.int64(80),
 'call identifier': np.int64(8),
 'identifier code': np.int64(48),
 'code expires': np.int64(23),
 'urgent mobile': np.int64(96),
 'bonus caller': np.int64(4),
 'caller prize': np.int64(13),
 'call landline': np.int64(1

In [24]:
# Display the dense bigram TF-IDF array.
X

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0