# Apply TF-IDF For Spam Messages

## Clean Dataset

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Load the tab-separated SMS file into a DataFrame, supplying the two column
# names ('label', 'message') explicitly.
# NOTE(review): '/content/...' is an absolute, environment-specific path —
# consider reading it from a configurable data directory.
messages = pd.read_csv('/content/SMSSpamCollection.tsv', sep='\t', names=['label', 'message'])

In [4]:
# Preview the first rows to check that the label and message columns loaded.
messages.head()

Unnamed: 0,label,message
0,ham,I've been searching for the right words to tha...
1,spam,Free entry in 2 a wkly comp to win FA Cup fina...
2,ham,"Nah I don't think he goes to usf, he lives aro..."
3,ham,Even my brother is not like to speak with me. ...
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!


In [17]:
import re
import nltk
# Fetch the NLTK stop-word corpus before importing the reader that uses it.
nltk.download('stopwords')
from nltk.corpus import stopwords
# NOTE(review): PorterStemmer is not used in the visible cells (lemmatization
# is used instead) — confirm whether this import is still needed.
from nltk.stem.porter import PorterStemmer

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [18]:
from nltk.stem import WordNetLemmatizer
# Download the WordNet corpus, which WordNetLemmatizer relies on for lookups.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [19]:
# One lemmatizer instance, reused for every token in the cleaning loop.
wordLemmatize = WordNetLemmatizer()

In [20]:
# Build the cleaned corpus: one normalised string per SMS message.
# Each message has non-letters replaced by spaces, is lower-cased, split on
# whitespace, filtered against the English stop words, and lemmatized.

# Load the stop-word list once into a set. Calling stopwords.words('english')
# inside the comprehension filter rebuilds the list and does a linear search
# for every single token; a set gives constant-time membership tests.
english_stopwords = set(stopwords.words('english'))

corpus = []
# Iterate over the column values directly instead of messages['message'][i],
# which is a label lookup and only works while the index is the default 0..n-1.
for raw_text in messages['message']:
    tokens = re.sub('[^a-zA-Z]', ' ', raw_text).lower().split()
    lemmas = [
        wordLemmatize.lemmatize(token)
        for token in tokens
        if token not in english_stopwords
    ]
    corpus.append(' '.join(lemmas))

In [21]:
# Inspect the first ten cleaned messages.
corpus[:10]

['searching right word thank breather promise wont take help granted fulfil promise wonderful blessing time',
 'free entry wkly comp win fa cup final tkts st may text fa receive entry question std txt rate c apply',
 'nah think go usf life around though',
 'even brother like speak treat like aid patent',
 'date sunday',
 'per request melle melle oru minnaminunginte nurungu vettam set callertune caller press copy friend callertune',
 'winner valued network customer selected receivea prize reward claim call claim code kl valid hour',
 'mobile month u r entitled update latest colour mobile camera free call mobile update co free',
 'gonna home soon want talk stuff anymore tonight k cried enough today',
 'six chance win cash pound txt csh send cost p day day tsandcs apply reply hl info']

## Create TF-IDF And N-Grams

In [22]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [23]:
# TF-IDF vectorizer with the default n-gram range; max_features=1000 caps the
# vocabulary at the 1000 terms with the highest frequency across the corpus.
tfIdf= TfidfVectorizer(max_features=1000)

In [24]:
# Learn the vocabulary and IDF weights from the corpus, then convert the
# resulting document-term matrix to a dense NumPy array.
X= tfIdf.fit_transform(corpus).toarray()

In [25]:
# Display the dense TF-IDF matrix (NumPy abbreviates large arrays).
X

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

### Apply Ngram Parameter

In [26]:
# Refit using bigrams only: ngram_range=(2,2) sets both the minimum and
# maximum n-gram length to 2.
# NOTE(review): this rebinds tfIdf and X, so the values produced by the earlier
# vectorizer cells are no longer reachable after this cell runs.
tfIdf = TfidfVectorizer(max_features=1000, ngram_range=(2,2))
X= tfIdf.fit_transform(corpus).toarray()

In [27]:
# Mapping from each learned bigram to its feature (column) index.
tfIdf.vocabulary_

{'free entry': np.int64(248),
 'entry wkly': np.int64(214),
 'wkly comp': np.int64(972),
 'cup final': np.int64(171),
 'std txt': np.int64(773),
 'txt rate': np.int64(855),
 'rate apply': np.int64(659),
 'per request': np.int64(617),
 'set callertune': np.int64(718),
 'callertune caller': np.int64(88),
 'caller press': np.int64(86),
 'press copy': np.int64(650),
 'copy friend': np.int64(160),
 'friend callertune': np.int64(261),
 'claim call': np.int64(110),
 'call claim': np.int64(61),
 'claim code': np.int64(111),
 'entitled update': np.int64(212),
 'update latest': np.int64(871),
 'latest colour': np.int64(424),
 'free call': np.int64(244),
 'call mobile': np.int64(74),
 'mobile update': np.int64(525),
 'update co': np.int64(870),
 'co free': np.int64(122),
 'chance win': np.int64(107),
 'win cash': np.int64(961),
 'reply hl': np.int64(674),
 'hl info': np.int64(383),
 'week free': np.int64(944),
 'txt word': np.int64(858),
 'dont miss': np.int64(198),
 'ha ha': np.int64(359),
 'let