In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline

import nltk

# Load Text

In [2]:
sample_txt = """Britain's communications intelligence agency GCHQ has issued a statement
denying it wiretapped Donald Trump during the US presidential campaign. The
unusual move by the agency came after White House Press Secretary Sean
Spicer cited claims first made on US TV channel Fox News earlier this week.
GCHQ responded by saying the allegations were "nonsense, utterly ridiculous
and should be ignored". The claims Of GCHQ involvement Were initially made
by former judge Andrew Napolitano. Mr Spicer quoted Mr Napolitano as
saying: "Three intelligence Sources have informed Fox News that president
Obama went outside the chain of command. "
"""

# Transform to Lowercase

In [3]:
sample_txt = sample_txt.lower()
sample_txt

'britain\'s communications intelligence agency gchq has issued a statement\ndenying it wiretapped donald trump during the us presidential campaign. the\nunusual move by the agency came after white house press secretary sean\nspicer cited claims first made on us tv channel fox news earlier this week.\ngchq responded by saying the allegations were "nonsense, utterly ridiculous\nand should be ignored". the claims of gchq involvement were initially made\nby former judge andrew napolitano. mr spicer quoted mr napolitano as\nsaying: "three intelligence sources have informed fox news that president\nobama went outside the chain of command. "\n'

# Remove Numbers

In [4]:
import re

sample_txt = re.sub(r'\d+', '', sample_txt)
sample_txt

'britain\'s communications intelligence agency gchq has issued a statement\ndenying it wiretapped donald trump during the us presidential campaign. the\nunusual move by the agency came after white house press secretary sean\nspicer cited claims first made on us tv channel fox news earlier this week.\ngchq responded by saying the allegations were "nonsense, utterly ridiculous\nand should be ignored". the claims of gchq involvement were initially made\nby former judge andrew napolitano. mr spicer quoted mr napolitano as\nsaying: "three intelligence sources have informed fox news that president\nobama went outside the chain of command. "\n'

# Tokenize

## Sentence Tokenize

In [5]:
from nltk import sent_tokenize
sample_sentences = sent_tokenize(sample_txt)
print(sample_sentences[0])

britain's communications intelligence agency gchq has issued a statement
denying it wiretapped donald trump during the us presidential campaign.


## Word Tokenize

In [6]:
from nltk import word_tokenize
sample_words = word_tokenize(sample_txt)
print(sample_words)

['britain', "'s", 'communications', 'intelligence', 'agency', 'gchq', 'has', 'issued', 'a', 'statement', 'denying', 'it', 'wiretapped', 'donald', 'trump', 'during', 'the', 'us', 'presidential', 'campaign', '.', 'the', 'unusual', 'move', 'by', 'the', 'agency', 'came', 'after', 'white', 'house', 'press', 'secretary', 'sean', 'spicer', 'cited', 'claims', 'first', 'made', 'on', 'us', 'tv', 'channel', 'fox', 'news', 'earlier', 'this', 'week', '.', 'gchq', 'responded', 'by', 'saying', 'the', 'allegations', 'were', '``', 'nonsense', ',', 'utterly', 'ridiculous', 'and', 'should', 'be', 'ignored', "''", '.', 'the', 'claims', 'of', 'gchq', 'involvement', 'were', 'initially', 'made', 'by', 'former', 'judge', 'andrew', 'napolitano', '.', 'mr', 'spicer', 'quoted', 'mr', 'napolitano', 'as', 'saying', ':', '``', 'three', 'intelligence', 'sources', 'have', 'informed', 'fox', 'news', 'that', 'president', 'obama', 'went', 'outside', 'the', 'chain', 'of', 'command.', '``']


# Remove Punctuation

In [7]:
import string

print(string.punctuation)

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [8]:
table = str.maketrans('', '', string.punctuation)
sample_words = [str(w).translate(table) for w in sample_words]
print(sample_words)

['britain', 's', 'communications', 'intelligence', 'agency', 'gchq', 'has', 'issued', 'a', 'statement', 'denying', 'it', 'wiretapped', 'donald', 'trump', 'during', 'the', 'us', 'presidential', 'campaign', '', 'the', 'unusual', 'move', 'by', 'the', 'agency', 'came', 'after', 'white', 'house', 'press', 'secretary', 'sean', 'spicer', 'cited', 'claims', 'first', 'made', 'on', 'us', 'tv', 'channel', 'fox', 'news', 'earlier', 'this', 'week', '', 'gchq', 'responded', 'by', 'saying', 'the', 'allegations', 'were', '', 'nonsense', '', 'utterly', 'ridiculous', 'and', 'should', 'be', 'ignored', '', '', 'the', 'claims', 'of', 'gchq', 'involvement', 'were', 'initially', 'made', 'by', 'former', 'judge', 'andrew', 'napolitano', '', 'mr', 'spicer', 'quoted', 'mr', 'napolitano', 'as', 'saying', '', '', 'three', 'intelligence', 'sources', 'have', 'informed', 'fox', 'news', 'that', 'president', 'obama', 'went', 'outside', 'the', 'chain', 'of', 'command', '']


# Filter Stop Words

In [9]:
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
print(stopwords)

<WordListCorpusReader in 'C:\\Users\\Abdullah\\AppData\\Roaming\\nltk_data\\corpora\\stopwords'>


In [10]:
stop_words = set(stopwords.words('english'))
sample_words = [w for w in sample_words if not w in stop_words]
print(sample_words)

['britain', 'communications', 'intelligence', 'agency', 'gchq', 'issued', 'statement', 'denying', 'wiretapped', 'donald', 'trump', 'us', 'presidential', 'campaign', '', 'unusual', 'move', 'agency', 'came', 'white', 'house', 'press', 'secretary', 'sean', 'spicer', 'cited', 'claims', 'first', 'made', 'us', 'tv', 'channel', 'fox', 'news', 'earlier', 'week', '', 'gchq', 'responded', 'saying', 'allegations', '', 'nonsense', '', 'utterly', 'ridiculous', 'ignored', '', '', 'claims', 'gchq', 'involvement', 'initially', 'made', 'former', 'judge', 'andrew', 'napolitano', '', 'mr', 'spicer', 'quoted', 'mr', 'napolitano', 'saying', '', '', 'three', 'intelligence', 'sources', 'informed', 'fox', 'news', 'president', 'obama', 'went', 'outside', 'chain', 'command', '']


# Stem Words - Root Form (Might have no meaning)

In [11]:
from nltk.stem.porter import PorterStemmer
porter = PorterStemmer()
stem_words = [porter.stem(word) for word in sample_words]
print(stem_words)

['britain', 'commun', 'intellig', 'agenc', 'gchq', 'issu', 'statement', 'deni', 'wiretap', 'donald', 'trump', 'us', 'presidenti', 'campaign', '', 'unusu', 'move', 'agenc', 'came', 'white', 'hous', 'press', 'secretari', 'sean', 'spicer', 'cite', 'claim', 'first', 'made', 'us', 'tv', 'channel', 'fox', 'news', 'earlier', 'week', '', 'gchq', 'respond', 'say', 'alleg', '', 'nonsens', '', 'utterli', 'ridicul', 'ignor', '', '', 'claim', 'gchq', 'involv', 'initi', 'made', 'former', 'judg', 'andrew', 'napolitano', '', 'mr', 'spicer', 'quot', 'mr', 'napolitano', 'say', '', '', 'three', 'intellig', 'sourc', 'inform', 'fox', 'news', 'presid', 'obama', 'went', 'outsid', 'chain', 'command', '']


# Lemmatization (Returns proper words)

In [12]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

lemm_words = []
for word in sample_words:
    print(lemmatizer.lemmatize(word))
    lemm_words.append(lemmatizer.lemmatize(word))
    lemm_words.append(" ")
    
print(lemm_words)

britain
communication
intelligence
agency
gchq
issued
statement
denying
wiretapped
donald
trump
u
presidential
campaign

unusual
move
agency
came
white
house
press
secretary
sean
spicer
cited
claim
first
made
u
tv
channel
fox
news
earlier
week

gchq
responded
saying
allegation

nonsense

utterly
ridiculous
ignored


claim
gchq
involvement
initially
made
former
judge
andrew
napolitano

mr
spicer
quoted
mr
napolitano
saying


three
intelligence
source
informed
fox
news
president
obama
went
outside
chain
command

['britain', ' ', 'communication', ' ', 'intelligence', ' ', 'agency', ' ', 'gchq', ' ', 'issued', ' ', 'statement', ' ', 'denying', ' ', 'wiretapped', ' ', 'donald', ' ', 'trump', ' ', 'u', ' ', 'presidential', ' ', 'campaign', ' ', '', ' ', 'unusual', ' ', 'move', ' ', 'agency', ' ', 'came', ' ', 'white', ' ', 'house', ' ', 'press', ' ', 'secretary', ' ', 'sean', ' ', 'spicer', ' ', 'cited', ' ', 'claim', ' ', 'first', ' ', 'made', ' ', 'u', ' ', 'tv', ' ', 'channel', ' ', 'fox'

# Bag of Words Model (Stand-alone words)

## Count Vectorizer (Does not give more weightage to imp words)

In [13]:
# Takes care of removing stopwords, lemmatization etc.
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()

vectorizer.fit(sample_sentences)

#print(vectorizer.vocabulary_)
print({k: v for k, v in sorted(vectorizer.vocabulary_.items(), key=lambda item: item[1])}) # Sorted
#print(vectorizer.get_feature_names())

vector = vectorizer.transform(sample_sentences)

print(vector.shape)
print(type(vector))
print(vector.toarray())

{'after': 0, 'agency': 1, 'allegations': 2, 'and': 3, 'andrew': 4, 'as': 5, 'be': 6, 'britain': 7, 'by': 8, 'came': 9, 'campaign': 10, 'chain': 11, 'channel': 12, 'cited': 13, 'claims': 14, 'command': 15, 'communications': 16, 'denying': 17, 'donald': 18, 'during': 19, 'earlier': 20, 'first': 21, 'former': 22, 'fox': 23, 'gchq': 24, 'has': 25, 'have': 26, 'house': 27, 'ignored': 28, 'informed': 29, 'initially': 30, 'intelligence': 31, 'involvement': 32, 'issued': 33, 'it': 34, 'judge': 35, 'made': 36, 'move': 37, 'mr': 38, 'napolitano': 39, 'news': 40, 'nonsense': 41, 'obama': 42, 'of': 43, 'on': 44, 'outside': 45, 'president': 46, 'presidential': 47, 'press': 48, 'quoted': 49, 'responded': 50, 'ridiculous': 51, 'saying': 52, 'sean': 53, 'secretary': 54, 'should': 55, 'sources': 56, 'spicer': 57, 'statement': 58, 'that': 59, 'the': 60, 'this': 61, 'three': 62, 'trump': 63, 'tv': 64, 'unusual': 65, 'us': 66, 'utterly': 67, 'week': 68, 'went': 69, 'were': 70, 'white': 71, 'wiretapped': 7

## Testing

In [14]:
test_sentence = ["""There are rumours that Sean Spicer has been involved in utterly ridiculous activities - Bill Spicer"""]

# Vectorizer already fitted on our previous sample text
vector = vectorizer.transform(test_sentence)
print(vector.toarray())

[[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 2 0 1 0 0 0 0 0 0 0 1 0 0 0 0
  0]]


# TF - IDF Vectorizer (Freq of word in sentence X freq in other sentences)

## *Best way*

In [15]:
test_sentence = ["""There are rumours that Sean Spicer has been involved in utterly ridiculous activities - Bill Spicer"""]

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()

vectorizer.fit(sample_sentences)

#print(vectorizer.vocabulary_)
print({k: v for k, v in sorted(vectorizer.vocabulary_.items(), key=lambda item: item[1])}) # Sorted

print("\n")

print(vectorizer.idf_)

vector = vectorizer.transform(test_sentence)

print("Vector shape \n", vector.shape)
print(vector.toarray())

{'after': 0, 'agency': 1, 'allegations': 2, 'and': 3, 'andrew': 4, 'as': 5, 'be': 6, 'britain': 7, 'by': 8, 'came': 9, 'campaign': 10, 'chain': 11, 'channel': 12, 'cited': 13, 'claims': 14, 'command': 15, 'communications': 16, 'denying': 17, 'donald': 18, 'during': 19, 'earlier': 20, 'first': 21, 'former': 22, 'fox': 23, 'gchq': 24, 'has': 25, 'have': 26, 'house': 27, 'ignored': 28, 'informed': 29, 'initially': 30, 'intelligence': 31, 'involvement': 32, 'issued': 33, 'it': 34, 'judge': 35, 'made': 36, 'move': 37, 'mr': 38, 'napolitano': 39, 'news': 40, 'nonsense': 41, 'obama': 42, 'of': 43, 'on': 44, 'outside': 45, 'president': 46, 'presidential': 47, 'press': 48, 'quoted': 49, 'responded': 50, 'ridiculous': 51, 'saying': 52, 'sean': 53, 'secretary': 54, 'should': 55, 'sources': 56, 'spicer': 57, 'statement': 58, 'that': 59, 'the': 60, 'this': 61, 'three': 62, 'trump': 63, 'tv': 64, 'unusual': 65, 'us': 66, 'utterly': 67, 'week': 68, 'went': 69, 'were': 70, 'white': 71, 'wiretapped': 7

# Hash Vectorizer (Two words might be hashed to same number)

In [17]:
from sklearn.feature_extraction.text import HashingVectorizer

vectorizer = HashingVectorizer(n_features=50)

vector = vectorizer.transform(sample_sentences)

print(vector.shape)
print(vector.toarray())

vectorizer

(5, 50)
[[ 0.          0.          0.          0.          0.          0.
   0.          0.         -0.25        0.25       -0.25        0.
   0.25        0.          0.          0.          0.          0.25
   0.25        0.          0.          0.25        0.          0.25
   0.          0.          0.          0.          0.         -0.25
   0.          0.          0.25       -0.25        0.          0.
   0.          0.25        0.         -0.25        0.          0.
  -0.25        0.25        0.          0.         -0.25        0.
   0.          0.        ]
 [ 0.          0.          0.          0.         -0.20851441 -0.20851441
   0.          0.          0.         -0.20851441  0.20851441 -0.41702883
  -0.20851441  0.          0.20851441 -0.20851441  0.          0.
   0.          0.         -0.20851441  0.          0.          0.
   0.         -0.20851441  0.          0.         -0.20851441  0.
  -0.20851441 -0.20851441  0.20851441  0.          0.         -0.20851441
   0.      

# Model using Logistic Regression

In [18]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

stop_words = set(stopwords.words('english'))

In [19]:
from sklearn.model_selection import train_test_split

data_df = pd.read_csv("spam.csv")
data_df.head(3)

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...


In [20]:
data_X = data_df[['text']].values
data_y = data_df[['label']].values
X_train, X_test, y_train, y_test = train_test_split(data_X, data_y, test_size=0.2)

In [21]:
train_X = [x[0].strip() for x in X_train.tolist()]
test_X = [x[0].strip() for x in X_test.tolist()]

In [22]:
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words=stop_words)),
    ('clf', LogisticRegression())])
parameters = {
    'tfidf__max_df': (0.25, 0.5, 0.75),
    'tfidf__ngram_range': [(1, 1)]
}

In [23]:
grid_search_tune = GridSearchCV(
    pipeline, parameters, cv=2, n_jobs=2, verbose=3)
grid_search_tune.fit(train_X, y_train)

Fitting 2 folds for each of 3 candidates, totalling 6 fits


  y = column_or_1d(y, warn=True)


In [24]:
print("Best parameters set:")
print(grid_search_tune.best_params_)

Best parameters set:
{'tfidf__max_df': 0.25, 'tfidf__ngram_range': (1, 1)}


In [25]:
from sklearn import metrics

best_clf = grid_search_tune.best_estimator_
predictions = best_clf.predict(test_X)

print(metrics.classification_report(y_test, predictions))

              precision    recall  f1-score   support

         ham       0.97      1.00      0.98       958
        spam       0.99      0.79      0.88       157

    accuracy                           0.97      1115
   macro avg       0.98      0.89      0.93      1115
weighted avg       0.97      0.97      0.97      1115



# Test Predictions

In [26]:
best_clf.predict(["Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's"])

array(['spam'], dtype=object)

In [27]:
best_clf.predict(["Reply to claim your reward of $20,000"])

array(['spam'], dtype=object)

In [28]:
best_clf.predict(["This is to inform you that you have a meeting scheduled today at 4:00 pm"])

array(['ham'], dtype=object)

In [29]:
best_clf.predict(["Congratulations on successfully completing an online course on Machine Learning Application offered by Great Learning Academy."])

array(['ham'], dtype=object)