# Text Preprocessing:

In [69]:
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize 
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk import ngrams
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [44]:
input_str = "John Wick (retrospectively known as John Wick: Chapter 1) is a 2014 American neo-noir action thriller film directed by Chad Stahelski and written by Derek Kolstad. It stars Keanu Reeves, Michael Nyqvist, Alfie Allen, Adrianne Palicki, Bridget Moynahan, Dean Winters, Ian McShane, John Leguizamo, and Willem Dafoe.It grossed $88 million worldwide against a production budget of $20 million."
input_str

'John Wick (retrospectively known as John Wick: Chapter 1) is a 2014 American neo-noir action thriller film directed by Chad Stahelski and written by Derek Kolstad. It stars Keanu Reeves, Michael Nyqvist, Alfie Allen, Adrianne Palicki, Bridget Moynahan, Dean Winters, Ian McShane, John Leguizamo, and Willem Dafoe.It grossed $88 million worldwide against a production budget of $20 million.'

### Convert text to lowercase

In [45]:
lower_str = input_str.lower()
lower_str

'john wick (retrospectively known as john wick: chapter 1) is a 2014 american neo-noir action thriller film directed by chad stahelski and written by derek kolstad. it stars keanu reeves, michael nyqvist, alfie allen, adrianne palicki, bridget moynahan, dean winters, ian mcshane, john leguizamo, and willem dafoe.it grossed $88 million worldwide against a production budget of $20 million.'

### Remove punctuation

In [46]:
punc_free = input_str.translate(str.maketrans('', '', string.punctuation))
punc_free

'John Wick retrospectively known as John Wick Chapter 1 is a 2014 American neonoir action thriller film directed by Chad Stahelski and written by Derek Kolstad It stars Keanu Reeves Michael Nyqvist Alfie Allen Adrianne Palicki Bridget Moynahan Dean Winters Ian McShane John Leguizamo and Willem DafoeIt grossed 88 million worldwide against a production budget of 20 million'

### Remove stop words

Remove stop words using Natural Language Toolkit (NLTK), a suite of libraries and programs for symbolic and statistical natural language  processing.

In [47]:
# setting stop words for English language
stop_words = set(stopwords.words('english'))

tokens = word_tokenize(punc_free)
result = [i for i in tokens if not i in stop_words]
result

['John',
 'Wick',
 'retrospectively',
 'known',
 'John',
 'Wick',
 'Chapter',
 '1',
 '2014',
 'American',
 'neonoir',
 'action',
 'thriller',
 'film',
 'directed',
 'Chad',
 'Stahelski',
 'written',
 'Derek',
 'Kolstad',
 'It',
 'stars',
 'Keanu',
 'Reeves',
 'Michael',
 'Nyqvist',
 'Alfie',
 'Allen',
 'Adrianne',
 'Palicki',
 'Bridget',
 'Moynahan',
 'Dean',
 'Winters',
 'Ian',
 'McShane',
 'John',
 'Leguizamo',
 'Willem',
 'DafoeIt',
 'grossed',
 '88',
 'million',
 'worldwide',
 'production',
 'budget',
 '20',
 'million']

### Stemming

 Process of reducing words to their word stem, base or root form 

In [49]:
stemmer = PorterStemmer()
stem_str = word_tokenize(punc_free)
for word in stem_str:
    print(stemmer.stem(word))

john
wick
retrospect
known
as
john
wick
chapter
1
is
a
2014
american
neonoir
action
thriller
film
direct
by
chad
stahelski
and
written
by
derek
kolstad
It
star
keanu
reev
michael
nyqvist
alfi
allen
adriann
palicki
bridget
moynahan
dean
winter
ian
mcshane
john
leguizamo
and
willem
dafoeit
gross
88
million
worldwid
against
a
product
budget
of
20
million


### Lemmatization

Uses lexical knowledge bases to get the correct base forms of words

In [51]:
lemmatizer = WordNetLemmatizer()
lemm_str = word_tokenize(punc_free)
for word in lemm_str:
    print(lemmatizer.lemmatize(word))

John
Wick
retrospectively
known
a
John
Wick
Chapter
1
is
a
2014
American
neonoir
action
thriller
film
directed
by
Chad
Stahelski
and
written
by
Derek
Kolstad
It
star
Keanu
Reeves
Michael
Nyqvist
Alfie
Allen
Adrianne
Palicki
Bridget
Moynahan
Dean
Winters
Ian
McShane
John
Leguizamo
and
Willem
DafoeIt
grossed
88
million
worldwide
against
a
production
budget
of
20
million


### Tokenization

 Process of splitting the given text into smaller pieces called tokens

In [52]:
tokens = word_tokenize(input_str)
tokens

['John',
 'Wick',
 '(',
 'retrospectively',
 'known',
 'as',
 'John',
 'Wick',
 ':',
 'Chapter',
 '1',
 ')',
 'is',
 'a',
 '2014',
 'American',
 'neo-noir',
 'action',
 'thriller',
 'film',
 'directed',
 'by',
 'Chad',
 'Stahelski',
 'and',
 'written',
 'by',
 'Derek',
 'Kolstad',
 '.',
 'It',
 'stars',
 'Keanu',
 'Reeves',
 ',',
 'Michael',
 'Nyqvist',
 ',',
 'Alfie',
 'Allen',
 ',',
 'Adrianne',
 'Palicki',
 ',',
 'Bridget',
 'Moynahan',
 ',',
 'Dean',
 'Winters',
 ',',
 'Ian',
 'McShane',
 ',',
 'John',
 'Leguizamo',
 ',',
 'and',
 'Willem',
 'Dafoe.It',
 'grossed',
 '$',
 '88',
 'million',
 'worldwide',
 'against',
 'a',
 'production',
 'budget',
 'of',
 '$',
 '20',
 'million',
 '.']

### N-grams

In [59]:
n = 3
three_grams = ngrams(lower_str.split(), n)

for grams in three_grams:
    print(grams)

('john', 'wick', '(retrospectively')
('wick', '(retrospectively', 'known')
('(retrospectively', 'known', 'as')
('known', 'as', 'john')
('as', 'john', 'wick:')
('john', 'wick:', 'chapter')
('wick:', 'chapter', '1)')
('chapter', '1)', 'is')
('1)', 'is', 'a')
('is', 'a', '2014')
('a', '2014', 'american')
('2014', 'american', 'neo-noir')
('american', 'neo-noir', 'action')
('neo-noir', 'action', 'thriller')
('action', 'thriller', 'film')
('thriller', 'film', 'directed')
('film', 'directed', 'by')
('directed', 'by', 'chad')
('by', 'chad', 'stahelski')
('chad', 'stahelski', 'and')
('stahelski', 'and', 'written')
('and', 'written', 'by')
('written', 'by', 'derek')
('by', 'derek', 'kolstad.')
('derek', 'kolstad.', 'it')
('kolstad.', 'it', 'stars')
('it', 'stars', 'keanu')
('stars', 'keanu', 'reeves,')
('keanu', 'reeves,', 'michael')
('reeves,', 'michael', 'nyqvist,')
('michael', 'nyqvist,', 'alfie')
('nyqvist,', 'alfie', 'allen,')
('alfie', 'allen,', 'adrianne')
('allen,', 'adrianne', 'palicki,

### Document term matrix representation

In [68]:
docs =["john wick (retrospectively known as john wick: chapter 1) is a 2014 american neo-noir action thriller film directed by chad stahelski and written by derek kolstad",
       "it stars keanu reeves, michael nyqvist, alfie allen, adrianne palicki, bridget moynahan, dean winters, ian mcshane, john leguizamo, and willem dafoe" ,
       "it grossed $88 million worldwide against a production budget of $20 million"]
vec = CountVectorizer()
X = vec.fit_transform(docs)

df = pd.DataFrame(X.toarray(), columns=vec.get_feature_names())
df

Unnamed: 0,20,2014,88,action,adrianne,against,alfie,allen,american,and,as,bridget,budget,by,chad,chapter,dafoe,dean,derek,directed,film,grossed,ian,is,it,john,keanu,known,kolstad,leguizamo,mcshane,michael,million,moynahan,neo,noir,nyqvist,of,palicki,production,reeves,retrospectively,stahelski,stars,thriller,wick,willem,winters,worldwide,written
0,0,1,0,1,0,0,0,0,1,1,1,0,0,2,1,1,0,0,1,1,1,0,0,1,0,2,0,1,1,0,0,0,0,0,1,1,0,0,0,0,0,1,1,0,1,2,0,0,0,1
1,0,0,0,0,1,0,1,1,0,1,0,1,0,0,0,0,1,1,0,0,0,0,1,0,1,1,1,0,0,1,1,1,0,1,0,0,1,0,1,0,1,0,0,1,0,0,1,1,0,0
2,1,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,2,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,1,0


### TF-IDF

In [72]:
docs =["john wick (retrospectively known as john wick: chapter 1) is a 2014 american neo-noir action thriller film directed by chad stahelski and written by derek kolstad",
       "it stars keanu reeves, michael nyqvist, alfie allen, adrianne palicki, bridget moynahan, dean winters, ian mcshane, john leguizamo, and willem dafoe" ,
       "it grossed $88 million worldwide against a production budget of $20 million"]

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(docs)

df = pd.DataFrame(X.toarray(), columns=vec.get_feature_names())
df

Unnamed: 0,20,2014,88,action,adrianne,against,alfie,allen,american,and,as,bridget,budget,by,chad,chapter,dafoe,dean,derek,directed,film,grossed,ian,is,it,john,keanu,known,kolstad,leguizamo,mcshane,michael,million,moynahan,neo,noir,nyqvist,of,palicki,production,reeves,retrospectively,stahelski,stars,thriller,wick,willem,winters,worldwide,written
0,0.0,0.186042,0.0,0.186042,0.0,0.0,0.0,0.0,0.186042,0.14149,0.186042,0.0,0.0,0.372084,0.186042,0.186042,0.0,0.0,0.186042,0.186042,0.186042,0.0,0.0,0.186042,0.0,0.28298,0.0,0.186042,0.186042,0.0,0.0,0.0,0.0,0.0,0.186042,0.186042,0.0,0.0,0.0,0.0,0.0,0.186042,0.186042,0.0,0.186042,0.372084,0.0,0.0,0.0,0.186042
1,0.0,0.0,0.0,0.0,0.225102,0.0,0.225102,0.225102,0.0,0.171196,0.0,0.225102,0.0,0.0,0.0,0.0,0.225102,0.225102,0.0,0.0,0.0,0.0,0.225102,0.0,0.171196,0.171196,0.225102,0.0,0.0,0.225102,0.225102,0.225102,0.0,0.225102,0.0,0.0,0.225102,0.0,0.225102,0.0,0.225102,0.0,0.0,0.225102,0.0,0.0,0.225102,0.225102,0.0,0.0
2,0.28196,0.0,0.28196,0.0,0.0,0.28196,0.0,0.0,0.0,0.0,0.0,0.0,0.28196,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.28196,0.0,0.0,0.214438,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.56392,0.0,0.0,0.0,0.0,0.28196,0.0,0.28196,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.28196,0.0
