# Converting words to features

In [11]:
import pandas as pd

In [12]:
import re

import pandas as pd
import nltk
#nltk.download()
from nltk.stem.wordnet import WordNetLemmatizer

## Read the data into the notebook

In [13]:
movie = pd.read_csv('imdb1.csv')

## Removing stopwords

In [14]:
# Load NLTK's English stop-word list into a set so that membership tests
# and unions with extra words are cheap
from nltk.corpus import stopwords
stop_words = {word for word in stopwords.words("english")}
stop_words

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

In [15]:
# Corpus-specific stop words: terms that are too generic in movie reviews to be
# informative ("br" is presumably residue of HTML <br /> tags -- confirm against the raw data).
new_words = ["some","one","like","time","br","movie","film","could","good",'even', 'get', 'would',
             'make', 'really', 'see', 'well', 'much', 'great', 'first', 'people', 'also', 'bad', 
             'show', 'way', 'thing', 'made', 'go', 'think', 'know', 'watch','look','many']
# union() returns a new set, so re-running this cell leaves stop_words unchanged.
stop_words = stop_words.union(new_words)

## Perform text pre-processing in a loop for the whole corpus

In [16]:
#Number of reviews
movie.shape[0]

2000

In [17]:
# Clean every review: strip punctuation, lower-case, drop stop words, lemmatize.
# The result is one space-joined string per review, in the same order as `movie`.
lm = WordNetLemmatizer()  # loop-invariant, so build it once instead of once per review

corpus = []
# Iterate over the values themselves: the label lookup movie['review'][i] only works
# while the index is exactly 0..n-1 and fails after any row filtering or re-indexing.
for review in movie['review']:
    # Replace anything that is not a letter or digit with a space
    text = re.sub('[^a-zA-Z0-9]', ' ', review)

    # Lower-case so tokens match the lower-case stop-word list
    text = text.lower()

    # Split on whitespace into a list of tokens
    tokens = text.split()

    # Drop stop words, then reduce each remaining token to its WordNet lemma
    lemmas = [lm.lemmatize(word) for word in tokens if word not in stop_words]
    corpus.append(" ".join(lemmas))

In [18]:
corpus[300]

'hardly compare space adventure star war star trek fan star trek recognize pale comparison series trekkies ooze fact place back future better light term space travel story boy captured space raider pirate obviously fake unentertaining battle captured boy befriends pirate help slowly raider die end boy get return home last remaining pirate escape gravely wounded acting obvious total lack interesting dialogue effect storyline got 80 minute beyond want take shot involves space ahead warned'

## Creating the document term matrix - toy data

* `CountVectorizer` converts a collection of text documents to a matrix of token counts (a document-term matrix)

* This implementation produces a sparse representation of the counts using scipy.sparse.csr_matrix.

* If you do not provide an a-priori dictionary and you do not use an analyzer that does some kind of feature selection then the number of features will be equal to the vocabulary size found by analyzing the data.

In [19]:
from sklearn.feature_extraction.text import CountVectorizer
import re

In [20]:
#Let's understand the count vectoriser, starting from one with all default settings
cv = CountVectorizer()

In [21]:
# Learn the vocabulary of three toy documents and encode them as a
# sparse document-term matrix of token counts
toy_docs = [
    "A wonderful production",
    "This was a wonderful way",
    "wonderful portrait about human relations",
]
test_cv = cv.fit_transform(toy_docs)

test_cv

<3x9 sparse matrix of type '<class 'numpy.int64'>'
	with 11 stored elements in Compressed Sparse Row format>

In [22]:
#View the sparse document-term matrix as a dense array (one row per document, one column per vocabulary term)
count_vect_array = test_cv.toarray()
count_vect_array

array([[0, 0, 0, 1, 0, 0, 0, 0, 1],
       [0, 0, 0, 0, 0, 1, 1, 1, 1],
       [1, 1, 1, 0, 1, 0, 0, 0, 1]], dtype=int64)

In [23]:
# Get the vocabulary learned by cv (one entry per column of the document-term matrix).
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2 in favour of
# get_feature_names_out(); fall back to the old method only on older installs.
# .tolist() keeps the displayed result a plain list of str on every version.
cv.get_feature_names_out().tolist() if hasattr(cv, "get_feature_names_out") else cv.get_feature_names()

['about',
 'human',
 'portrait',
 'production',
 'relations',
 'this',
 'was',
 'way',
 'wonderful']

In [24]:
### Convert to dataframe
# Label each count column with its vocabulary term.
# get_feature_names() was removed in scikit-learn 1.2; prefer get_feature_names_out()
# (available since 1.0) and fall back to the old method only on older installs.
count_vect_df = pd.DataFrame(
    count_vect_array,
    columns=(cv.get_feature_names_out() if hasattr(cv, "get_feature_names_out")
             else cv.get_feature_names()),
)
count_vect_df

Unnamed: 0,about,human,portrait,production,relations,this,was,way,wonderful
0,0,0,0,1,0,0,0,0,1
1,0,0,0,0,0,1,1,1,1
2,1,1,1,0,1,0,0,0,1


In [25]:
# Corpus-wide frequency of each vocabulary term: add up every column
# (column-wise summation is DataFrame.sum's default direction)
count_vect_df.sum()

about         1
human         1
portrait      1
production    1
relations     1
this          1
was           1
way           1
wonderful     3
dtype: int64

### Creating N-grams

In [105]:
# Cast the review column to str in place -- presumably so the tokenizer below
# always receives text even if some rows are not strings (TODO confirm).
movie['review'] = movie['review'].astype('str')

In [109]:
# Split the review stored under index label 1 into word and punctuation tokens
from nltk import word_tokenize
tokens = word_tokenize(movie.loc[1, 'review'])

In [110]:
from nltk.util import ngrams
# Build all 2-token sequences (each token paired with its successor) from the tokenised review.
# NOTE(review): nltk documents ngrams() as returning an iterator, so it can be looped
# over only once -- wrap it in list() if the bigrams need to be reused.
bigrams = ngrams(tokens,2)

In [111]:
# Print every bigram of the tokenised review, one tuple per line
for b in bigrams:
    print(b)

('The', 'first')
('first', 'in')
('in', 'the')
('the', 'series')
('series', 'was')
('was', 'brilliant')
('brilliant', ',')
(',', 'easily')
('easily', 'one')
('one', 'of')
('of', 'the')
('the', 'best')
('best', 'Horror')
('Horror', 'films')
('films', 'of')
('of', 'all')
('all', 'time')
('time', '.')
('.', 'This')
('This', 'is')
('is', 'the')
('the', 'crappiest')
('crappiest', '.')
('.', 'When')
('When', 'I')
('I', 'sat')
('sat', 'down')
('down', 'to')
('to', 'watch')
('watch', 'this')
('this', ',')
(',', 'I')
('I', 'was')
('was', 'actually')
('actually', 'thinking')
('thinking', 'that')
('that', 'how')
('how', 'bad')
('bad', 'the')
('the', 'fourth')
('fourth', 'and')
('and', 'fifth')
('fifth', 'ones')
('ones', 'were')
('were', ',')
(',', 'this')
('this', 'would')
('would', 'have')
('have', 'to')
('to', 'be')
('be', 'good')
('good', 'after')
('after', 'the')
('the', 'previous')
('previous', 'terrible')
('terrible', 'ones')
('ones', '.')
('.', 'Boy')
('Boy', 'was')
('was', 'I')
('I', 'wro

In [112]:
# Build all 3-token sequences from the same tokenised review
trigrams = ngrams(tokens,3)

In [113]:
# Print every trigram of the tokenised review, one tuple per line
for b in trigrams:
    print(b)

('The', 'first', 'in')
('first', 'in', 'the')
('in', 'the', 'series')
('the', 'series', 'was')
('series', 'was', 'brilliant')
('was', 'brilliant', ',')
('brilliant', ',', 'easily')
(',', 'easily', 'one')
('easily', 'one', 'of')
('one', 'of', 'the')
('of', 'the', 'best')
('the', 'best', 'Horror')
('best', 'Horror', 'films')
('Horror', 'films', 'of')
('films', 'of', 'all')
('of', 'all', 'time')
('all', 'time', '.')
('time', '.', 'This')
('.', 'This', 'is')
('This', 'is', 'the')
('is', 'the', 'crappiest')
('the', 'crappiest', '.')
('crappiest', '.', 'When')
('.', 'When', 'I')
('When', 'I', 'sat')
('I', 'sat', 'down')
('sat', 'down', 'to')
('down', 'to', 'watch')
('to', 'watch', 'this')
('watch', 'this', ',')
('this', ',', 'I')
(',', 'I', 'was')
('I', 'was', 'actually')
('was', 'actually', 'thinking')
('actually', 'thinking', 'that')
('thinking', 'that', 'how')
('that', 'how', 'bad')
('how', 'bad', 'the')
('bad', 'the', 'fourth')
('the', 'fourth', 'and')
('fourth', 'and', 'fifth')
('and', 

# Create a vocabulary of words for the movie reviews

### Count vectoriser arguments

In [63]:
# Build the vocabulary from the cleaned corpus, eliminating stop words.
# No document-frequency filtering (max_df / min_df) is applied in this cell.
# stop_words is a set; scikit-learn documents this parameter as 'english', a list or None,
# and releases from 1.2 on reject a set at fit time, so pass a list.
cv=CountVectorizer(stop_words=list(stop_words))
X=cv.fit_transform(corpus)

In [64]:
print(X.shape)

(2000, 22296)


In [65]:
# Same vocabulary, but counting unigrams, bigrams and trigrams (ngram_range=(1,3)).
# stop_words must be a list (a set is rejected by scikit-learn >= 1.2).
cv=CountVectorizer(stop_words=list(stop_words),
                  ngram_range=(1,3))
X=cv.fit_transform(corpus)

In [66]:
print(X.shape)

(2000, 392989)


## How to change the number of words in the vocabulary

### Directly provide a dimension using max_features

In [67]:
# Cap the vocabulary at 10,000 terms with max_features.
# stop_words must be a list (a set is rejected by scikit-learn >= 1.2).
cv=CountVectorizer(max_features=10000,
                   stop_words=list(stop_words),
                  ngram_range=(1,3))
X=cv.fit_transform(corpus)

In [68]:
print(X.shape)

(2000, 10000)


### Ignore terms that appear in most documents (corpus-specific stop words) using `max_df`

In [71]:
# max_df=0.7: ignore terms whose document frequency is above 70% of the reviews.
# The resulting shape equals the unfiltered 1-3 gram vocabulary, i.e. no remaining
# term is that common once the custom stop words have been removed.
# stop_words must be a list (a set is rejected by scikit-learn >= 1.2).
cv=CountVectorizer(max_df = 0.7,
                   stop_words=list(stop_words),
                  ngram_range=(1,3))
X=cv.fit_transform(corpus)

In [72]:
print(X.shape)

(2000, 392989)


### Ignore terms that are rare in the corpus - Using min_df

In [73]:
# min_df=0.01: ignore terms that occur in fewer than 1% of the reviews.
# stop_words must be a list (a set is rejected by scikit-learn >= 1.2).
cv=CountVectorizer(min_df = 0.01,
                   stop_words=list(stop_words),
                  ngram_range=(1,3))
X=cv.fit_transform(corpus)

In [74]:
print(X.shape)

(2000, 1674)


### Let us view a part of the DTM

In [120]:
# Densify the document-term matrix of the movie corpus.
# Dedicated names are used so the toy `count_vect_array` / `count_vect_df` are not
# overwritten: a later cell displays `count_vect_df` to compare it with the tf-idf toy
# frame, and on a top-to-bottom run it would otherwise show this movie matrix instead.
movie_count_array = X.toarray()


### Convert to dataframe

# get_feature_names() was removed in scikit-learn 1.2; prefer get_feature_names_out()
# (available since 1.0) and fall back to the old method only on older installs.
movie_count_df = pd.DataFrame(
    movie_count_array,
    columns=(cv.get_feature_names_out() if hasattr(cv, "get_feature_names_out")
             else cv.get_feature_names()),
)
movie_count_df.head()

Unnamed: 0,10,10 10,100,11,12,13,15,20,30,40,...,yes,yet,yet another,york,young,young man,younger,youth,zero,zombie
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## TF IDF Vectorizer

In [75]:
from sklearn.feature_extraction.text import TfidfVectorizer

#Let's understand the TF-IDF vectoriser, starting from one with all default settings
tf = TfidfVectorizer()

In [76]:
# Learn vocabulary and IDF weights from the same three toy documents and
# encode them as a sparse TF-IDF matrix
toy_docs = [
    "A wonderful production",
    "This was a wonderful way",
    "wonderful portrait about human relations",
]
test_tf = tf.fit_transform(toy_docs)

In [77]:
#View the sparse TF-IDF matrix as a dense array (one row per document, one column per vocabulary term)
tf_array = test_tf.toarray()
tf_array

array([[0.        , 0.        , 0.        , 0.861037  , 0.        ,
        0.        , 0.        , 0.        , 0.50854232],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.54645401, 0.54645401, 0.54645401, 0.32274454],
       [0.47952794, 0.47952794, 0.47952794, 0.        , 0.47952794,
        0.        , 0.        , 0.        , 0.28321692]])

In [78]:
# Label each TF-IDF column with its vocabulary term.
# The former bare tf.get_feature_names() call was dropped: it was not the last
# expression of the cell, so its result was never displayed.
# get_feature_names() was removed in scikit-learn 1.2; prefer get_feature_names_out()
# (available since 1.0) and fall back to the old method only on older installs.
tf_df = pd.DataFrame(
    tf_array,
    columns=(tf.get_feature_names_out() if hasattr(tf, "get_feature_names_out")
             else tf.get_feature_names()),
)
tf_df

Unnamed: 0,about,human,portrait,production,relations,this,was,way,wonderful
0,0.0,0.0,0.0,0.861037,0.0,0.0,0.0,0.0,0.508542
1,0.0,0.0,0.0,0.0,0.0,0.546454,0.546454,0.546454,0.322745
2,0.479528,0.479528,0.479528,0.0,0.479528,0.0,0.0,0.0,0.283217


## Let us compare this with the dataframe from count vectoriser

In [79]:
count_vect_df

Unnamed: 0,about,human,portrait,production,relations,this,was,way,wonderful
0,0,0,0,1,0,0,0,0,1
1,0,0,0,0,0,1,1,1,1
2,1,1,1,0,1,0,0,0,1


## Applying tf-idf vectoriser to the corpus

In [84]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Configure (not yet fit) the TF-IDF model for the movie corpus:
# 1-3 grams, stop words removed, and terms occurring in fewer than 0.1% of reviews ignored.
# stop_words must be a list (a set is rejected by scikit-learn >= 1.2).
tfv = TfidfVectorizer(stop_words=list(stop_words),
                      ngram_range=(1,3),
                      min_df = 0.001)

In [85]:
# Tokenize the corpus, build the vocabulary and return the TF-IDF document-term matrix
tfmat = tfv.fit_transform(corpus)

In [86]:
# Vocabulary learned by the TF-IDF model, as a plain list.
# get_feature_names() was removed in scikit-learn 1.2; prefer get_feature_names_out()
# (available since 1.0) and fall back to the old method only on older installs.
feature_names = list(tfv.get_feature_names_out() if hasattr(tfv, "get_feature_names_out")
                     else tfv.get_feature_names())

In [87]:
len(feature_names)

23815

In [132]:
# Densify the TF-IDF matrix of the movie corpus (one row per review, one column per term)
tfidf_vect_array = tfmat.toarray()


### Convert to dataframe

# get_feature_names() was removed in scikit-learn 1.2; prefer get_feature_names_out()
# (available since 1.0) and fall back to the old method only on older installs.
tfidf_vect_df = pd.DataFrame(
    tfidf_vect_array,
    columns=(tfv.get_feature_names_out() if hasattr(tfv, "get_feature_names_out")
             else tfv.get_feature_names()),
)
tfidf_vect_df.head()

Unnamed: 0,00,000,000 000,01,06,08,10,10 000,10 10,10 10 star,...,zip,zippy,zoey,zombie,zombie appear,zombie attack,zombie chronicle,zone,zone episode,zoom
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
