### Word Vectorization - CountVectorizer/TfidfVectorizer

In [2]:
import pandas as pd

In [3]:
import numpy as np

In [4]:
from nltk import word_tokenize

In [5]:
# First of three toy movie reviews that make up the example corpus.
review1 = "It is a good movie"

In [6]:
# Second toy review; the word "part" occurs twice, so its count will be 2.
review2 = "It has some good part and boring part"

In [7]:
# Third toy review; same wording as review1 except "terrible" replaces "good".
review3 = "It is a terrible movie"

### Tokenize the strings

In [8]:
# Split the first review into a list of word tokens with NLTK.
# NOTE(review): word_tokenize presumably needs NLTK's tokenizer data to be
# downloaded beforehand (nltk.download); no download cell is visible here — confirm.
review1_token = word_tokenize(review1)

In [9]:
# Tokenize the second review the same way as the first.
review2_token = word_tokenize(review2)

In [10]:
# Tokenize the third review the same way as the first.
review3_token = word_tokenize(review3)

In [11]:
review1_token

['It', 'is', 'a', 'good', 'movie']

In [12]:
# Vocabulary: every distinct token that occurs in any of the three reviews.
review_tokens = set(review1_token) | set(review2_token) | set(review3_token)

In [13]:
review_tokens

{'It',
 'a',
 'and',
 'boring',
 'good',
 'has',
 'is',
 'movie',
 'part',
 'some',
 'terrible'}

In [23]:
# Count table for review 1: one entry per vocabulary word, all starting at zero.
review1_dict = {word: 0 for word in review_tokens}

In [24]:
# Count table for review 2: one entry per vocabulary word, all starting at zero.
review2_dict = {word: 0 for word in review_tokens}

In [25]:
# Count table for review 3: one entry per vocabulary word, all starting at zero.
review3_dict = {word: 0 for word in review_tokens}

In [26]:
review1_dict

{'and': 0,
 'boring': 0,
 'terrible': 0,
 'some': 0,
 'It': 0,
 'has': 0,
 'part': 0,
 'movie': 0,
 'good': 0,
 'is': 0,
 'a': 0}

In [27]:
# Count how often each vocabulary word occurs in review 1.
# The table is reset to zeros first so that re-running this cell does not add
# the same tokens to the counts a second time.
review1_dict = dict.fromkeys(review_tokens, 0)
for token in review1_token:
    review1_dict[token] += 1

In [28]:
# Count how often each vocabulary word occurs in review 2.
# The table is reset to zeros first so that re-running this cell does not add
# the same tokens to the counts a second time.
review2_dict = dict.fromkeys(review_tokens, 0)
for token in review2_token:
    review2_dict[token] += 1

In [29]:
# Count how often each vocabulary word occurs in review 3.
# The table is reset to zeros first so that re-running this cell does not add
# the same tokens to the counts a second time.
review3_dict = dict.fromkeys(review_tokens, 0)
for token in review3_token:
    review3_dict[token] += 1

In [30]:
review1_dict

{'and': 0,
 'boring': 0,
 'terrible': 0,
 'some': 0,
 'It': 1,
 'has': 0,
 'part': 0,
 'movie': 1,
 'good': 1,
 'is': 1,
 'a': 1}

### Create a pandas DataFrame

In [31]:
# One row per review, one column per vocabulary word.
# The column order is pinned to the sorted vocabulary: the count dicts were
# built from a set, whose iteration order for strings can differ between
# interpreter runs, so the dict-derived order is not reproducible.
reviews_Dict_DF = pd.DataFrame(
    [review1_dict, review2_dict, review3_dict],
    columns=sorted(review_tokens),
)

In [33]:
reviews_Dict_DF

Unnamed: 0,It,a,and,boring,good,has,is,movie,part,some,terrible
0,1,1,0,0,1,0,1,1,0,0,0
1,1,0,1,1,1,1,0,0,2,1,0
2,1,1,0,0,0,0,1,1,0,0,1


### Using CountVectorizer from scikit-learn

In [37]:
from sklearn.feature_extraction.text import CountVectorizer

In [38]:
# The corpus as a list of raw strings (untokenized), in review order.
review_list = [review1, review2, review3]

In [39]:
# CountVectorizer with default settings. The vocabulary it yields in the cells
# that follow has 10 terms: 'it' appears lowercased and the one-letter token
# 'a' is absent, unlike the 11-token set built by hand earlier.
count_vect = CountVectorizer()

In [42]:
# Learn the vocabulary from the three reviews and build their term-count
# matrix in one step; the result is a SciPy sparse matrix with one row per review.
X_counts = count_vect.fit_transform(review_list)

In [43]:
type(X_counts)

scipy.sparse.csr.csr_matrix

In [44]:
X_counts.shape

(3, 10)

In [45]:
# Vocabulary terms in column order of X_counts.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2, so
# prefer get_feature_names_out() when the installed version provides it and
# fall back otherwise. list(...) keeps X_names a plain list in both cases.
X_names = list(
    count_vect.get_feature_names_out()
    if hasattr(count_vect, "get_feature_names_out")
    else count_vect.get_feature_names()
)

In [46]:
X_names

['and',
 'boring',
 'good',
 'has',
 'is',
 'it',
 'movie',
 'part',
 'some',
 'terrible']

In [48]:
# Densify the sparse count matrix and label each column with its vocabulary term.
pd_df = pd.DataFrame(
    data=X_counts.toarray(),
    columns=X_names,
)

In [49]:
pd_df

Unnamed: 0,and,boring,good,has,is,it,movie,part,some,terrible
0,0,0,1,0,1,1,1,0,0,0
1,1,1,1,1,0,1,0,2,1,0
2,0,0,0,0,1,1,1,0,0,1


In [50]:
X_counts.toarray()

array([[0, 0, 1, 0, 1, 1, 1, 0, 0, 0],
       [1, 1, 1, 1, 0, 1, 0, 2, 1, 0],
       [0, 0, 0, 0, 1, 1, 1, 0, 0, 1]], dtype=int64)

### Using TfidfVectorizer from scikit-learn

In [51]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [52]:
# TF-IDF vectorizer configured to lowercase the text, keep terms that occur in
# at least one document, and filter scikit-learn's built-in English stop-word
# list. With this corpus only 'boring', 'good', 'movie' and 'terrible' survive.
tf_vect = TfidfVectorizer(
    min_df=1,
    lowercase=True,
    stop_words='english',
)

In [53]:
# Learn the vocabulary and IDF weights from the three reviews and return their
# TF-IDF matrix; the result is a SciPy sparse matrix with one row per review.
tf_matrix = tf_vect.fit_transform(review_list)

In [54]:
type(tf_matrix)

scipy.sparse.csr.csr_matrix

In [55]:
tf_matrix.shape

(3, 4)

In [56]:
# Vocabulary terms in column order of tf_matrix.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2, so
# prefer get_feature_names_out() when the installed version provides it and
# fall back otherwise. list(...) keeps tf_names a plain list in both cases.
tf_names = list(
    tf_vect.get_feature_names_out()
    if hasattr(tf_vect, "get_feature_names_out")
    else tf_vect.get_feature_names()
)

In [57]:
tf_names

['boring', 'good', 'movie', 'terrible']

In [58]:
review_list

['It is a good movie',
 'It has some good part and boring part',
 'It is a terrible movie']

In [60]:
# Densify the sparse TF-IDF matrix and label each column with its vocabulary term.
tf_df = pd.DataFrame(
    data=tf_matrix.toarray(),
    columns=tf_names,
)

In [61]:
tf_df

Unnamed: 0,boring,good,movie,terrible
0,0.0,0.707107,0.707107,0.0
1,0.795961,0.605349,0.0,0.0
2,0.0,0.0,0.605349,0.795961


In [62]:
tf_matrix[0]

<1x4 sparse matrix of type '<class 'numpy.float64'>'
	with 2 stored elements in Compressed Sparse Row format>

In [63]:
# Map the first row of the TF-IDF matrix back to the vocabulary terms that have
# a stored (non-zero) weight in it.
tf_vect.inverse_transform(tf_matrix[0])

[array(['good', 'movie'], dtype='<U8')]

In [64]:
review1_token

['It', 'is', 'a', 'good', 'movie']