In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Using Bag of Words (BoW) Representation

In [2]:
import pandas as pd
import numpy as np
df = pd.read_csv('/content/drive/MyDrive/Machine_Learning/NLP - 1/Datasets _ Saved Models/IMDB-Movie-Data.csv')
df.head()

Unnamed: 0,Rank,Title,Genre,Description,Director,Actors,Year,Runtime (Minutes),Rating,Votes,Revenue (Millions),Metascore
0,1,Guardians of the Galaxy,"Action,Adventure,Sci-Fi",A group of intergalactic criminals are forced ...,James Gunn,"Chris Pratt, Vin Diesel, Bradley Cooper, Zoe S...",2014,121,8.1,757074,333.13,76.0
1,2,Prometheus,"Adventure,Mystery,Sci-Fi","Following clues to the origin of mankind, a te...",Ridley Scott,"Noomi Rapace, Logan Marshall-Green, Michael Fa...",2012,124,7.0,485820,126.46,65.0
2,3,Split,"Horror,Thriller",Three girls are kidnapped by a man with a diag...,M. Night Shyamalan,"James McAvoy, Anya Taylor-Joy, Haley Lu Richar...",2016,117,7.3,157606,138.12,62.0
3,4,Sing,"Animation,Comedy,Family","In a city of humanoid animals, a hustling thea...",Christophe Lourdelet,"Matthew McConaughey,Reese Witherspoon, Seth Ma...",2016,108,7.2,60545,270.32,59.0
4,5,Suicide Squad,"Action,Adventure,Fantasy",A secret government agency recruits some of th...,David Ayer,"Will Smith, Jared Leto, Margot Robbie, Viola D...",2016,123,6.2,393727,325.02,40.0


In [3]:
df.shape

(1000, 12)

In [4]:
type(df.loc[0, 'Genre'])

str

In [5]:
df.loc[0:5, 'Genre'].str.split(',')

0     [Action, Adventure, Sci-Fi]
1    [Adventure, Mystery, Sci-Fi]
2              [Horror, Thriller]
3     [Animation, Comedy, Family]
4    [Action, Adventure, Fantasy]
5    [Action, Adventure, Fantasy]
Name: Genre, dtype: object

We will use the function TransactionEncoder() from Machine Learning Extensions (mlxtend) to convert the unstructured data to a tabular (structured) representation.

In [6]:
#Note: The TransactionEncoder needs the input as a list of lists or an array of lists
data = df['Genre'].str.split(',') #This would be the input to mlxtend.TransactionEncoder
data

0       [Action, Adventure, Sci-Fi]
1      [Adventure, Mystery, Sci-Fi]
2                [Horror, Thriller]
3       [Animation, Comedy, Family]
4      [Action, Adventure, Fantasy]
                   ...             
995         [Crime, Drama, Mystery]
996                        [Horror]
997         [Drama, Music, Romance]
998             [Adventure, Comedy]
999       [Comedy, Family, Fantasy]
Name: Genre, Length: 1000, dtype: object

In [7]:
#This expects the input as an array of lists or as a list of lists
from mlxtend.preprocessing import TransactionEncoder
te = TransactionEncoder()
te.fit(data).transform(data)#Returns a boolean table

array([[ True,  True, False, ..., False, False, False],
       [False,  True, False, ..., False, False, False],
       [False, False, False, ...,  True, False, False],
       ...,
       [False, False, False, ..., False, False, False],
       [False,  True, False, ..., False, False, False],
       [False, False, False, ..., False, False, False]])

In [8]:
#Let's convert it to int
te.fit(data).transform(data).astype(int)

array([[1, 1, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 1, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [9]:
te.columns_ #Unique column names (All genres determined from the input)

['Action',
 'Adventure',
 'Animation',
 'Biography',
 'Comedy',
 'Crime',
 'Drama',
 'Family',
 'Fantasy',
 'History',
 'Horror',
 'Music',
 'Musical',
 'Mystery',
 'Romance',
 'Sci-Fi',
 'Sport',
 'Thriller',
 'War',
 'Western']

In [11]:
tr_data = pd.DataFrame(te.fit(data).transform(data).astype('int'), columns= te.columns_)
tr_data
# 0 --> Absence of a feature
# 1 --> Presence

Unnamed: 0,Action,Adventure,Animation,Biography,Comedy,Crime,Drama,Family,Fantasy,History,Horror,Music,Musical,Mystery,Romance,Sci-Fi,Sport,Thriller,War,Western
0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0
3,0,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
4,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0,0,0,0,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0
996,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
997,0,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0
998,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


## Note: TransactionEncoder() returns only the presence (True) or absence (False) for each column. Even if a word is repeated multiple times in a piece of text, it will just return a True for the word.

# Term Frequency (TF)

> To make a term frequency (count of each word) matrix, you can use CountVectorizer() from sklearn

> It is important to note that the CountVectorizer() expects the input as a list of strings or an array of strings.

In [12]:
from sklearn.feature_extraction.text import CountVectorizer

In [13]:
# For a sample, Genre will not be repeated.
df.loc[:5,'Genre'].head()

0     Action,Adventure,Sci-Fi
1    Adventure,Mystery,Sci-Fi
2             Horror,Thriller
3     Animation,Comedy,Family
4    Action,Adventure,Fantasy
Name: Genre, dtype: object

In [14]:
# So, we will see it application on the Description of the movies.
df.loc[:5,'Description'].head() #This would be the input to CountVectorizer()

0    A group of intergalactic criminals are forced ...
1    Following clues to the origin of mankind, a te...
2    Three girls are kidnapped by a man with a diag...
3    In a city of humanoid animals, a hustling thea...
4    A secret government agency recruits some of th...
Name: Description, dtype: object

In [15]:
cv = CountVectorizer()
#df['Description'].values[1]
cv.fit(df.loc[:5,'Description'].values)
cv.transform(df.loc[:5,'Description'].values) # Returns a sparse array

<6x115 sparse matrix of type '<class 'numpy.int64'>'
	with 137 stored elements in Compressed Sparse Row format>

In [16]:
cv.transform(df['Description'].values).todense() #Convert it to dense representation

matrix([[0, 0, 0, ..., 0, 1, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [1, 1, 0, ..., 1, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 1, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]])

In [17]:
cv.vocabulary_# The unique words are stored in a dictionary
# The key is the word, the value is the index
# You can see that the dictionary is not sorted on the values (indexes).

{'group': 53,
 'of': 77,
 'intergalactic': 62,
 'criminals': 25,
 'are': 9,
 'forced': 45,
 'to': 104,
 'work': 113,
 'together': 105,
 'stop': 91,
 'fanatical': 37,
 'warrior': 110,
 'from': 48,
 'taking': 94,
 'control': 23,
 'the': 99,
 'universe': 107,
 'following': 42,
 'clues': 21,
 'origin': 79,
 'mankind': 67,
 'team': 96,
 'finds': 40,
 'structure': 92,
 'on': 78,
 'distant': 30,
 'moon': 71,
 'but': 17,
 'they': 102,
 'soon': 90,
 'realize': 82,
 'not': 76,
 'alone': 4,
 'three': 103,
 'girls': 49,
 'kidnapped': 64,
 'by': 18,
 'man': 66,
 'with': 112,
 'diagnosed': 29,
 '23': 0,
 'distinct': 31,
 'personalities': 80,
 'must': 73,
 'try': 106,
 'escape': 34,
 'before': 15,
 'apparent': 8,
 'emergence': 33,
 'frightful': 47,
 'new': 75,
 '24th': 1,
 'in': 60,
 'city': 20,
 'humanoid': 57,
 'animals': 5,
 'hustling': 58,
 'theater': 100,
 'impresario': 59,
 'attempt': 11,
 'save': 85,
 'his': 55,
 'singing': 88,
 'competition': 22,
 'becomes': 14,
 'grander': 51,
 'than': 97,
 

In [18]:
sorted(cv.vocabulary_)
# Sort it based on the values of the keys (ascending order)

['23',
 '24th',
 'against',
 'agency',
 'alone',
 'animals',
 'anticipates',
 'apocalypse',
 'apparent',
 'are',
 'as',
 'attempt',
 'be',
 'become',
 'becomes',
 'before',
 'black',
 'but',
 'by',
 'china',
 'city',
 'clues',
 'competition',
 'control',
 'creatures',
 'criminals',
 'dangerous',
 'defense',
 'defensive',
 'diagnosed',
 'distant',
 'distinct',
 'embroiled',
 'emergence',
 'escape',
 'european',
 'even',
 'fanatical',
 'finalists',
 'find',
 'finds',
 'first',
 'following',
 'for',
 'force',
 'forced',
 'form',
 'frightful',
 'from',
 'girls',
 'government',
 'grander',
 'great',
 'group',
 'he',
 'his',
 'horde',
 'humanoid',
 'hustling',
 'impresario',
 'in',
 'incarcerated',
 'intergalactic',
 'its',
 'kidnapped',
 'lives',
 'man',
 'mankind',
 'mercenaries',
 'mission',
 'monstrous',
 'moon',
 'most',
 'must',
 'never',
 'new',
 'not',
 'of',
 'on',
 'origin',
 'personalities',
 'powder',
 'realize',
 'recruits',
 'same',
 'save',
 'searching',
 'secret',
 'singing',

In [20]:
#Putting everything in a dataframe
str_df = pd.DataFrame(cv.transform(df['Description'].values).todense(), columns = sorted(cv.vocabulary_))
str_df

Unnamed: 0,23,24th,against,agency,alone,animals,anticipates,apocalypse,apparent,are,...,together,try,universe,villains,wall,warrior,will,with,work,world
0,0,0,0,0,0,0,0,0,0,1,...,1,0,1,0,0,1,0,0,1,0
1,0,0,0,0,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,1,1,0,0,0,0,0,0,1,1,...,0,1,0,0,0,0,0,1,0,0
3,0,0,0,0,0,1,1,0,0,0,...,0,0,0,0,0,0,1,1,0,0
4,0,0,0,1,0,0,0,1,0,0,...,0,0,0,1,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
996,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
997,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
998,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


# Using TF-IDF
CountVectorizer simply counts the number of times a word appears in a document (using a bag-of-words approach), while TF-IDF Vectorizer takes into account not only how many times a word appears in a document but also how important that word is to the whole corpus.

In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer
fidf = TfidfVectorizer()
fidf.fit(df.loc[:5,'Description'].values)
fidf.transform(df.loc[:5,'Description'].values).todense()

matrix([[0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.17765115,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.25660546, 0.        ,
         0.25660546, 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.25660546, 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.25660546, 0.        , 0.        , 0.21042015, 0.        ,
         0.        , 0.        , 0.        , 0.25660546, 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.25660546, 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0

In [22]:
tfidf_df = pd.DataFrame(fidf.transform(df['Description'].values).todense(), columns = sorted(fidf.vocabulary_))
tfidf_df

Unnamed: 0,23,24th,against,agency,alone,animals,anticipates,apocalypse,apparent,are,...,together,try,universe,villains,wall,warrior,will,with,work,world
0,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.177651,...,0.256605,0.000000,0.256605,0.000000,0.0,0.256605,0.000000,0.000000,0.256605,0.000000
1,0.000000,0.000000,0.0,0.000000,0.230476,0.000000,0.000000,0.000000,0.000000,0.159562,...,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000
2,0.220967,0.220967,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.220967,0.152978,...,0.000000,0.220967,0.000000,0.000000,0.0,0.000000,0.000000,0.181196,0.000000,0.000000
3,0.000000,0.000000,0.0,0.000000,0.000000,0.175828,0.175828,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.175828,0.144181,0.000000,0.000000
4,0.000000,0.000000,0.0,0.211999,0.000000,0.000000,0.000000,0.211999,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.211999,0.0,0.000000,0.000000,0.000000,0.000000,0.211999
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.313937,0.000000,0.000000
996,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.497202,...,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000
997,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000
998,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.340820,0.000000,0.000000


## Please note that we haven't done stemming or lemmatization to reduce the features here!

### You can consider the following steps to make your structured dataset:

> Vectorize

> Remove the stopwords

> You can do Lemmatization or Stemming to reduce the features further

> You can also consider dropping the most frequent features or drop the ones which are very rare)