## Data by Sentence

In [None]:
import nltk 
nltk.download('punkt')
import re
import numpy as np

# Toy corpus: three one-sentence movie reviews, one per line.
# The "##" in the last review is non-alphanumeric noise.
text = "\n".join(
    [
        "This movie is very scary and long.",
        "This movie is not scary and is slow.",
        "This movie is spooky ## and good.",
    ]
)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# split the text into sentences so that each one can be handled separately
data_by_sent = nltk.sent_tokenize(text)

In [None]:
data_by_sent

['This movie is very scary and long.',
 'This movie is not scary and is slow.',
 'This movie is spooky ## and good.']

## Cleaning

In [None]:
def cleaner(text:str)->str:
  """Normalise a sentence to lowercase words separated by single spaces.

  Every run of non-alphanumeric characters (punctuation, symbols, underscores
  and any whitespace) is collapsed into one space, spaces at both ends are
  dropped, and the result is lower-cased.

  Args:
    text: Raw sentence to clean.

  Returns:
    The cleaned sentence; an empty string if ``text`` contains no
    alphanumeric characters.
  """
  # `\W` on its own keeps "_" (the regex engine treats it as a word
  # character), so it is listed explicitly to drop everything that is not a
  # letter or a digit. Matching whole runs (`+`) also collapses repeated
  # separators into a single space in one pass.
  clean_data = re.sub(r"[\W_]+", " ", text)
  # Trim both ends: a sentence that starts or ends with punctuation would
  # otherwise keep a stray space there.
  clean_data = clean_data.strip()
  return clean_data.lower()

In [None]:
# run every sentence through the cleaner, keeping the sentence order
data_by_sent = list(map(cleaner, data_by_sent))

In [None]:
data_by_sent

['this movie is very scary and long',
 'this movie is not scary and is slow',
 'this movie is spooky and good']

## Create a Vocab

In [None]:
# to build the vocab, we need word tokens
# word tokenize
def make_word_tokens(text:str)->list:
  """Split a sentence into word tokens with NLTK.

  Args:
    text: Sentence to tokenize.

  Returns:
    The list of tokens produced by ``nltk.word_tokenize`` (a list of
    strings, not a single string).
  """
  return nltk.word_tokenize(text)

In [None]:
# tokenize each cleaned sentence into its own list of words
data_by_words = list(map(make_word_tokens, data_by_sent))

In [None]:
data_by_words

[['this', 'movie', 'is', 'very', 'scary', 'and', 'long'],
 ['this', 'movie', 'is', 'not', 'scary', 'and', 'is', 'slow'],
 ['this', 'movie', 'is', 'spooky', 'and', 'good']]

In [None]:
# the vocab needs one flat list of words rather than one list per sentence;
# chain.from_iterable walks the per-sentence lists one after another
import itertools
data_by_words = list(itertools.chain.from_iterable(data_by_words))

In [None]:
data_by_words

['this',
 'movie',
 'is',
 'very',
 'scary',
 'and',
 'long',
 'this',
 'movie',
 'is',
 'not',
 'scary',
 'and',
 'is',
 'slow',
 'this',
 'movie',
 'is',
 'spooky',
 'and',
 'good']

In [None]:
# build the vocabulary: the unique words of the whole text
# NOTE: a set has no defined order, so the order of `vocab` is arbitrary
vocab = [*set(data_by_words)]
print(f"there are {len(vocab)} unique words in the text")

there are 11 unique words in the text


In [None]:
vocab

['very',
 'this',
 'long',
 'is',
 'scary',
 'and',
 'not',
 'slow',
 'good',
 'movie',
 'spooky']

## Generate count of words

In [None]:
# count how many times each word occurs in the flat word list
from collections import Counter
word_counts = Counter(data_by_words)

In [None]:
word_counts

Counter({'and': 3,
         'good': 1,
         'is': 4,
         'long': 1,
         'movie': 3,
         'not': 1,
         'scary': 2,
         'slow': 1,
         'spooky': 1,
         'this': 3,
         'very': 1})

## Create empty matrix

In [None]:
import numpy as np
# one row per review and one column per vocabulary word, all starting at 0
reviews, columns = len(data_by_sent), len(vocab)
matrix = np.zeros((reviews, columns))

In [None]:
matrix

array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])

## Apply One Hot Encoding

In [None]:
import pdb
# Fill the matrix row by row: 1 where the vocabulary word occurs in the
# review, 0 otherwise.
for review, sentence in enumerate(data_by_sent):
  # Compare against the review's word tokens, not the raw string:
  # `word in some_string` is a substring test, so e.g. "is" would also match
  # inside "this". Using the same tokenizer as the vocab keeps both in sync.
  sentence_tokens = set(make_word_tokens(sentence))
  matrix[review] = [1 if word in sentence_tokens else 0 for word in vocab]

In [None]:
# for visual representation, convert to pandas 
import pandas as pd
data_pd = pd.DataFrame(matrix, columns=vocab)

In [None]:
data_pd

Unnamed: 0,very,this,long,is,scary,and,not,slow,good,movie,spooky
0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0
1,0.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0
2,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0


## Get Most Frequent Words

In [None]:
# get the three most frequent words
word_counts.most_common(3)

[('is', 4), ('this', 3), ('movie', 3)]

## TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# small example corpus: six short review sentences, one document each
corpus = [
    'This is a good movie.',
    'It is a good movie, but you know good is relative.',
    'Movie is fun to watch.',
    'I had a good relaxing time.',
    'The whole cinema experience was good.',
    'This is a good cinema.' ,
]


In [None]:
# init the TF-IDF vectorizer, dropping English stop words
vectorizer = TfidfVectorizer(stop_words='english')
# fit on the corpus and transform it into a TF-IDF matrix
X = vectorizer.fit_transform(corpus)

In [None]:
# sneak peek at the features -> vocab
vectorizer.get_feature_names_out()

array(['cinema', 'experience', 'fun', 'good', 'know', 'movie', 'relative',
       'relaxing', 'time', 'watch'], dtype=object)

In [None]:
# show the TF-IDF matrix as a pandas data frame:
# one row per document, one column per vocabulary term
import pandas as pd
pd.DataFrame(X.toarray(),columns=vectorizer.get_feature_names_out())

Unnamed: 0,cinema,experience,fun,good,know,movie,relative,relaxing,time,watch
0,0.0,0.0,0.0,0.594855,0.0,0.803833,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.545429,0.532306,0.368522,0.532306,0.0,0.0,0.0
2,0.0,0.0,0.635091,0.0,0.0,0.439681,0.0,0.0,0.0,0.635091
3,0.0,0.0,0.0,0.340608,0.0,0.0,0.0,0.664826,0.664826,0.0
4,0.589511,0.718903,0.0,0.368313,0.0,0.0,0.0,0.0,0.0,0.0
5,0.848083,0.0,0.0,0.529863,0.0,0.0,0.0,0.0,0.0,0.0


###### What happens when we get out-of-vocabulary (OOV) words?

In [None]:
# a document whose words are all absent from the fitted vocabulary
test = ["this is the nth document and fahad hloo molly"]

In [None]:
# transform only (no refit): the vocabulary learned from `corpus` is reused
tX = vectorizer.transform(test)

In [None]:
# show the transformed test document as a pandas data frame,
# with one column per term of the fitted vocabulary
pd.DataFrame(tX.toarray(),columns=vectorizer.get_feature_names_out())

Unnamed: 0,cinema,experience,fun,good,know,movie,relative,relaxing,time,watch
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
vectorizer.idf_

array([1.84729786, 2.25276297, 2.25276297, 1.15415068, 2.25276297,
       1.55961579, 2.25276297, 2.25276297, 2.25276297, 2.25276297])