In [1]:
import pandas as pd
import numpy as np

In [2]:
train_data = pd.read_csv('/home/aljebra/GenerativeAI Projects/python-codes/NLP/vectorization/(bow)labeledTrainData.csv', header=0, delimiter="\t", quoting=3)

In [3]:
train_data.head()

Unnamed: 0,id,sentiment,review
0,"""5814_8""",1,"""With all this stuff going down at the moment ..."
1,"""2381_9""",1,"""\""The Classic War of the Worlds\"" by Timothy ..."
2,"""7759_3""",0,"""The film starts with a manager (Nicholas Bell..."
3,"""3630_4""",0,"""It must be assumed that those who praised thi..."
4,"""9495_8""",1,"""Superbly trashy and wondrously unpretentious ..."


### Data cleaning and preprocessing

In [4]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/aljebra/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [5]:
train_data.columns

Index(['id', 'sentiment', 'review'], dtype='object')

In [6]:
train_data.drop(columns=['id'], inplace=True)

In [7]:
train_data.head()

Unnamed: 0,sentiment,review
0,1,"""With all this stuff going down at the moment ..."
1,1,"""\""The Classic War of the Worlds\"" by Timothy ..."
2,0,"""The film starts with a manager (Nicholas Bell..."
3,0,"""It must be assumed that those who praised thi..."
4,1,"""Superbly trashy and wondrously unpretentious ..."


In [8]:
train_data.columns

Index(['sentiment', 'review'], dtype='object')

In [9]:
# Define the fraction of the dataset to use (e.g., 10%)
fraction = 0.10 

# Calculate the number of rows to use
num_rows = int(len(train_data) * fraction)

# Select a random sample of rows
train_data = train_data.sample(n=num_rows, random_state=42) 

In [10]:
porter_stemmer = PorterStemmer()
processed_word = []

for sentence in train_data.index:
    review = re.sub('[^a-zA-Z]', " ", train_data.loc[sentence, 'review'])
    review = review.lower()
    review = review.split()
    stemmed_word = [porter_stemmer.stem(word) for word in review if word not in set(stopwords.words('english'))]
    processed_word.append(" ".join(stemmed_word))
    


In [11]:
processed_word

['read girl soup came peter seller low period watch movi surpris almost noth happen movi seemingli presenc seller goldi hawn help movi whole movi seem like randomli film whatev happen without script anyth mayb seen everi movi middl age elderli peopl tri hippi one give movi pretti bad name br br seller hawn star much better movi wast time pretti worthless',
 'film pull get go grab attent acknowledg yeah stori open clich funer br br hand judi given materi done great reunion famou pick one pleas team armi platoon theatr group singer band br br movi never stoop cheap sentiment think go swoop anoth direct case point flower sent admir judi br br band member interest group ride clich one jail one found religion one alki one sunk dementia joie de vivr rediscov judi ignit granddaught interest carri us along make us overlook sometim simplist natur plot br br cast talent lesli caron incompar jazzist cleo lain amaz high note last perform joan sim brava joan cute button flirtati ian holm ball olymp

### Create the bag of word model

In [12]:
len(processed_word)

2500

In [22]:
#create countVectorizer object
vectorizer = CountVectorizer(max_features=5000)

In [25]:
#now get my features for training the model
X = vectorizer.fit_transform(processed_word).toarray()

In [28]:
#view the most frequent words in the vocabulary using your vectorizer
vectorizer.vocabulary_

{'read': np.int64(3562),
 'girl': np.int64(1864),
 'came': np.int64(604),
 'peter': np.int64(3264),
 'seller': np.int64(3895),
 'low': np.int64(2663),
 'period': np.int64(3253),
 'watch': np.int64(4840),
 'movi': np.int64(2924),
 'surpris': np.int64(4346),
 'almost': np.int64(118),
 'noth': np.int64(3051),
 'happen': np.int64(2008),
 'seemingli': np.int64(3887),
 'presenc': np.int64(3396),
 'help': np.int64(2064),
 'whole': np.int64(4888),
 'seem': np.int64(3886),
 'like': np.int64(2605),
 'randomli': np.int64(3539),
 'film': np.int64(1660),
 'whatev': np.int64(4876),
 'without': np.int64(4924),
 'script': np.int64(3863),
 'anyth': np.int64(183),
 'mayb': np.int64(2763),
 'seen': np.int64(3888),
 'everi': np.int64(1506),
 'middl': np.int64(2824),
 'age': np.int64(78),
 'elderli': np.int64(1397),
 'peopl': np.int64(3243),
 'tri': np.int64(4588),
 'hippi': np.int64(2096),
 'one': np.int64(3109),
 'give': np.int64(1866),
 'pretti': np.int64(3406),
 'bad': np.int64(305),
 'name': np.int64(

In [None]:
X.shape

In [None]:
X[:10]

### n_gram implementation

In [35]:
#create countVectorizer object
vectorizer = CountVectorizer(max_features=100, ngram_range=(2, 3))

In [36]:
X = vectorizer.fit_transform(processed_word)

In [37]:
#view the most frequent words in the vocabulary using your vectorizer
vectorizer.vocabulary_

{'watch movi': np.int64(93),
 'seem like': np.int64(83),
 'br br': np.int64(3),
 'much better': np.int64(66),
 'wast time': np.int64(91),
 'br movi': np.int64(16),
 'br br movi': np.int64(8),
 'film like': np.int64(30),
 'saw movi': np.int64(79),
 'realli bad': np.int64(77),
 'seen movi': np.int64(84),
 'year old': np.int64(99),
 'movi one': np.int64(62),
 'movi watch': np.int64(64),
 'realli good': np.int64(78),
 'want see': np.int64(90),
 'one thing': np.int64(71),
 'see movi': np.int64(82),
 'bad movi': np.int64(1),
 'end film': np.int64(21),
 'stori line': np.int64(86),
 'br film': np.int64(13),
 'br br film': np.int64(5),
 'one best': np.int64(70),
 'pretti good': np.int64(73),
 'go see': np.int64(35),
 'good movi': np.int64(37),
 'low budget': np.int64(48),
 'horror movi': np.int64(42),
 'movi like': np.int64(60),
 'first time': np.int64(34),
 'year ago': np.int64(97),
 'movi br': np.int64(54),
 'movi br br': np.int64(55),
 'br one': np.int64(17),
 'look like': np.int64(45),
 'br

In [38]:
X.shape

(2500, 100)