In [14]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords


In [26]:
movies_data = pd.read_csv('IMDBDataset.csv')[0:3000]
movies_data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [27]:
movies_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000 entries, 0 to 2999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     3000 non-null   object
 1   sentiment  3000 non-null   object
dtypes: object(2)
memory usage: 47.0+ KB


In [28]:
movies_data.describe()

Unnamed: 0,review,sentiment
count,3000,3000
unique,3000,2
top,One of the other reviewers has mentioned that ...,positive
freq,1,1508


In [29]:
movies_data['review'].nunique()

3000

In [30]:
movies_data["review"].value_counts()

review
One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO. Trust me, this is not a show for the faint hearted or timid. This show pulls no punches with regards to drugs, sex or violence. Its is hardcore, in the classic use of the word.<br /><br />It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary. It focuses mainly on Emerald City, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. Em City is home to many..Aryans, Muslims, gangstas, Latinos, Christians, Italians, Irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away.<br /><br />I would say the main appeal of the show is due to 

In [31]:
movies_data.drop_duplicates(subset=['review'],inplace=True)

In [32]:
# remove HTML tags from the string
import re

def remove_html_tags(text):
    clean = re.sub(r'<[^>]+>', '', text)
    return clean

movies_data['review']= movies_data['review'].apply(lambda msg: remove_html_tags(msg))


In [33]:

movies_data['sentiment'] = movies_data['sentiment'].map({'positive': 1, 'negative': 0})
movies_data.head(4)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. The filming tec...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0


In [34]:
movies_data['Tokenized'] = [[word for word in document.lower().split() if word not in stopwords.words('english')]
         for document in movies_data['review']]

In [None]:
X=movies_data['Tokenized']
y= movies_data['sentiment']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42)



In [39]:
X_train

2761    [film, actually, works, fairly, original, idea...
123     [ah, yes, 1980s, ,, time, reaganomics, sly, ,,...
1808    [group, young, filmmakers, virtually, budget, ...
2286    [jay, craven's, criminally, ignored, film, sob...
2147    [masayuki, suo,, directed, fine, film,, role.,...
                              ...                        
1638    [film's, kind, like, conan, barabarian,, sex,,...
1095    [immoral, reprehensible, piece, garbage,, doub...
1130    [chance, view, previous, film,, i've, read, po...
1294    [movie, masterpiece, human, emotions, experien...
860     [production, quite, surprise, me., absolutely,...
Name: Tokenized, Length: 2010, dtype: object

In [12]:
from gensim.models import Word2Vec

# Train Word2Vec on X_train only
word2vec_model = Word2Vec(X_train, vector_size=200, window=5, min_count=1, workers=4)

word2vec_model.build_vocab(X_train, update=True)
word2vec_model.train(X_train, total_examples=word2vec_model.corpus_count, epochs=word2vec_model.epochs)

(20070521, 20508405)

In [13]:

def get_average_word_vector(words, model):
    vectors = [model.wv[word] for word in words if word in model.wv]
    if len(vectors) > 0:
        return np.mean(vectors, axis=0)
    else:
        return np.zeros(model.vector_size)

In [14]:

# Transform X_train and X_test into feature vectors
X_train_vectors = np.array([get_average_word_vector(words, word2vec_model) for words in X_train])
X_test_vectors = np.array([get_average_word_vector(words, word2vec_model) for words in X_test])

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Train the Random Forest Classifier
classifier = RandomForestClassifier(n_estimators=100, random_state=42)
classifier.fit(X_train_vectors, y_train)
y_pre = classifier.predict(X_test_vectors)


# Predict on the test set

In [23]:
print(classification_report(y_test,y_pre))

              precision    recall  f1-score   support

           0       0.83      0.79      0.81      8182
           1       0.80      0.84      0.82      8181

    accuracy                           0.82     16363
   macro avg       0.82      0.82      0.82     16363
weighted avg       0.82      0.82      0.82     16363



In [41]:
# how to used the word2vec model 

from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
sentence = "Word embedding is an interesting concept in natural language processing."
# Tokenize the sentence
tokenized_sentence = word_tokenize(sentence.lower())
print(tokenized_sentence)
# Create Word2Vec model
word2vec_model = Word2Vec([tokenized_sentence], vector_size=50, window=5, min_count=1, workers=4)
# Get the word vectors
word_vectors = word2vec_model.wv
print("Word Vectors:")
print(word_vectors['embedding'])
# Similarity between words
similarity = word_vectors.similarity('word', 'embedding')
print(f"Similarity between 'word' and 'embedding': {similarity}")

['word', 'embedding', 'is', 'an', 'interesting', 'concept', 'in', 'natural', 'language', 'processing', '.']
Word Vectors:
[ 0.00805032  0.00869828  0.01991465 -0.00894801 -0.00277418 -0.01463869
 -0.01939531 -0.01816622 -0.00204768 -0.01300286  0.00969965 -0.01232923
  0.00503666  0.00147484 -0.00678588 -0.00195731  0.01995995  0.0182968
 -0.00892614  0.01816928 -0.01128177  0.01186771 -0.00619591  0.00685921
  0.00603751  0.01380274 -0.00474421  0.01755155  0.01517907 -0.01910217
 -0.01601864 -0.01527592  0.00585011 -0.0055862  -0.01386368 -0.01625729
  0.01662351  0.00397879 -0.01865978 -0.00958276  0.00627506 -0.00942734
  0.01056474 -0.00846769  0.00528943 -0.01609037  0.01241625  0.00963498
  0.00157505  0.00603105]
Similarity between 'word' and 'embedding': -0.1122526004910469
