In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px

# Read the IMDB review dataset from a path relative to the working directory.
dataset_path = 'Data/IMDB Dataset.csv'
imdb_review_df = pd.read_csv(dataset_path)
# Last expression of the cell: preview the first five rows.
imdb_review_df.head(n=5)

In [None]:
# Tally the reviews per sentiment label once and reuse the result below.
sentiment_counts = imdb_review_df['sentiment'].value_counts()
total_reviews = len(imdb_review_df)

# Absolute number of reviews for each label
print('Number of positive and negative reviews: ', sentiment_counts)
# Share of the whole dataframe taken by each label
print('Proportion of positive and negative reviews: ', sentiment_counts / total_reviews)

In [None]:
# Character count of every review, computed with the vectorised string accessor.
length_reviews = imdb_review_df['review'].str.len()

# How many reviews are there, and how long is the longest one?
print(f"There are {len(length_reviews)} reviews in the dataframe.")
# Series.max() skips missing values; the builtin max() gives an
# order-dependent result as soon as a NaN is present in the Series.
print(f"The longest review is {length_reviews.max()} characters long.")

In [None]:
from textblob import TextBlob

def get_sentiment(review):
    """Return the TextBlob sentiment of a single review.

    Args:
        review: Text of the review to analyse.

    Returns:
        The ``sentiment`` attribute of ``TextBlob(review)``.
    """
    return TextBlob(review).sentiment

# Score every review; each element holds whatever get_sentiment returns for it.
review_column = imdb_review_df['review']
imdb_review_polarity = review_column.apply(get_sentiment)
# Last expression of the cell: preview the first five results.
imdb_review_polarity.head(n=5)

In [None]:
# Use positional access (.iloc) for all three lookups so that "first",
# "second" and "last" refer to row position regardless of the index labels.
print(f"Sentiment of first review: {imdb_review_polarity.iloc[0]}")
print(f"Sentiment of second review: {imdb_review_polarity.iloc[1]}")
print(f"Sentiment of last review: {imdb_review_polarity.iloc[-1]}")

In [None]:
# Pick the review with the most characters (first one wins on ties).
longest_review = max(imdb_review_df['review'], key=len)
longest_review_blob = TextBlob(longest_review)
longest_review_sentiment = longest_review_blob.sentiment

# Show the full text, a blank gap, then its sentiment.
print(longest_review)
print('\n')
print(f"Sentiment of the longest film review: {longest_review_sentiment}.")

In [None]:
from wordcloud import WordCloud
import nltk
from nltk.corpus import stopwords

# Words to leave out of the cloud: NLTK's English stop-word list.
stopword_set = set(stopwords.words('english'))

# Build a white-background word cloud from the longest review's text.
cloud_builder = WordCloud(background_color='white', stopwords=stopword_set)
longest_review_wc = cloud_builder.generate(longest_review)

# Render the cloud as an image with the axes hidden.
plt.imshow(longest_review_wc, interpolation='bilinear')
plt.axis('off')
plt.show()

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words on a single document: fit a default CountVectorizer on the
# first review only, then encode that same review with it.
vectorizer = CountVectorizer()
first_review_docs = [imdb_review_df['review'][0]]

first_review_bow = vectorizer.fit(first_review_docs).transform(first_review_docs)
print(first_review_bow.toarray())

In [None]:
# Bag-of-words over the whole corpus, using unigrams and bigrams.
# NOTE(review): integer max_df/min_df are absolute document counts in
# scikit-learn, so terms found in more than 500 reviews are dropped -
# confirm that this is intended rather than a proportion.
reviews_vect = CountVectorizer(
    stop_words='english',
    token_pattern=r'\b[^\d\W][^\d\W]+\b',
    ngram_range=(1, 2),
    min_df=25,
    max_df=500,
    max_features=1000,
)

# Learn the vocabulary from all reviews, then encode those same reviews.
review_corpus = imdb_review_df['review']
X_reviews = reviews_vect.fit(review_corpus).transform(review_corpus)

# Dense frame with one column per retained feature.
X_count_df = pd.DataFrame(
    data=X_reviews.toarray(),
    columns=reviews_vect.get_feature_names_out(),
)
print(X_count_df.head())

In [None]:
# Report how many features the fitted vectorizer kept. The label promises a
# length, so print the count rather than the full array of feature names.
print('Length of vectorizer: ', len(reviews_vect.get_feature_names_out()))

In [None]:
from nltk import word_tokenize

# Tokenize the first review with NLTK and show the resulting tokens.
first_review_tokens = word_tokenize(imdb_review_df['review'][0])
print(first_review_tokens)

In [None]:
# Tokenize every review; the result is one token list per review.
reviews_tokens = list(map(word_tokenize, imdb_review_df['review']))
# Spot-check the tokens of the second review.
print(reviews_tokens[1])

In [None]:
# Keep only purely alphabetic tokens, preserving the per-review grouping.
cleaned_tokens = [list(filter(str.isalpha, review)) for review in reviews_tokens]
# Spot-check the cleaned tokens of the second review.
print(cleaned_tokens[1])

In [None]:
# Disabled code: both triple-quoted strings in this cell are bare string
# expressions, so nothing inside them is executed.
# NOTE(review): the first snippet passes `all_reviews_text` to the word cloud,
# but that name is only assigned inside the second snippet - the order would
# need to be swapped before re-enabling either of them.
"""
english_stopwords = set(stopwords.words('english'))

reviews_wc = WordCloud(background_color='white', stopwords=english_stopwords).generate(all_reviews_text)
plt.imshow(reviews_wc, interpolation='bilinear')
plt.axis('off')
plt.show()"""

# Disabled code: joins every review into one string, tokenizes it and prints
# the full token list.
"""# Create a concatenated string of all reviews
all_reviews_text = ' '.join(imdb_review_df['review'])

# Tokenize the concatenated text
tokenized_reviews = word_tokenize(all_reviews_text)

# Print the tokenized reviews
print(tokenized_reviews)"""

In [None]:
from nltk.stem import WordNetLemmatizer

# One shared lemmatizer instance for the whole corpus.
WNLemmatizer = WordNetLemmatizer()

# Lemmatize every cleaned token. The result is a single flat list covering
# all reviews, so per-review boundaries are not kept.
lem_tokens = [
    lemma
    for review_tokens in cleaned_tokens
    for lemma in map(WNLemmatizer.lemmatize, review_tokens)
]

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF counterpart of the bag-of-words model: same vocabulary limits,
# stop words and token pattern, over unigrams and bigrams.
tfidf_model = TfidfVectorizer(
    stop_words='english',
    token_pattern=r'\b[^\d\W][^\d\W]+\b',
    ngram_range=(1, 2),
    min_df=25,
    max_df=500,
    max_features=1000,
)

# Fit on all reviews and encode them in a single step.
reviews_tfidf = tfidf_model.fit_transform(imdb_review_df['review'])

# Dense frame with one column per retained feature.
tfidf_feature_names = tfidf_model.get_feature_names_out()
X_tfidf = pd.DataFrame(data=reviews_tfidf.toarray(), columns=tfidf_feature_names)
print(X_tfidf.head())

In [None]:
# Side-by-side look at the first five rows of each representation.
bow_preview = X_count_df.head(5)
tfidf_preview = X_tfidf.head(5)

print('Top 5 rows using BOW: \n', bow_preview)
print('Top 5 rows using TF-IDF: \n', tfidf_preview)