# Natural Language Processing: Marvel's Rotten Tomatoes Reviews

## Importing Libraries

In [1]:
# web scraping
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
PATH = 'C:\Program Files (x86)\chromedriver.exe'

# data manipulation
import pandas as pd
import numpy as np

# strings processing
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# machine learning
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score
from statistics import mean
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# ignore warnings
import warnings
warnings.filterwarnings("ignore")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\artur\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Web Scraping with BeautifulSoup and Selenium

In [2]:
def reviews_on_page(url, reviews, scores):
    pageTree = requests.get(url, headers=headers)
    pageSoup = BeautifulSoup(pageTree.content, 'html.parser')
    all_reviews = pageSoup.find_all('div', class_='row review_table_row')
    for review in all_reviews:
        review_text = review.find('div', class_='the_review').text
        review_text = review_text.strip()
        reviews.append(review_text)
        score = review.find('div', class_='col-xs-16 review_container').contents[1]['class'][3]
        scores.append(score)

In [3]:
def reviews_on_page_html(html, movie, movies, reviews, scores):
    pageSoup = BeautifulSoup(html, 'html.parser')
    all_reviews = pageSoup.find_all('div', class_='row review_table_row')
    for review in all_reviews:
        movies.append(movie)
        review_text = review.find('div', class_='the_review').text
        review_text = review_text.strip()
        reviews.append(review_text)
        score = review.find('div', class_='col-xs-16 review_container').contents[1]['class'][3]
        scores.append(score)

In [4]:
def all_movie_reviews(html, movie, movies, reviews, scores):
    ser = Service(PATH)
    op = webdriver.ChromeOptions()
    driver = webdriver.Chrome(service=ser, options=op)
    cont = True
    driver.get(html)
    while cont:
        try:
            WebDriverWait(driver,2).until(EC.element_to_be_clickable((By.XPATH, '//*[@id="content"]/div/div/div/nav[2]/button[2]'))).click()
            time.sleep(2)
            html = driver.page_source
            reviews_on_page_html(html, movie, movies, reviews, scores)
        except:
            cont = False

In [5]:
def get_reviews():
   reviews = []
   scores = []
   movies = []
   all_movies_link = 'https://editorial.rottentomatoes.com/guide/marvel-movies-in-order/'
   headers = {'User-Agent': 
           'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36'}
   pageTree = requests.get(all_movies_link, headers=headers)
   pageSoup = BeautifulSoup(pageTree.content, 'html.parser')
   all_movies = pageSoup.find_all('h2')
   all_movies = all_movies[0:26]
    
   for item in all_movies:
      movie_link = item.find('a')['href']
      movie = item.find('a').text
      reviews_link = movie_link + '/reviews'
      all_movie_reviews(reviews_link, movie, movies, reviews, scores)

   return reviews, scores, movies

In [6]:
#reviews, scores, movies = get_reviews()
#dataframe = pd.DataFrame({'movie': movies, 'review': reviews, 'score': scores})
#dataframe.to_csv('reviews.csv', index=False)

In [7]:
dataframe = pd.read_csv('reviews.csv')

## Exploratory Data Analysis

In [8]:
dataframe.head()

Unnamed: 0,movie,review,score
0,Captain America: The First Avenger,It's the big hit Marvel needed to be and a str...,fresh
1,Captain America: The First Avenger,Captain America is a truly humble story that f...,fresh
2,Captain America: The First Avenger,Director Joe Johnston tries to inject some pul...,rotten
3,Captain America: The First Avenger,Though it's never much of a question who'll sa...,fresh
4,Captain America: The First Avenger,Captain America's a superhero due to a super s...,fresh


In [9]:
dataframe.tail()

Unnamed: 0,movie,review,score
9355,Eternals,Eternals is beautifully shot and terrifically ...,fresh
9356,Eternals,Director Chlo Zhao's entry into the superhero ...,fresh
9357,Eternals,While Eternals has most of the benchmarks of a...,fresh
9358,Eternals,Unlike anything ever seen in the MCU before. T...,fresh
9359,Eternals,"Setting aside whatever faults I found, Eternal...",fresh


In [10]:
dataframe.groupby('movie').count()

Unnamed: 0_level_0,review,score
movie,Unnamed: 1_level_1,Unnamed: 2_level_1
Ant-Man,316,316
Ant-Man and The Wasp,419,419
Avengers: Age of Ultron,355,355
Avengers: Endgame,527,527
Avengers: Infinity War,465,465
Black Panther,505,505
Black Widow,417,417
Captain America: Civil War,406,406
Captain America: The First Avenger,254,254
Captain America: The Winter Soldier,286,286


In [11]:
dataframe = dataframe.replace({'score': {'fresh': 1, 'rotten': 0}})
dataframe.head()

Unnamed: 0,movie,review,score
0,Captain America: The First Avenger,It's the big hit Marvel needed to be and a str...,1
1,Captain America: The First Avenger,Captain America is a truly humble story that f...,1
2,Captain America: The First Avenger,Director Joe Johnston tries to inject some pul...,0
3,Captain America: The First Avenger,Though it's never much of a question who'll sa...,1
4,Captain America: The First Avenger,Captain America's a superhero due to a super s...,1


In [12]:
dataframe.groupby('movie').sum('scores').sort_values(by='score', ascending=False)

Unnamed: 0_level_0,score
movie,Unnamed: 1_level_1
Avengers: Endgame,492
Black Panther,487
Captain Marvel,411
Avengers: Infinity War,394
Spider-Man: Far From Home,389
Thor: Ragnarok,388
Captain America: Civil War,370
Ant-Man and The Wasp,368
Spider-Man: Homecoming,345
Guardians of the Galaxy Vol. 2,340


In [13]:
dataframe.groupby('movie').mean('scores').sort_values(by='score', ascending=False)

Unnamed: 0_level_0,score
movie,Unnamed: 1_level_1
Black Panther,0.964356
Iron Man,0.934866
Avengers: Endgame,0.933586
Thor: Ragnarok,0.930456
Spider-Man: Homecoming,0.92246
Marvel's the Avengers,0.915205
Shang-Chi and the Legend of the Ten Rings,0.912752
Captain America: Civil War,0.91133
Guardians of the Galaxy,0.910828
Spider-Man: Far From Home,0.902552


In [14]:
dataframe.groupby('score').count().drop('movie', axis=1)

Unnamed: 0_level_0,review
score,Unnamed: 1_level_1
0,1472
1,7888


In [15]:
sampler = RandomUnderSampler(sampling_strategy='majority')
X, y = sampler.fit_resample(dataframe.drop('score', axis=1).drop('movie', axis=1), dataframe['score'])
print(X.shape)
print(y.shape)

(2944, 1)
(2944,)


In [16]:
y.value_counts()

0    1472
1    1472
Name: score, dtype: int64

## Data Cleanup

In [17]:
for review in X['review']:
    if review.find('[') != -1:
        print(review)

[A] so-so experience.
[The hero's] transformation from commonplace but with personality to bulky but generally nondescript echoes the movie's own shift.
[A] hokey, hacky, two-hour-plus exercise in franchise transition/price gouging, complete with utterly unnecessary post-converted 3-D.
Sometimes it stumbles upon the dullest components of a generic formula that is at the service of nonsense. [Full review in Spanish]
A very expository script, that makes use of flashbacks, with a poor representation of a period, telling a story that is not exactly memorable. [Full Review in Spanish]
The film stands only as one more step to Endgame and nothing more. [Full Review in Spanish]
Captain Marvel is an unremarkable, passable time killer...[Brie] Larson's performance is wooden...while the film's big action scenes are so lacking in imagination the screen often looks as though it's being continually doused in technicolour vomit.
A plain, lazy and insipid entry. [Full Review in Spanish]
It's the enter

In [18]:
for review in X['review']:
    last_bracket = review.rfind('[Full')
    if last_bracket != -1:
        review = review[:last_bracket-1]

In [19]:
corpus = []
all_stopwords = stopwords.words('english')
print(all_stopwords)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [20]:
not_stopwords = ['between', 'below', 'above', 'most', 'not', 'nor', 'too', 'very', 'don', "don't", 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't"]

for word in not_stopwords:
    try:
        all_stopwords.remove(word)
    except:
        pass
print(all_stopwords)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'into', 'through', 'during', 'before', 'after', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'other', 'some', 'such', 'no', 'only', 'own', 'same', 'so', 'than', 's', 't', 'can', 'will', 'just', 'should', "should'v

In [21]:
for review in X['review']:
    print(review)

Director Joe Johnston tries to inject some pulpy '40s-style fizz into the proceeding, but the film's lead footed pace never seems to lighten up.
The best bit in this year's umpteenth comic book blockbuster, Captain America: The First Avenger, is when Tommy Lee Jones eats some steak.
Captain America: The First Avenger feels like a clichéd blast from the past, lacking the spunk and grit that has made other comic book movies successful.
A lifeless and clichéd action film that never lives up to its title or its title characters' ideals.
It's difficult to dislike "Captain America" too much, because it's harmless and innocuous and, unlike almost every other tentpole summer movie, isn't secretly trying to give the audience a seizure.
Steve Rogers (Chris Evans) wants to go to war. It's 1942, and he's a scrawny kid from Brooklyn, too scrawny, according to army doctors, to join up.
It serves its chief end to introduce the character, and Evans fills the role nicely. But as a stand-alone adventure

In [22]:
for review in X['review']:
    review = re.sub('[^a-zA-Z]', ' ', review)
    review = review.lower()
    review = review.split()
    ps = PorterStemmer()
    review = [ps.stem(word) for word in review if not word in set(all_stopwords)]
    review = ' '.join(review)
    corpus.append(review)

In [23]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
X = cv.fit_transform(corpus).toarray()
y = y.values

In [24]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=0)

In [25]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
import warnings
warnings.filterwarnings("ignore")
from sklearn.model_selection import cross_val_score
from statistics import mean

lr = LogisticRegression(solver='liblinear')
knn = KNN()
rf = RandomForestClassifier()
xgb = XGBClassifier(eval_metric='mlogloss')
classifiers = [('Logistic Regression', lr), ('K Nearest Neighbours', knn), ('XGB Classifier', xgb),('Random Forest', rf)]

In [26]:
import warnings
warnings.filterwarnings("ignore")
from sklearn.model_selection import cross_val_score
from statistics import mean
for cls_name, clf in classifiers:
    pipe = Pipeline(steps=[('classifier', clf)])
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)
    print(f'{clf} ')
    print(mean(cross_val_score(clf, X_train, y_train, scoring='accuracy', cv=5)))

LogisticRegression(solver='liblinear') 
0.6974403193612775
KNeighborsClassifier() 
0.563940119760479
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              eval_metric='mlogloss', gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=16,
              num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None) 
0.6838451097804391
RandomForestClassifier() 


In [None]:
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
print(accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))