In [1]:
#import required libraries
import requests
from bs4 import BeautifulSoup
import nltk
from nltk.tokenize import word_tokenize 
import numpy as np
import re
import pandas as pd
import itertools

#### 1. Compile a list of static links (permalinks) to individual user movie reviews from one particular website. This will be your working dataset for this assignment, as well as for assignments 7 and 8.  

a.	It does not matter if you use a crawler or if you manually collect the links, but you will need at least 100 movie review links. Note that, as of this writing, the robots.txt file of IMDB.com allows the crawling of user reviews.

b.	Each link should be to a web page that has only one user review of only one movie, e.g., the user review permalinks on the IMDB site.

c.	Choose reviews of movies that are all in the same genre, e.g., sci-fi, mystery, romance, superhero, etc.  

d.	Make sure your collection includes reviews of several movies in your chosen genre and that it includes a mix of negative and positive reviews.

In [2]:
# IMDB website URL, used as the prefix for every relative link scraped below
base_url = "https://www.imdb.com"

# Search query: up to 100 feature films in the thriller genre that are rated at
# least 4.0 with at least 50,000 votes, sorted by user rating (descending).
# The URL is built from adjacent string literals so that no newline ends up in
# the query string (a triple-quoted literal split across lines embeds one).
url = ("https://www.imdb.com/search/title/?title_type=feature&user_rating=4.0,10.0"
       "&num_votes=50000,&genres=thriller&view=simple&sort=user_rating,desc&count=100")

# Fetch the search page and convert it to a BeautifulSoup object.
# The timeout keeps a stalled connection from hanging the notebook, and
# raise_for_status() stops here on an HTTP error instead of parsing an error page.
response = requests.get(url, timeout=30)
response.raise_for_status()
movies_soup = BeautifulSoup(response.text, 'html.parser')

# Candidate anchors: <a> tags that carry no class attribute
movie_tags = movies_soup.find_all('a', attrs={'class': None})

# Keep only the hrefs shaped like /title/<id>/ .
# tag.get() tolerates anchors without an href (tag.attrs['href'] would raise
# KeyError), and logical `and` replaces the bitwise `&` used on the two booleans.
movie_tags = [href for href in (tag.get('href', '') for tag in movie_tags)
              if href.startswith('/title') and href.endswith('/')]

# Remove duplicate links while preserving their first-seen (ranking) order
movie_tags = list(dict.fromkeys(movie_tags))

# Report how many title links were collected and show the first 5 items
# (message wording is kept as-is so it matches the recorded cell output)
print("There are a total of " + str(len(movie_tags)) + " movie user reviews")
print("Displaying first 5 user reviews links")
movie_tags[:5]

There are a total of 100 movie user reviews
Displaying first 5 user reviews links


['/title/tt0468569/',
 '/title/tt1375666/',
 '/title/tt6751668/',
 '/title/tt0114369/',
 '/title/tt0102926/']

In [3]:
# Turn each relative title path into the absolute URL of that title's reviews page
review_links = []
for title_path in movie_tags:
    review_links.append("".join((base_url, title_path, 'reviews')))

# Report the number of review pages and preview the first five URLs
review_page_count = str(len(review_links))
print("There are a total of " + review_page_count + " movie user reviews")
print("Displaying first 5 user reviews full url")
review_links[:5]

There are a total of 100 movie user reviews
Displaying first 5 user reviews full url


['https://www.imdb.com/title/tt0468569/reviews',
 'https://www.imdb.com/title/tt1375666/reviews',
 'https://www.imdb.com/title/tt6751668/reviews',
 'https://www.imdb.com/title/tt0114369/reviews',
 'https://www.imdb.com/title/tt0102926/reviews']

In [4]:
# Download and parse the reviews page of every movie into a list of soup objects
movie_soups = []
for link in review_links:
    # Without a timeout a single stalled connection blocks the whole run, and
    # without a status check an error page would be parsed as if it were a
    # reviews page and only fail later, far from the real cause.
    response = requests.get(link, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    movie_soups.append(soup)


In [5]:
# Build the list of review permalinks: for every movie, the lowest-rated review
# (treated as negative) followed by the highest-rated review (treated as positive).
movie_review_list = []
for movie_soup in movie_soups:
    # Pair every rating with the title link of the same review.
    # Indexing the list of all review titles by a position in the list of
    # ratings goes wrong as soon as one review on the page has no rating, so the
    # title is looked up relative to the rating element instead.
    # NOTE(review): assumes the rating precedes its review title in document
    # order on the page - confirm against the current IMDB markup.
    rated_reviews = []
    for scale_tag in movie_soup.find_all('span', attrs={'class': 'point-scale'}):
        title_tag = scale_tag.find_next('a', attrs={'class': 'title'})
        if title_tag is None or not title_tag.get('href'):
            continue
        try:
            # the element just before the "/10" scale span holds the numeric rating
            rating = int(scale_tag.previous_element)
        except (TypeError, ValueError):
            continue
        rated_reviews.append((rating, title_tag['href']))

    # A page without any rated review cannot yield a negative/positive pair;
    # skip it rather than crash on min()/max() of an empty sequence.
    if not rated_reviews:
        continue

    # min()/max() return the first extreme entry, i.e. the same review that
    # list.index(min(...)) / list.index(max(...)) would select.
    # If every rating on the page is equal, both picks are the same review.
    n_rating, n_href = min(rated_reviews, key=lambda pair: pair[0])
    p_rating, p_href = max(rated_reviews, key=lambda pair: pair[0])

    # Always append in (negative, positive) order
    movie_review_list.append(base_url + n_href)
    movie_review_list.append(base_url + p_href)

movie_review_list[:5]

['https://www.imdb.com/review/rw1945777/',
 'https://www.imdb.com/review/rw5443093/',
 'https://www.imdb.com/review/rw2365579/',
 'https://www.imdb.com/review/rw2879376/',
 'https://www.imdb.com/review/rw5204791/']

#### 2.	Extract noun phrase (NP) chunks from your reviews using the following procedure:

a.	In Python, use BeautifulSoup to grab the main review text from each link.  

b.	Next run each review text through a tokenizer, and then try to NP-chunk it with a shallow parser. 

c.	You probably will have too many unknown words, owing to proper names of characters, actors, and so on that are not in your working dictionary. Make sure the main names that are relevant to the movies in your collection of reviews are added to the working lexicon, and then run the NP chunker again.


In [6]:
# Download every review permalink and extract the main review text
review_texts = []
for review_url in movie_review_list:
    # Fetch the review page; bound the wait and stop on an HTTP error
    response = requests.get(review_url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    # The review body is the div with class "text show-more__control"
    tag = soup.find('div', attrs={'class': 'text show-more__control'})

    # Keep one entry per permalink even when the body is missing, so that this
    # list stays aligned with movie_review_list
    review_texts.append(tag.getText() if tag is not None else '')

review_texts[:5]

["The first film in the re-imagining of the series was a big hit, but this sequel was a global success, especially with the superb performance by the star of Brokeback Mountain who tragically died from a (prescribed) drugs overdose shortly after filming had finished, from director Christopher Nolan (Memento, Insomnia). Basically a criminal terrorist and mastermind calling himself the Joker (posthumous Oscar, BAFTA and Golden Globe winning Heath Ledger) robs the bank run by the mob, and to take on the Mafia district attorney Harvey Dent (Aaron Eckhart) becomes the new face for justice and hope in Gotham City, with the help of Batman aka Bruce Wayne (Christian Bale) and Lieutenant James 'Jim' Gordon (Gary Oldman). Mob bosses Sal Maroni (Eric Roberts), Gambol (Michael Jai White) and the Chechen (Ritchie Coaster), who have had Chinese accountant Lau (Chin Han) hide their funds, are confronted by the Joker because he wants to kill the, but they all refuse to help, putting a bounty on him. T

In [7]:
# Download every review permalink and extract the name of the reviewed movie
movie_titles = []
for review_url in movie_review_list:
    # Fetch the review page; bound the wait and stop on an HTTP error
    response = requests.get(review_url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    # The movie name is read from the second child node of the first <h1>
    tag = soup.find('h1')
    heading_children = list(tag.children) if tag is not None else []

    # Keep one entry per permalink even when the heading is missing or shorter
    # than expected, so that this list stays aligned with movie_review_list
    if len(heading_children) > 1:
        movie_titles.append(heading_children[1].getText())
    else:
        movie_titles.append('')

movie_titles[:5]

['The Dark Knight', 'The Dark Knight', 'Inception', 'Inception', 'Parasite']

In [8]:
# The permalinks were appended as (negative, positive) pairs, so the sentiment
# labels alternate in that same order, one pair per two links
pair_count = len(movie_review_list) // 2
review_sentiment = np.array([label
                             for _ in range(pair_count)
                             for label in ('negative', 'positive')])

In [9]:
# Assemble one row per review: movie name, permalink, review text and sentiment label
df = pd.DataFrame.from_dict({
    'movie': movie_titles,
    'user_review_permalink': movie_review_list,
    'user_review': review_texts,
    'sentiment': review_sentiment,
})

# Preview the first rows of the assembled frame
df.head()

Unnamed: 0,movie,user_review_permalink,user_review,sentiment
0,The Dark Knight,https://www.imdb.com/review/rw1945777/,The first film in the re-imagining of the seri...,negative
1,The Dark Knight,https://www.imdb.com/review/rw5443093/,After Batman Begins. Director Christopher Nola...,positive
2,Inception,https://www.imdb.com/review/rw2365579/,"As I type this, ""Inception"" is sitting at #6 o...",negative
3,Inception,https://www.imdb.com/review/rw2879376/,This is a world where people can go into your ...,positive
4,Parasite,https://www.imdb.com/review/rw5204791/,"I was able to see ""Parasite"" a few days ago at...",negative


In [10]:
# Initial NP grammar, three rules applied in order:
#   1. an optional determiner or possessive pronoun, any adjectives, then a noun
#   2. a run of proper nouns
#   3. two nouns following each other
# nltk.pos_tag emits Penn Treebank tags, where the possessive pronoun is PRP$;
# the PP$ tag belongs to the Brown tag set and would never match here.
# NOTE(review): rule 1 also matches a bare <NN>, so every NN is already chunked
# before rule 3 runs and rule 3 appears unable to fire; order kept as written.
grammar = r"""
     NP: {<DT|PRP\$>?<JJ>*<NN>}
         {<NNP>+}
         {<NN><NN>}
    """

# The parser depends only on the grammar, so build it once rather than once
# per sentence
cp = nltk.RegexpParser(grammar)

user_review_chunks = []
for user_review in review_texts:
    # one list of NP strings per sentence of the review
    user_review_ch = []
    for sent in nltk.sent_tokenize(user_review):
        # tokenize, POS-tag and chunk the sentence
        tagged_sent = nltk.pos_tag(nltk.word_tokenize(sent))
        tree = cp.parse(tagged_sent)

        # keep only the NP subtrees, each joined back into a plain phrase
        nps = [' '.join(word for word, tag in subtree.leaves())
               for subtree in tree.subtrees()
               if subtree.label() == 'NP']
        user_review_ch.append(nps)

    user_review_chunks.append(user_review_ch)

df['user_review_chunks'] = user_review_chunks
df.head()

Unnamed: 0,movie,user_review_permalink,user_review,sentiment,user_review_chunks
0,The Dark Knight,https://www.imdb.com/review/rw1945777/,The first film in the re-imagining of the seri...,negative,"[[The first film, the re-imagining, the series..."
1,The Dark Knight,https://www.imdb.com/review/rw5443093/,After Batman Begins. Director Christopher Nola...,positive,"[[Batman Begins], [Director Christopher Nolan,..."
2,Inception,https://www.imdb.com/review/rw2365579/,"As I type this, ""Inception"" is sitting at #6 o...",negative,"[[Inception, IMDb, a firm, voting, average], [..."
3,Inception,https://www.imdb.com/review/rw2879376/,This is a world where people can go into your ...,positive,"[[a world], [], [Dom Cobb, Leonardo DiCaprio],..."
4,Parasite,https://www.imdb.com/review/rw5204791/,"I was able to see ""Parasite"" a few days ago at...",negative,"[[Parasite, Philadelphia Film Festival, the fi..."


In [11]:
# Second, broader NP grammar, four rules applied in order:
#   1. determiner + wh-pronoun + optional verbs/adverbs + past participle
#      + preposition + noun
#   2. a noun, optional prepositions, then one or more nouns
#   3. optional adjectives, a noun, optional conjunctions, then one or more nouns
#   4. optional adjectives followed by one or more nouns
# "Noun" here covers singular, plural and proper nouns (NN, NNS, NNP, NNPS).
grammar = """
    NP:    {<DT><WP><VBP>*<RB>*<VBN><IN><NN>}
           {<NN|NNS|NNP|NNPS><IN>*<NN|NNS|NNP|NNPS>+}
           {<JJ>*<NN|NNS|NNP|NNPS><CC>*<NN|NNS|NNP|NNPS>+}
           {<JJ>*<NN|NNS|NNP|NNPS>+}
    """

# The parser depends only on the grammar, so build it once rather than once
# per sentence
cp = nltk.RegexpParser(grammar)

user_review_chunks2 = []
for user_review in review_texts:
    # one list of NP strings per sentence of the review
    user_review_ch = []
    for sent in nltk.sent_tokenize(user_review):
        # tokenize, POS-tag and chunk the sentence
        tagged_sent = nltk.pos_tag(nltk.word_tokenize(sent))
        tree = cp.parse(tagged_sent)

        # keep only the NP subtrees, each joined back into a plain phrase
        nps = [' '.join(word for word, tag in subtree.leaves())
               for subtree in tree.subtrees()
               if subtree.label() == 'NP']
        user_review_ch.append(nps)

    user_review_chunks2.append(user_review_ch)

df['user_review_chunks2'] = user_review_chunks2
df.head()

Unnamed: 0,movie,user_review_permalink,user_review,sentiment,user_review_chunks,user_review_chunks2
0,The Dark Knight,https://www.imdb.com/review/rw1945777/,The first film in the re-imagining of the seri...,negative,"[[The first film, the re-imagining, the series...","[[first film, re-imagining, series, big hit, s..."
1,The Dark Knight,https://www.imdb.com/review/rw5443093/,After Batman Begins. Director Christopher Nola...,positive,"[[Batman Begins], [Director Christopher Nolan,...","[[Batman Begins], [Director Christopher Nolan,..."
2,Inception,https://www.imdb.com/review/rw2365579/,"As I type this, ""Inception"" is sitting at #6 o...",negative,"[[Inception, IMDb, a firm, voting, average], [...","[[Inception, IMDb, firm, voting average], [], ..."
3,Inception,https://www.imdb.com/review/rw2879376/,This is a world where people can go into your ...,positive,"[[a world], [], [Dom Cobb, Leonardo DiCaprio],...","[[world, people, dreams], [dreams, actions], [..."
4,Parasite,https://www.imdb.com/review/rw5204791/,"I was able to see ""Parasite"" a few days ago at...",negative,"[[Parasite, Philadelphia Film Festival, the fi...","[[Parasite, few days, Philadelphia Film Festiv..."


#### 3.	Output all the chunks in a single list for each review, and submit that output for this assignment. Also submit a brief written summary of what you did (describe your selection of genre, your source of reviews, how many you collected, and by what means).

#### Summary

- Using IMDB as the source, 100 movies were pulled through a scraping mechanism in order to gather their user reviews.
- The movies were selected from the thriller genre.
- For each movie, one negative and one positive review were gathered based on the user-provided rating value, yielding 200 total reviews to analyze.
- Each review was then run through an NP chunker with shallow parsing. A regular-expression parser was used, and each review was run through it twice with grammars of differing complexity.