In [None]:
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
import numpy as np
import itertools
# show up to 500 characters per cell so long review texts stay readable
pd.set_option('display.max_colwidth', 500)

In [None]:
import requests
from bs4 import BeautifulSoup

def getSoup(url, timeout=30):
    """
    Utility function which takes a url and returns a Soup object.

    Parameters
    ----------
    url : str
        Address of the page to download.
    timeout : float or tuple, optional
        Seconds to wait for the server, passed straight to ``requests.get``.
        Without a timeout a stalled connection would block the notebook
        forever. Defaults to 30.

    Returns
    -------
    bs4.BeautifulSoup
        The response body parsed with the ``html.parser`` backend.

    Raises
    ------
    requests.exceptions.RequestException
        If the request fails, times out, or the server answers with an
        HTTP 4xx/5xx status.
    """
    response = requests.get(url, timeout=timeout)

    # fail here, with the status code in the message, instead of handing an
    # error page to the parsing code further down the notebook
    response.raise_for_status()

    return BeautifulSoup(response.text, 'html.parser')

def minMax(a):
    '''Returns the index of negative and positive review.

    ``a`` is a list of ratings. The result is a tuple
    ``(index of the smallest rating, index of the largest rating)``; when a
    value occurs more than once the first occurrence is reported. An empty
    list raises ``ValueError``.
    '''
    lowest_rating = min(a)
    highest_rating = max(a)

    return a.index(lowest_rating), a.index(highest_rating)

def getReviews(soup):
    '''Function returns a negative and positive review for each movie.

    Scans a movie's user-reviews page, picks the lowest-rated and the
    highest-rated review and returns their absolute links.

    Parameters
    ----------
    soup : bs4.BeautifulSoup
        Parsed user-reviews page of one movie.

    Returns
    -------
    tuple of (str, str)
        ``(negative_review_link, positive_review_link)``. Both links are the
        same when the lowest and highest rating belong to the same review
        (for example when only one rated review is present).

    Raises
    ------
    ValueError
        If the page contains no rated review, or a rating is not an integer.
    '''
    # Pair every rating with the link of the review it belongs to. Ratings and
    # titles must not be collected as two independent lists: a review without
    # a rating would shift every later index and the wrong review would be
    # returned.
    # NOTE(review): assumes the review's title link comes after its rating in
    # document order -- confirm against the page markup.
    rated_reviews = []
    for scale_tag in soup.find_all('span', attrs={'class': 'point-scale'}):
        title_tag = scale_tag.find_next('a', attrs={'class': 'title'})
        if title_tag is None:
            # a rating with no review link after it cannot be returned
            continue
        # the element parsed right before the "point-scale" span holds the score
        rated_reviews.append((int(scale_tag.previous_element), title_tag['href']))

    if not rated_reviews:
        raise ValueError("no rated user reviews found on the page")

    # find the index of negative and positive review
    n_index, p_index = minMax([rating for rating, _ in rated_reviews])

    # return the negative and positive review link
    n_review_link = "https://www.imdb.com" + rated_reviews[n_index][1]
    p_review_link = "https://www.imdb.com" + rated_reviews[p_index][1]

    return n_review_link, p_review_link

def getReviewText(review_url):
    '''Returns the user review text given the review url.

    Downloads the page at ``review_url`` and returns the text of the first
    ``div`` whose class is ``text show-more__control``. If the page has no
    such ``div`` the lookup yields ``None`` and the call fails with
    ``AttributeError``.
    '''
    review_div = getSoup(review_url).find(
        'div', attrs={'class': 'text show-more__control'}
    )

    return review_div.getText()

def getMovieTitle(review_url):
    '''Returns the movie title from the review url.

    Downloads the page at ``review_url`` and reads the title from the second
    child node of the first ``h1`` element.
    '''
    heading = getSoup(review_url).find('h1')

    # materialise the children so the second node can be picked by position
    heading_nodes = list(heading.children)

    return heading_nodes[1].getText()

def getNounChunks(user_review):
    '''Returns the noun chunks of a review as a list of strings.

    The text is processed by the module-level ``nlp`` callable.
    NOTE(review): ``nlp`` is not defined in the visible part of this notebook
    (presumably a loaded spaCy pipeline) -- it must exist before this function
    is called.
    '''
    # keep only the text of each chunk: span objects won't pickle, strings do
    return [chunk.text for chunk in nlp(user_review).noun_chunks]

In [None]:
# IMDb advanced-search query, as encoded in the URL parameters below:
#   - feature films only                 (title_type=feature)
#   - user rating of at least 4.0        (user_rating=4.0,)
#   - at least 10,000 votes              (num_votes=10000,)
#   - country "in" and language "hi"     (countries=in, languages=hi)
#   - sorted by user rating, descending  (sort=user_rating,desc)
#   - limited to 250 movies              (count=250)
url = (
    'https://www.imdb.com/search/title/'
    '?title_type=feature'
    '&user_rating=4.0,'
    '&num_votes=10000,'
    '&countries=in'
    '&languages=hi'
    '&sort=user_rating,desc'
    '&count=250'
)

# download and parse the search results page
movies_soup = getSoup(url)


In [None]:
# Take the href of every a-tag that has no class attribute and keep only the
# paths that look like a title page ("/title.../"). `tag.get` is used so an
# anchor without an href is skipped instead of raising KeyError, and the two
# string tests are combined with a logical `and` rather than a bitwise `&`.
movie_tags = [
    href
    for href in (
        tag.get('href', '')
        for tag in movies_soup.find_all('a', attrs={'class': None})
    )
    if href.startswith('/title') and href.endswith('/')
]

# remove duplicate links while keeping the order in which they appear
movie_tags = list(dict.fromkeys(movie_tags))

print("There are a total of " + str(len(movie_tags)) + " movie titles")
print("Displaying 10 titles")
movie_tags[:10]

There are a total of 250 movie titles
Displaying 10 titles


['/title/tt8110330/',
 '/title/tt0079221/',
 '/title/tt0400234/',
 '/title/tt0066763/',
 '/title/tt5074352/',
 '/title/tt1187043/',
 '/title/tt0986264/',
 '/title/tt0085743/',
 '/title/tt0072783/',
 '/title/tt8291224/']

In [None]:
# build the address of the user-reviews page for every title path
base_url = "https://www.imdb.com"
movie_links = ["".join((base_url, tag, 'reviews')) for tag in movie_tags]

print("There are a total of ", len(movie_links), " movie user reviews", sep="")
print("Displaying 10 user reviews links")
movie_links[:10]

There are a total of 250 movie user reviews
Displaying 10 user reviews links


['https://www.imdb.com/title/tt8110330/reviews',
 'https://www.imdb.com/title/tt0079221/reviews',
 'https://www.imdb.com/title/tt0400234/reviews',
 'https://www.imdb.com/title/tt0066763/reviews',
 'https://www.imdb.com/title/tt5074352/reviews',
 'https://www.imdb.com/title/tt1187043/reviews',
 'https://www.imdb.com/title/tt0986264/reviews',
 'https://www.imdb.com/title/tt0085743/reviews',
 'https://www.imdb.com/title/tt0072783/reviews',
 'https://www.imdb.com/title/tt8291224/reviews']

In [None]:
# download and parse the user-reviews page of every movie
movie_soups = list(map(getSoup, movie_links))

# every page yields a (negative, positive) pair of review links; flatten the
# pairs into one list ordered as [neg_0, pos_0, neg_1, pos_1, ...]
movie_review_list = list(
    itertools.chain.from_iterable(getReviews(page) for page in movie_soups)
)
print(len(movie_review_list))

print("There are a total of ", len(movie_review_list), " individual movie reviews", sep="")
print("Displaying 10 reviews")
movie_review_list[:10]

500
There are a total of 500 individual movie reviews
Displaying 10 reviews


['https://www.imdb.com/review/rw5930232/',
 'https://www.imdb.com/review/rw5930232/',
 'https://www.imdb.com/review/rw2492582/',
 'https://www.imdb.com/review/rw0167942/',
 'https://www.imdb.com/review/rw1618747/',
 'https://www.imdb.com/review/rw1079742/',
 'https://www.imdb.com/review/rw5742332/',
 'https://www.imdb.com/review/rw1021007/',
 'https://www.imdb.com/review/rw3604119/',
 'https://www.imdb.com/review/rw5504574/']

In [None]:
# download the text of every review (one request per link)
review_texts = list(map(getReviewText, movie_review_list))

# download the movie title shown on every review page (a second request per link)
movie_titles = list(map(getMovieTitle, movie_review_list))

# links come in (negative, positive) pairs, so the labels alternate the same way
review_sentiment = np.array(['negative', 'positive'] * (len(movie_review_list) // 2))

# assemble one row per review link
df = pd.DataFrame({
    'movie': movie_titles,
    'user_review_permalink': movie_review_list,
    'user_review': review_texts,
    'sentiment': review_sentiment,
})

In [None]:
# work on an independent (deep) copy so the scraped frame stays untouched
data = df.copy(deep=True)

In [None]:
# Preview the first rows of the copied review frame.
# NOTE(review): a movie's "negative" and "positive" rows can point at the same
# review when its lowest- and highest-rated review coincide.
data.head()

Unnamed: 0,movie,user_review_permalink,user_review,sentiment
0,Dil Bechara,https://www.imdb.com/review/rw5930232/,One of the finest acting I've ever seen.I suggest you to please watch this. I think this is far better than movies originated from nepotism. Shushant Sir has done great work and lived the moment in this film. Everything was perfect.,negative
1,Dil Bechara,https://www.imdb.com/review/rw5930232/,One of the finest acting I've ever seen.I suggest you to please watch this. I think this is far better than movies originated from nepotism. Shushant Sir has done great work and lived the moment in this film. Everything was perfect.,positive
2,Gol Maal,https://www.imdb.com/review/rw2492582/,"Hrishikesh Mukherjee directs another comedy movie involving simple middle-class people and presents it through their everyday struggle. The entire picture is simple, authentic and fun. The concept of a young guy who would go to any extent to not lose his job is very interesting, and Mukherjee handles it with great success. Gol Maal is a comedy of errors and it's wonderfully developed and narrated. It's not a complex story, but it's not a silly one either, and the simplicity works in its favo...",negative
3,Gol Maal,https://www.imdb.com/review/rw0167942/,This movie is one of the best movies that Hrishikesh Mukherjee gave. A must see for everybody. I don't have words to express how much I like this movie. And I am yet to meet a person who did not like it. Too good.,positive
4,Black Friday,https://www.imdb.com/review/rw1618747/,"I would give this film a 10 out of 10, but I don't think any film can be perfect. Anurag, you are a genius! This film is great. Not only is it a true story, but it has been shot in a very unique fashion...The flashbacks, etc make this film so unique. Also the red shots and the blue shots also give a feel of reality. The acting, cinematography, editing, and action was all top notch and made the film very realistic. This is not your everyday masala, popcorn flick but a very well made, thought ...",negative


In [None]:
# pull each column out of the frame as a plain Python list:
# p = movie, q = permalink, r = review text, s = sentiment label
p, q, r, s = (
    df[column].tolist()
    for column in ('movie', 'user_review_permalink', 'user_review', 'sentiment')
)

In [None]:
# Keep one row out of every consecutive pair of rows, alternating which one:
# the 1st, 3rd, 5th, ... pair contributes its first row, the 2nd, 4th, ...
# pair contributes its second row.
final_p, final_q, final_r, final_s = [], [], [], []
count = 0  # offset inside the current pair: 0 -> first row, 1 -> second row
for i in range(0, len(p), 2):
    j = i + count
    final_p.append(p[j])
    final_q.append(q[j])
    final_r.append(r[j])
    final_s.append(s[j])
    count = 1 - count  # flip the offset for the next pair

  

In [None]:
# Number of rows kept by the alternating selection.
len(final_p)

250

In [None]:
# map each column name to its list of kept values, then build the frame
data1 = dict(zip(
    ['movie', 'user_review_permalink', 'user_review', 'sentiment'],
    [final_p, final_q, final_r, final_s],
))
df2 = pd.DataFrame(data=data1)

In [None]:
# Preview the frame built from the kept rows (one row per pair of reviews).
df2.head()

Unnamed: 0,movie,user_review_permalink,user_review,sentiment
0,Dil Bechara,https://www.imdb.com/review/rw5930232/,One of the finest acting I've ever seen.I suggest you to please watch this. I think this is far better than movies originated from nepotism. Shushant Sir has done great work and lived the moment in this film. Everything was perfect.,negative
1,Gol Maal,https://www.imdb.com/review/rw0167942/,This movie is one of the best movies that Hrishikesh Mukherjee gave. A must see for everybody. I don't have words to express how much I like this movie. And I am yet to meet a person who did not like it. Too good.,positive
2,Black Friday,https://www.imdb.com/review/rw1618747/,"I would give this film a 10 out of 10, but I don't think any film can be perfect. Anurag, you are a genius! This film is great. Not only is it a true story, but it has been shot in a very unique fashion...The flashbacks, etc make this film so unique. Also the red shots and the blue shots also give a feel of reality. The acting, cinematography, editing, and action was all top notch and made the film very realistic. This is not your everyday masala, popcorn flick but a very well made, thought ...",negative
3,Anand,https://www.imdb.com/review/rw1021007/,"What an outstanding movie!! I have heard all the prior generation people rave about this movie, so, I decided to check this movie out myself. I only have faint memories of having watched parts of this movie from my mom's lap when she and dad were watching this in the theater. The other reason why I decided to check this out was a Super-bowl half-time debate on whether Amitabh was better than Rajesh Khanna. I could not participate in this debate for two reasons: first, I was eagerly awaiting ...",positive
4,我和我的冠軍女兒,https://www.imdb.com/review/rw3604119/,"Mahavir Singh Phogat is a former champion wrestler. He is married to Shobha Kaur. Phogat's dream is to win a gold medal for India in wrestling. He is unable to do this. He promises that his son will do what he couldn't do. However, although he loves them dearly, Phogat is disappointed when Shobha gives birth to four daughters. He does not believe girls can wrestle well. However, when two of his daughters, Geeta and Babita, come home after beating up two boys, Phogat realises how wrong he has...",negative


In [None]:
# training portion: the first 200 rows by position
df4 = df2.iloc[:200]

In [None]:
# Display the training portion (the first 200 rows of df2).
df4

Unnamed: 0,movie,user_review_permalink,user_review,sentiment
0,Dil Bechara,https://www.imdb.com/review/rw5930232/,One of the finest acting I've ever seen.I suggest you to please watch this. I think this is far better than movies originated from nepotism. Shushant Sir has done great work and lived the moment in this film. Everything was perfect.,negative
1,Gol Maal,https://www.imdb.com/review/rw0167942/,This movie is one of the best movies that Hrishikesh Mukherjee gave. A must see for everybody. I don't have words to express how much I like this movie. And I am yet to meet a person who did not like it. Too good.,positive
2,Black Friday,https://www.imdb.com/review/rw1618747/,"I would give this film a 10 out of 10, but I don't think any film can be perfect. Anurag, you are a genius! This film is great. Not only is it a true story, but it has been shot in a very unique fashion...The flashbacks, etc make this film so unique. Also the red shots and the blue shots also give a feel of reality. The acting, cinematography, editing, and action was all top notch and made the film very realistic. This is not your everyday masala, popcorn flick but a very well made, thought ...",negative
3,Anand,https://www.imdb.com/review/rw1021007/,"What an outstanding movie!! I have heard all the prior generation people rave about this movie, so, I decided to check this movie out myself. I only have faint memories of having watched parts of this movie from my mom's lap when she and dad were watching this in the theater. The other reason why I decided to check this out was a Super-bowl half-time debate on whether Amitabh was better than Rajesh Khanna. I could not participate in this debate for two reasons: first, I was eagerly awaiting ...",positive
4,我和我的冠軍女兒,https://www.imdb.com/review/rw3604119/,"Mahavir Singh Phogat is a former champion wrestler. He is married to Shobha Kaur. Phogat's dream is to win a gold medal for India in wrestling. He is unable to do this. He promises that his son will do what he couldn't do. However, although he loves them dearly, Phogat is disappointed when Shobha gives birth to four daughters. He does not believe girls can wrestle well. However, when two of his daughters, Geeta and Babita, come home after beating up two boys, Phogat realises how wrong he has...",negative
...,...,...,...,...
195,Fan,https://www.imdb.com/review/rw3452793/,"I'll try not to be emotional in this, although as a Shah Rukh Khan fan I felt waves of emotions while watching the movie and after it ended it was an emotional moment to see the King Khan back with a blinding, thundering, stupendous performance for which we (the fans) have waited for so long. The idea of the movie was interesting, and the script has several plot holes, some of them, pretty big. Character consistency of Aryan Khanna the star is dubious. But.... what gives?The overall executio...",positive
196,Aashiqui 2,https://www.imdb.com/review/rw2810142/,Very misleading for today's youth. Love can conquer everything. Love gives you hope and faith to live life not to lose your life. Very slow and dragging movie. The only good point about movie is song and music. Acting is also good. But the message story conveys it's not appealing. It's not a movie one can remember forever like KAL HO NA HO or DDLJ. the story of success and failure of stardom is very old. Giving up life for your loved one is not really a solution. Must avoid.. don't waste you...,negative
197,孟買日記,https://www.imdb.com/review/rw2503167/,"What prompted me to write this review is the kind of negativity this movie has received from reviewers on IMDb. I understand that every review written here is an honest opinion, but the number of 1 out of 10 scores I have come across from people who also had time to write a review amazes me. I have taken time to read most of the reviews and I see that they were all disappointed with the movie not featuring Aamir over other actors. I have one thing to say to all of them, THIS MOVIE IS NOT ABO...",positive
198,Karthik Calling Karthik,https://www.imdb.com/review/rw2261395/,"After 'Luck by chance' Farhan Akhtar is back on the big screen in 'Karthik calling Karthik'. With this role it gave him the opportunity to play a character that was more simple, vulnerable and emotional. The role has many shades and is not one sided. As in the same vein as 'Don' the movie belongs to the suspense thriller genre. Many people may have seen movies like 'Phone booth' and 'When a stranger calls', so this will be very familiar territory to those. The movie revolves around the telep...",negative


In [None]:
# held-out portion: rows 200 up to (not including) 250, by position
df5 = df2.iloc[200:250]

In [None]:
# true sentiment labels of the held-out rows
y_true = df5.loc[:, 'sentiment']

In [None]:
# Preview the held-out portion (rows 200-249 of df2).
df5.head()

Unnamed: 0,movie,user_review_permalink,user_review,sentiment
200,Main Hoon Na,https://www.imdb.com/review/rw1409633/,"But what was up with Zayed Khan's hair? And who thought to cast Rakhi Sawant as a college sex symbol! She looked truly miscast in that role.But anyways, this was a wonderful film, I very well nearly laughed myself into a coma at the comic bits. I can't believe I postponed seeing this film for so long.Shah Rukh Khan (Ram) stars as an Army major who endures a personal tragedy at the beginning of the film, then is ordered to go undercover to protect the commissioner's daughter (played by Amrita...",negative
201,Pardes,https://www.imdb.com/review/rw0428420/,"This is an excellent film, Sharukh Khan gives an astounding performance as Arjun, Mahima Choudhary puts a lot of effort in to her character as the chemistry between Mahima and Sharukh shows throughout the film. This is a different film from the others as it concentrates on the differences between countries. The songs in the film are also meaningful and romantic. Overall, this film is one of my favourite films.",positive
202,Dil To Pagal Hai,https://www.imdb.com/review/rw0413181/,"I first saw the film in 1998, and I was never tired of seeing the film again and again. Karishma Kapoor as Nisha gave a first rate performance even though she played second fiddle to Madhuri Dixit (Pooja). The story revolves around the life of dancers. Nisha and Rahul (Shahrukh Khan) are best friends. Unknown to Rahul, Nisha secretly loved him. Pooja came into the scene after Nisha accidentally slipped her foot during dancing practice for Rahul's next production drama entitled Maya. Desperat...",negative
203,Dil Dhadakne Do,https://www.imdb.com/review/rw4191352/,"The people who are saying this movie is not good clearly don't know anything about films.Dil Dhhadakne do is one of my favourite films.It is inspiring,beautiful,funny and just a full package.The film has a soul and is just a masterpiece from the director Zoya Akhtar.",positive
204,Fukrey,https://www.imdb.com/review/rw2829690/,"Fukrey has got its base trembling with a plot so naive, viewer discretion is recommended. Its screenplay is bit confusing for people who don't pay attention. Otherwise, it is a healthy Bollywood comedy film.Cast performance is nothing to talk about but the actors have managed to create enough chemistry between themselves. Although love doesn't come center-stage, it amuses. The music is catchy, but the lilts could've been improvised. Same theme music is used all over the 130 minutes. Cinemato...",negative


In [None]:
# Preview the held-out sentiment labels.
y_true.head()

200    negative
201    positive
202    negative
203    positive
204    negative
Name: sentiment, dtype: object

In [None]:
# held-out features: every column of df5 except the sentiment label
X = df5.drop(columns='sentiment')

In [None]:
# Display the held-out features (df5 without the sentiment column).
X

Unnamed: 0,movie,user_review_permalink,user_review
200,Main Hoon Na,https://www.imdb.com/review/rw1409633/,"But what was up with Zayed Khan's hair? And who thought to cast Rakhi Sawant as a college sex symbol! She looked truly miscast in that role.But anyways, this was a wonderful film, I very well nearly laughed myself into a coma at the comic bits. I can't believe I postponed seeing this film for so long.Shah Rukh Khan (Ram) stars as an Army major who endures a personal tragedy at the beginning of the film, then is ordered to go undercover to protect the commissioner's daughter (played by Amrita..."
201,Pardes,https://www.imdb.com/review/rw0428420/,"This is an excellent film, Sharukh Khan gives an astounding performance as Arjun, Mahima Choudhary puts a lot of effort in to her character as the chemistry between Mahima and Sharukh shows throughout the film. This is a different film from the others as it concentrates on the differences between countries. The songs in the film are also meaningful and romantic. Overall, this film is one of my favourite films."
202,Dil To Pagal Hai,https://www.imdb.com/review/rw0413181/,"I first saw the film in 1998, and I was never tired of seeing the film again and again. Karishma Kapoor as Nisha gave a first rate performance even though she played second fiddle to Madhuri Dixit (Pooja). The story revolves around the life of dancers. Nisha and Rahul (Shahrukh Khan) are best friends. Unknown to Rahul, Nisha secretly loved him. Pooja came into the scene after Nisha accidentally slipped her foot during dancing practice for Rahul's next production drama entitled Maya. Desperat..."
203,Dil Dhadakne Do,https://www.imdb.com/review/rw4191352/,"The people who are saying this movie is not good clearly don't know anything about films.Dil Dhhadakne do is one of my favourite films.It is inspiring,beautiful,funny and just a full package.The film has a soul and is just a masterpiece from the director Zoya Akhtar."
204,Fukrey,https://www.imdb.com/review/rw2829690/,"Fukrey has got its base trembling with a plot so naive, viewer discretion is recommended. Its screenplay is bit confusing for people who don't pay attention. Otherwise, it is a healthy Bollywood comedy film.Cast performance is nothing to talk about but the actors have managed to create enough chemistry between themselves. Although love doesn't come center-stage, it amuses. The music is catchy, but the lilts could've been improvised. Same theme music is used all over the 130 minutes. Cinemato..."
205,2 States,https://www.imdb.com/review/rw3004853/,"We all are quite familiar with movies which are based on novels, real- life stories, autobiographies etc. but few films exist which strikes the right chord! Many movies (in Bollywood) like 3 Idiots (2009), Kai Po Che (2013) have been adapted from novels of one masterpiece: Chetan Bhagat! He is the main reason behind the success of these movies and many others and again for the latest release of his novel ""2 States"" adaptation in Bollywood!! This movie is so wonderful and doesn't require any ..."
206,Agneepath,https://www.imdb.com/review/rw2555920/,what a boring movie. Audience here was so disappointed that they started leaving the cinema after the interval. They expected a romantic movie. I decided to stay on and did i regret. I have no choice but to say 'why Hrithik did u act in this remake'? He was just sleepwalking throughout the film and gets beaten up in the end shouting Agneepath 3 times. What a waste of money and talent. Hrithik i think u should stick to your own style and leave this to Salman khan. Sanjay Dutt looks awful and ...
207,Thappad,https://www.imdb.com/review/rw5519239/,"Speechless!! A must watch movie!! Beautiful part of movie is Men are not villain, Patriarchy is for which men and women both are responsible. This movie is as good as Marriage Story but the sad part is our country has limited audience to understand. I saw 6.1 which is disheartening and an insult to this film. Kudos to whole team specially to writer and director. Please continue to make such films."
208,Fashion,https://www.imdb.com/review/rw1970639/,"Dazzling beauties, scintillating faces and an unprecedented dark shadows of backlashes: backstage of the glamour world is grossly scaring and unbelievably haunting. Madhur Bhandarkar's next real life reel movie although not in the same league of his previous works but yet makes a strong mark on the mind of viewers. The wow of the movie is the way fashion shows have been presented, remarkable camera work to capture the zest of the glam, astonishing sets and most importantly sizzling Kangana a..."
209,Welcome,https://www.imdb.com/review/rw4916164/,What a time 2007 was!& right we are lacking for a good comedy movie...& yes Majnu bhai's painting is still famous in market.


In [None]:
# Save the held-out sentiment labels to solution.csv, without the index column.
y_true.to_csv('solution.csv', index=False)

In [None]:
# Preview the held-out features.
X.head()

Unnamed: 0,movie,user_review_permalink,user_review
200,Main Hoon Na,https://www.imdb.com/review/rw1409633/,"But what was up with Zayed Khan's hair? And who thought to cast Rakhi Sawant as a college sex symbol! She looked truly miscast in that role.But anyways, this was a wonderful film, I very well nearly laughed myself into a coma at the comic bits. I can't believe I postponed seeing this film for so long.Shah Rukh Khan (Ram) stars as an Army major who endures a personal tragedy at the beginning of the film, then is ordered to go undercover to protect the commissioner's daughter (played by Amrita..."
201,Pardes,https://www.imdb.com/review/rw0428420/,"This is an excellent film, Sharukh Khan gives an astounding performance as Arjun, Mahima Choudhary puts a lot of effort in to her character as the chemistry between Mahima and Sharukh shows throughout the film. This is a different film from the others as it concentrates on the differences between countries. The songs in the film are also meaningful and romantic. Overall, this film is one of my favourite films."
202,Dil To Pagal Hai,https://www.imdb.com/review/rw0413181/,"I first saw the film in 1998, and I was never tired of seeing the film again and again. Karishma Kapoor as Nisha gave a first rate performance even though she played second fiddle to Madhuri Dixit (Pooja). The story revolves around the life of dancers. Nisha and Rahul (Shahrukh Khan) are best friends. Unknown to Rahul, Nisha secretly loved him. Pooja came into the scene after Nisha accidentally slipped her foot during dancing practice for Rahul's next production drama entitled Maya. Desperat..."
203,Dil Dhadakne Do,https://www.imdb.com/review/rw4191352/,"The people who are saying this movie is not good clearly don't know anything about films.Dil Dhhadakne do is one of my favourite films.It is inspiring,beautiful,funny and just a full package.The film has a soul and is just a masterpiece from the director Zoya Akhtar."
204,Fukrey,https://www.imdb.com/review/rw2829690/,"Fukrey has got its base trembling with a plot so naive, viewer discretion is recommended. Its screenplay is bit confusing for people who don't pay attention. Otherwise, it is a healthy Bollywood comedy film.Cast performance is nothing to talk about but the actors have managed to create enough chemistry between themselves. Although love doesn't come center-stage, it amuses. The music is catchy, but the lilts could've been improvised. Same theme music is used all over the 130 minutes. Cinemato..."


In [None]:
# Save the held-out features (no sentiment column) to test.csv, without the index column.
X.to_csv('test.csv', index=False)

In [None]:
# Save the training rows (sentiment column included) to train.csv, without the index column.
df4.to_csv('train.csv', index=False)