## Data Collection (Web Scraping)

In [9]:
# We started by collecting the data we needed through web scraping

#packages needed
import numpy as np
import pandas as pd
import requests
from requests import get
from bs4 import BeautifulSoup
import json

#imported to add more human behavior
from time import sleep
from random import randint

# Scrape the IMDb "top 1000" search listing into the `movies` DataFrame.
# Each result page is fetched, every movie container on it is parsed, and one
# entry per movie is appended to each of the column lists below.

# Sent with every request so IMDb is asked for English-language content.
headers = {'Accept-Language': 'en-US,en;q=0.5'}

# Seconds to wait for IMDb before giving up instead of hanging the notebook.
REQUEST_TIMEOUT = 30

# One list per output column; the loop appends to them in lockstep so that
# index i of every list describes the same movie.
metascores = []
gross_total = []
years = []
titles = []
imdb_ratings = []
actors = []  # never filled by the loop below; kept so the name stays defined
genres = []
directors = []
movie_ratings = []
votes = []


# `start` offsets 1, 51, ..., 951 -> 20 requests.
pages = np.arange(1, 1001, 50)

for page in pages:
    # Keep the response under its own name rather than overwriting the loop
    # variable, and fail loudly on an HTTP error instead of silently
    # collecting zero rows for that page.
    response = requests.get(
        f"https://www.imdb.com/search/title/?groups=top_1000&start={page}&ref_=adv_nxt",
        headers=headers,
        timeout=REQUEST_TIMEOUT,
    )
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    # One <div> per movie on the page.
    movie_div = soup.find_all('div', class_='lister-item mode-advanced')
    # Random pause between page loads to look less like a bot.
    sleep(randint(2, 10))

    for cont in movie_div:
        movie_title = cont.h3.a.text
        titles.append(movie_title)

        movie_year = cont.h3.find('span', class_='lister-item-year').text
        years.append(movie_year)

        # Certificate, genre and metascore are optional: store '' when the
        # span is missing so every list keeps the same length.
        certificate_tag = cont.p.find('span', class_='certificate')
        movie_rating = certificate_tag.text if certificate_tag is not None else ''
        movie_ratings.append(movie_rating)

        genre_tag = cont.p.find('span', class_='genre')
        movie_genre = genre_tag.text if genre_tag is not None else ''
        genres.append(movie_genre)

        imdb_rating = float(cont.strong.text)
        imdb_ratings.append(imdb_rating)

        metascore_tag = cont.find('span', class_='metascore')
        metascore = metascore_tag.text if metascore_tag is not None else ''
        metascores.append(metascore)

        # Links inside the first class-less <p>; the first link is recorded
        # as the director. Fall back to '' when there are no links at all.
        cast_and_crew = cont.find_all('p', class_='')[0].find_all('a')
        directors.append(cast_and_crew[0].text if cast_and_crew else '')

        # Spans carrying name="nv": the first is the vote count and the
        # second, when present, is the gross.
        val = cont.find_all('span', attrs={'name': 'nv'})
        vote = val[0].text
        votes.append(vote)

        grosses = val[1].text if len(val) > 1 else ''
        gross_total.append(grosses)

# Assemble the column lists into a single table (values are still raw text,
# apart from the IMDb score which was converted to float above).
movies = pd.DataFrame({
    'Movie_name': titles,
    'Year': years,
    'Metascore': metascores,
    'IMDB_score': imdb_ratings,
    'Movie_rating': movie_ratings,
    'Votes': votes,
    'Domestic_gross': gross_total,
    'Director': directors,
    'Genres': genres,
})

In [15]:
# Peek at the first rows of the scraped table as a sanity check.
# NOTE(review): the output recorded under this cell is a per-column dtype
# listing, which is what `movies.dtypes` prints rather than `head()` - confirm
# which call was intended.
movies.head()

Movie_name         object
Year               object
Metascore          object
IMDB_score        float64
Movie_rating       object
Votes               int64
Domestic_gross     object
Director           object
Genres             object
dtype: object

In [11]:
# Vote counts are scraped as text; remove the comma separators first, then
# cast the whole column to integers.
votes_without_commas = movies['Votes'].str.replace(',', '')
movies['Votes'] = votes_without_commas.astype(int)

In [None]:
# Keep only the four characters before the closing parenthesis (the year) and
# store them as integers.
# Assign with movies['Year'] = ... so the column is replaced and takes the
# integer dtype; writing through .loc[:, 'Year'] sets values in place on
# recent pandas and can leave the column as `object`. This also matches how
# the Votes column is converted.
movies['Year'] = movies['Year'].str[-5:-1].astype(int)

In [None]:
# Fold the 'GP' certificate into 'G' so both are counted as one rating.
# NOTE(review): 'GP' was, as far as I know, the early-1970s name for what
# became 'PG' rather than a synonym of 'G' - confirm the intended target.
is_gp = movies['Movie_rating'].eq('GP')
movies.loc[is_gp, 'Movie_rating'] = 'G'

In [None]:
# Peel the leading '$' and trailing 'M' off every gross figure so only the
# numeric text is left.
gross_text = movies['Domestic_gross'].map(lambda raw: raw.lstrip('$').rstrip('M'))
# Convert to numbers; anything that is not numeric (e.g. the blank string
# stored for movies without a gross) becomes NaN instead of raising.
movies['Domestic_gross'] = pd.to_numeric(gross_text, errors='coerce')


In [None]:
# Trim whitespace from BOTH ends of the scraped genre text. Removing only the
# leading newline would leave any trailing padding in place, and a value such
# as 'Drama   ' would then never compare equal to 'Drama' in later filters.
movies["Genres"] = movies["Genres"].str.strip()

In [None]:
# Pull the first run of digits out of each metascore string.
# The pattern is a raw string so `\d` is not treated as an (invalid) string
# escape, and expand=False makes extract() return a Series rather than a
# one-column DataFrame.
movies['Metascore'] = movies['Metascore'].str.extract(r'(\d+)', expand=False)
# Convert to numbers; rows with no metascore become NaN, so the column ends
# up as floats.
movies['Metascore'] = pd.to_numeric(movies['Metascore'], errors='coerce')

In [None]:
# Persist the cleaned table to disk. With the default arguments the row index
# is written as an extra leading column.
movies.to_csv('IMDB_Top_1000.csv')

In [None]:
# Reload the saved table. The name must match the file written by to_csv()
# exactly ('IMDB_Top_1000.csv'); a different letter case only works on
# case-insensitive filesystems.
imdb_movies = pd.read_csv("IMDB_Top_1000.csv")

In [None]:
# Display the table that was loaded back from the CSV.
imdb_movies

## Data Collection (API Call)

In [None]:
# Fetch the IMDb top chart page and collect its title cells.
url = 'https://www.imdb.com/chart/top'
# A timeout keeps the cell from hanging forever, and raise_for_status() turns
# an HTTP error into an exception instead of parsing an error page.
response = requests.get(url, timeout=30)
response.raise_for_status()

# Parse the raw HTML and select every <td class="titleColumn"> element.
soup = BeautifulSoup(response.text, 'html.parser')
movietags = soup.select('td.titleColumn')

In [None]:
import os

# TMDB API key. Prefer the TMDB_API_KEY environment variable so the credential
# can be supplied without editing the notebook; the literal is kept only as a
# backward-compatible fallback.
# NOTE(review): a key committed to source should be treated as exposed -
# consider rotating it and removing the fallback.
api_key = os.environ.get("TMDB_API_KEY", "69ad870bdcbbf9cc540c7a4e4d400bed")

In [None]:
# Fetch the details of TMDB movie id 550 and decode the JSON body.
# The key is passed through `params` from `api_key` instead of being pasted
# into the URL a second time, and the response keeps its own name so
# `movie_db` only ever holds the decoded payload.
movie_db_response = requests.get(
    'https://api.themoviedb.org/3/movie/550',
    params={'api_key': api_key},
    timeout=30,
)
# Raise on a non-2xx answer rather than computing status_code and ignoring it.
movie_db_response.raise_for_status()
movie_db = movie_db_response.json()

In [None]:
# Request page 1 of TMDB's top-rated movies (English metadata).
# Query values go through `params` and reuse `api_key` rather than repeating
# the key inside the URL string.
top_rated = requests.get(
    'https://api.themoviedb.org/3/movie/top_rated',
    params={'api_key': api_key, 'language': 'en-US', 'page': 1},
    timeout=30,
)
# Stop here on an HTTP error instead of decoding an error payload.
top_rated.raise_for_status()
top_rated_json = top_rated.json()
# Show which top-level fields the response carries.
top_rated_json.keys()

In [None]:
# Turn the list under 'results' into a table, one row per movie.
# (The earlier plain-list assignment was overwritten immediately, so it is
# dropped.)
top_movies = pd.DataFrame(top_rated_json['results'])
top_movies

In [None]:
# Keep only the rows whose release_date text begins with '201'.
released_in_201x = top_movies['release_date'].str.startswith('201')
new_movies_df = top_movies[released_in_201x]
new_movies_df

## Analyzing Data 

In [None]:
# Count the missing values in every column.
imdb_movies.isna().sum()

In [None]:
# Remove the Metascore and Votes columns from the table in place.
imdb_movies.drop(columns=['Metascore', 'Votes'], inplace=True)

In [None]:
# Lift pandas' display limits so every row and column is shown in full.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

In [None]:
# A movie can list several genres. Split the text once at the first comma:
# the part before it is the primary genre, the remainder the secondary
# genre(s).
genre_parts = imdb_movies['Genres'].str.split(',', n=1, expand=True)
# Strip surrounding whitespace so the space that follows the comma (and any
# padding around the genre text) does not end up inside the labels; otherwise
# comparisons such as == 'Drama' can miss rows.
imdb_movies['Primary_Genre'] = genre_parts[0].str.strip()
imdb_movies['Secondary_Genre'] = genre_parts[1].str.strip()

In [None]:
# Tally how many movies fall under each primary genre.
imdb_movies.Primary_Genre.value_counts()

In [None]:
# One DataFrame per primary genre of interest, to make plotting easier.
# Every frame is sliced from `imdb_movies`; the Action line previously read
# from the misspelled name `mdb_movies`, which raises NameError.
drama_df = imdb_movies[imdb_movies['Primary_Genre'] == 'Drama']
action_df = imdb_movies[imdb_movies['Primary_Genre'] == 'Action']
comedy_df = imdb_movies[imdb_movies['Primary_Genre'] == 'Comedy']
crime_df = imdb_movies[imdb_movies['Primary_Genre'] == 'Crime']
bio_df = imdb_movies[imdb_movies['Primary_Genre'] == 'Biography']

In [None]:
# Drop the rows whose certificate is the literal 'Unrated' from each genre
# frame; all other rows are kept.
new_drama_df = drama_df.loc[drama_df['Movie_rating'].ne("Unrated")]
new_action_df = action_df.loc[action_df['Movie_rating'].ne('Unrated')]
new_comedy_df = comedy_df.loc[comedy_df['Movie_rating'].ne('Unrated')]
new_crime_df = crime_df.loc[crime_df['Movie_rating'].ne('Unrated')]
new_bio_df = bio_df.loc[bio_df['Movie_rating'].ne('Unrated')]

In [None]:
# Line chart: mean domestic gross per certificate, one line per genre.
# NOTE(review): `plt` is not imported in the visible cells - presumably
# matplotlib.pyplot; confirm it is imported before this cell runs.

# y*: mean Domestic_gross grouped by Movie_rating; x*: the matching labels.
y1, y2, y3, y4, y5 = (
    genre_frame.groupby(['Movie_rating'])['Domestic_gross'].mean()
    for genre_frame in (
        new_drama_df, new_action_df, new_comedy_df, new_crime_df, new_bio_df,
    )
)
x1, x2, x3, x4, x5 = y1.index, y2.index, y3.index, y4.index, y5.index

plt.figure(figsize=(8, 6))

# (x values, y values, matplotlib format string, legend label) per genre.
for xs, ys, line_format, genre_label in (
    (x1, y1, 'r', 'Drama'),
    (x2, y2, 'y', 'Action'),
    (x3, y3, 'g', 'Comedy'),
    (x4, y4, 'b', 'Crime'),
    (x5, y5, 'k', 'Bio-Pic'),
):
    plt.plot(xs, ys, line_format, label=genre_label)

plt.legend()

plt.title('Ratio of Rating vs Gross Revenue')
plt.ylabel('Gross Revenue')
plt.xlabel('Rating')
plt.show()

In [None]:
# Show the ten rows with the highest domestic gross (ties ordered by
# director, both descending).
# NOTE(review): `df` is not defined in the visible cells - presumably the
# `imdb_movies` table; confirm.
sort_columns = ['Domestic_gross', 'Director']
df.sort_values(by=sort_columns, ascending=False).head(10)

In [None]:
# Keep the ten highest-grossing rows (ties ordered by director, both
# descending) for the director chart.
# NOTE(review): `df` is not defined in the visible cells - presumably the
# `imdb_movies` table; confirm.
ranked_by_gross = df.sort_values(by=['Domestic_gross', 'Director'], ascending=False)
director_df = ranked_by_gross.head(10)
director_df

In [None]:
# Bar chart of the mean domestic gross per director in `director_df`.
# NOTE(review): `plt` is not imported in the visible cells - presumably
# matplotlib.pyplot; confirm.
y = director_df.groupby(['Director'])['Domestic_gross'].mean()
x = y.index
plt.figure(figsize=(12, 8))
plt.bar(x, y)

In [None]:
# Line chart: mean domestic gross per IMDb score, one line per genre.
# NOTE(review): `plt` is not imported in the visible cells - presumably
# matplotlib.pyplot; confirm it is imported before this cell runs.

# y*: mean Domestic_gross grouped by IMDB_score; x*: the matching scores.
y1, y2, y3, y4, y5 = (
    genre_frame.groupby(['IMDB_score'])['Domestic_gross'].mean()
    for genre_frame in (
        new_drama_df, new_action_df, new_comedy_df, new_crime_df, new_bio_df,
    )
)
x1, x2, x3, x4, x5 = y1.index, y2.index, y3.index, y4.index, y5.index

plt.figure(figsize=(8, 6))

# (x values, y values, matplotlib format string, legend label) per genre.
for xs, ys, line_format, genre_label in (
    (x1, y1, 'r', 'Drama'),
    (x2, y2, 'y', 'Action'),
    (x3, y3, 'g', 'Comedy'),
    (x4, y4, 'b', 'Crime'),
    (x5, y5, 'k', 'Bio-Pic'),
):
    plt.plot(xs, ys, line_format, label=genre_label)

plt.legend()

plt.title('Ratio of IMDB Score vs Gross Revenue')
plt.ylabel('Gross Revenue')
plt.xlabel('Score')
plt.show()