# 1. Get info box for one movie from Wikipedia

In [1]:
from bs4 import BeautifulSoup as bs
import requests

In [2]:
# Download the Wikipedia article for the film "Pathaan".
r=requests.get("https://en.wikipedia.org/wiki/Pathaan_(film)")
# Parse the raw response bytes. No parser is named here, so BeautifulSoup
# chooses whichever installed parser it prefers.
soup=bs(r.content)
# Indented rendering of the parsed document, kept only for manual inspection.
contents = soup.prettify()
# print(contents)

In [3]:
# First element whose class attribute is "infobox vevent"; None if the page has none.
info_box=soup.find(class_="infobox vevent")
# Every <tr> inside the infobox, in document order (including nested ones).
info_rows=info_box.find_all("tr")

# for row in info_rows:
#     print(row.prettify())

In [4]:
def get_content_value(row_data):
    """Extract the text of an infobox cell.

    Returns a list with one string per <li> when the cell contains list
    items, otherwise a single string for the whole cell. Text fragments are
    joined with a space, stripped, and non-breaking spaces become ordinary
    spaces.
    """
    def _text_of(node):
        return node.get_text(" ", strip=True).replace("\xa0", " ")

    if not row_data.find("li"):
        return _text_of(row_data)
    return [_text_of(item) for item in row_data.find_all("li")]
movie_info = {}

# Build a flat dict from the infobox rows: row 0 supplies the title, row 1 is
# skipped unconditionally (presumably the poster image row - confirm), and
# every later row contributes one "header -> value" pair.
for index, row in enumerate(info_rows):
    if index == 1:
        continue
    if index == 0:
        movie_info['title'] = row.find("th").get_text(" ", strip=True)
        continue
    try:
        label = row.find("th").get_text(" ", strip=True)
        movie_info[label] = get_content_value(row.find("td"))
    except AttributeError:
        # Rows lacking a <th> or <td> yield None from find(); skip them.
        continue
        
        
    
# print(movie_info)

# 2. Get info boxes for all Walt Disney Pictures films from Wikipedia

In [5]:
# Download the Wikipedia list of Walt Disney Pictures films.
r=requests.get("https://en.wikipedia.org/wiki/List_of_Walt_Disney_Pictures_films")
# Parse the raw response bytes (parser left to BeautifulSoup's own choice).
soup=bs(r.content)
# Indented rendering of the parsed document, kept only for manual inspection.
contents = soup.prettify()

# print(contents)

In [6]:
# Every <i> element nested inside an element carrying both the "wikitable"
# and "sortable" classes.
movies=soup.select(".wikitable.sortable i")
# First ten matches (a notebook cell echoes only its last expression, so this
# line produces no output).
movies[0:10]
# "title" attribute of the first <a> inside the first match.
movies[0].a['title']

'Snow White and the Seven Dwarfs (1937 film)'

In [7]:
def get_content_value(row_data):
    """Extract the text of an infobox cell.

    Args:
        row_data: the parsed cell element (a BeautifulSoup tag).

    Returns:
        A list of strings when the cell contains <li> items (one per item)
        or <br> separators (one per non-empty text fragment); otherwise a
        single string for the whole cell. Non-breaking spaces are replaced by
        ordinary spaces in every branch.

    Raises:
        AttributeError: if ``row_data`` is None (for example, a row with no
            matching cell).
    """
    def _tidy(text):
        # Infobox text often uses U+00A0 between a number and its unit;
        # later parsing splits on plain spaces, so normalize it everywhere.
        return text.replace("\xa0", " ")

    if row_data.find("li"):
        return [_tidy(li.get_text(" ", strip=True)) for li in row_data.find_all("li")]
    if row_data.find("br"):
        return [_tidy(text) for text in row_data.stripped_strings]
    return _tidy(row_data.get_text(" ", strip=True))

def clean(soup):
    """Remove every <sup> and <span> element from ``soup`` in place."""
    unwanted_tags = soup.find_all(["sup", "span"])
    for element in unwanted_tags:
        element.decompose()
        
def get_info_box(url):
    """Download the page at ``url`` and return its infobox as a dict.

    The first row's header becomes ``'title'``; every later row that has a
    header contributes ``header text -> get_content_value(cell)``. Rows whose
    cell is missing are skipped.

    Raises:
        AttributeError: if the page has no element with class
            "infobox vevent", or the first row has no header.
    """
    response = requests.get(url)
    page = bs(response.content)

    rows = page.find(class_="infobox vevent").find_all("tr")

    # The rows are live views into the same tree, so stripping <sup>/<span>
    # from the whole document after collecting them still affects their text.
    clean(page)

    details = {}
    for position, row in enumerate(rows):
        if position == 0:
            details['title'] = row.find("th").get_text(" ", strip=True)
            continue
        try:
            header = row.find('th')
            if header:
                label = row.find("th").get_text(" ", strip=True)
                details[label] = get_content_value(row.find("td"))
        except AttributeError:
            # Header present but no <td>: nothing to record for this row.
            continue
    return details


    

In [8]:
# Smoke test: run get_info_box on one film page to inspect the result after
# clean() has stripped <sup>/<span> elements.
get_info_box('https://en.wikipedia.org/wiki/Davy_Crockett:_King_of_the_Wild_Frontier_(film)')

{'title': 'Davy Crockett: King of the Wild Frontier',
 'Directed by': 'Norman Foster',
 'Written by': 'Tom Blackburn',
 'Produced by': 'Bill Walsh',
 'Starring': ['Fess Parker', 'Buddy Ebsen'],
 'Cinematography': 'Charles P. Boyle',
 'Edited by': 'Chester W. Schaeffer',
 'Music by': ['Thomas W. Blackburn (lyrics)',
  'George Bruns',
  'Edward H. Plumb (orchestration)'],
 'Production company': 'Walt Disney Productions',
 'Distributed by': 'Buena Vista Film Distribution Co., Inc.',
 'Release date': 'May 25, 1955',
 'Running time': '93 minutes',
 'Country': 'United States',
 'Language': 'English',
 'Box office': '$2,150,000 (US)'}

In [None]:
from urllib.parse import urljoin

# Fetch the list page again and collect every link nested inside italic text
# within the sortable wikitables; each link is treated as one film page.
r=requests.get("https://en.wikipedia.org/wiki/List_of_Walt_Disney_Pictures_films")
soup=bs(r.content)
movies=soup.select(".wikitable.sortable i a")

base_path="https://en.wikipedia.org/"

movie_info_list=[]
for movie in movies:
    try:
        relative_path=movie['href']
        # urljoin resolves "/wiki/..." against the host without producing the
        # "https://en.wikipedia.org//wiki/..." double slash that plain string
        # concatenation gave.
        full_path=urljoin(base_path, relative_path)
        # Not used further; the lookup raises KeyError for anchors without a
        # title attribute, so those are reported below and skipped.
        title=movie['title']

        movie_info_list.append(get_info_box(full_path))

    except Exception as e:
        # Best-effort scrape: report the failing entry and keep going.
        print(movie.get_text())
        print(e)


In [None]:
# Number of film records that were scraped successfully.
len(movie_info_list)

In [None]:
import json
def save_data(title, data):
    """Write ``data`` to the file ``title`` as indented, UTF-8 encoded JSON.

    Non-ASCII characters are written as-is rather than escaped. An existing
    file is overwritten.
    """
    with open(title, mode='w', encoding='utf-8') as json_file:
        json.dump(data, json_file, indent=2, ensure_ascii=False)

In [None]:
import json

def load_data(title):
    """Read the UTF-8 JSON file ``title`` and return the decoded object."""
    with open(title, encoding="utf-8") as json_file:
        raw_text = json_file.read()
    return json.loads(raw_text)

In [None]:
# Persist the raw scraped records so the cleaning section can reload them from disk.
save_data("wiki_scrap_data.json", movie_info_list)

# 3. Clean the data

In [None]:
# Reload the raw scraped records saved earlier as JSON.
movie_info_list=load_data("wiki_scrap_data.json")

In [None]:
# Spot check: display the 11th record from the end of the list.
movie_info_list[-11]

## 3.1 Running time clean

In [None]:
# [movie.get('Running time','NA') for movie in movie_info_list]

In [None]:
def min_to_intr(running_time):
    """Convert a scraped running-time value to a whole number of minutes.

    Args:
        running_time: the sentinel ``"NA"``, a string such as
            ``"93 minutes"``, or a list of such strings (only the first entry
            is used).

    Returns:
        The leading integer of the value, or None for ``"NA"`` or an empty
        list. For a range like ``"74–83 minutes"`` the lower bound is
        returned.

    Raises:
        ValueError: if the value does not start with an integer.
    """
    if running_time == "NA":
        return None

    if isinstance(running_time, list):
        if not running_time:
            return None
        running_time = running_time[0]

    # str.replace returns a new string, so the result must be kept. The en
    # dash separates ranges and U+00A0 often sits between number and unit.
    normalized = running_time.replace("–", " ").replace("\xa0", " ")
    return int(normalized.split(" ")[0])
        
    

# Add an integer running-time field to every record ("NA" marks a missing value).
for movie in movie_info_list:
    raw_running_time = movie.get("Running time", "NA")
    movie['Running time (int)'] = min_to_intr(raw_running_time)

In [None]:
# print([movie.get('Running time (int)','NA') for movie in movie_info_list])

## 3.2 Budget & BoxOffice clean

In [None]:
# print([movie.get("Budget","NA") for movie in movie_info_list])

In [None]:
import re

# Scale words that may follow a dollar figure.
amounts= r"thousand|million|billion"
# Digits, optional comma-separated groups of three, then any number of dots
# and trailing digits (e.g. "1,234.5").
number= r"\d+(,\d{3})*\.*\d*"
# "$" + number, an optional range separator (hyphen, " to " or en dash) with
# an optional second number, one whitespace character, then a scale word,
# e.g. "$1.5–2 million".
word_re= rf"\${number}(-|\sto\s|–)?({number})?\s({amounts})"
# "$" + number with no scale word, e.g. "$2,150,000".
value_re= rf"\${number}"

def word_to_value(word):
    """Return the integer multiplier for a lowercase scale word.

    Raises:
        KeyError: if ``word`` is not "thousand", "million" or "billion".
    """
    multipliers = dict(thousand=10**3, million=10**6, billion=10**9)
    return multipliers[word]

def parse_word_syntax(string):
    """Convert text such as "$1.5 million" into a float dollar amount.

    Uses the first number found in ``string`` (the lower bound when a range
    is given) multiplied by the value of the first scale word found,
    matched case-insensitively.
    """
    first_number = re.search(number, string).group()
    magnitude = float(first_number.replace(",", ""))
    scale_word = re.search(amounts, string, flags=re.I).group().lower()
    return magnitude * word_to_value(scale_word)

def parse_val_syntax(string):
    """Convert text such as "$2,150,000" into a float, ignoring commas.

    Only the first number found in ``string`` is used.
    """
    matched = re.search(number, string)
    return float(matched.group().replace(",", ""))

def money_conversion(money):
    """Convert a scraped money value to a float number of dollars.

    ``money`` may be the sentinel "NA", a string, or a list of strings (only
    the first entry is used). Amounts written with a scale word
    ("$1.5 million") take precedence over plain figures ("$2,150,000").
    Returns None for "NA" or when neither form is found.
    """
    if money == "NA":
        return None

    text = money[0] if isinstance(money, list) else money

    worded = re.search(word_re, text, flags=re.I)
    if worded:
        return parse_word_syntax(worded.group())

    plain = re.search(value_re, text)
    if plain:
        return parse_val_syntax(plain.group())

    return None

In [None]:
# Add float versions of the budget and box-office fields to every record.
for movie in movie_info_list:
    for source_key, target_key in (("Budget", 'Budget (float)'), ("Box office", 'Box office (float)')):
        movie[target_key] = money_conversion(movie.get(source_key, "NA"))

In [None]:
# Spot check: display the second record after the money conversion.
movie_info_list[1]

## 3.3 Date clean

In [None]:
# print([movie.get("Release date", "NA") for movie in movie_info_list])

In [None]:
from datetime import datetime

# Raw "Release date" value of every record ("NA" when the key is missing).
# Not referenced by the rest of the visible notebook.
dates=[movie.get("Release date", "NA") for movie in movie_info_list]

def clean_date(date):
    """Return the part of ``date`` before the first "(", stripped of whitespace."""
    before_paren, _, _ = date.partition("(")
    return before_paren.strip()

def date_conv(date):
    """Parse a scraped release date into a ``datetime``.

    Args:
        date: the sentinel "NA", a date string, or a list of date strings
            (only the first entry is used).

    Returns:
        A naive ``datetime`` when the cleaned text matches "Month D, YYYY" or
        "D Month YYYY"; None for "NA", an empty list, or any other format.
    """
    if isinstance(date, list):
        if not date:
            return None
        date=date[0]
    if date=="NA":
        return None
    date_str=clean_date(date)

    formats=["%B %d, %Y", "%d %B %Y"]

    for form in formats:
        try:
            return datetime.strptime(date_str, form)
        except ValueError:
            # strptime signals a format mismatch with ValueError; try the
            # next format. Anything else is a real error and should surface.
            continue
    return None
    

In [None]:
# Add a parsed release date to every record, falling back to the plural
# "Release dates" key when "Release date" is missing or empty.
for movie in movie_info_list:
    raw_release = movie.get("Release date") or movie.get("Release dates", "NA")
    movie['Release date (datetime)'] = date_conv(raw_release)

In [None]:
# Spot check: display the record at index 100 after the date conversion.
movie_info_list[100]

In [None]:
#save data using pickle

import pickle

def save_pickle(name,data):
    """Serialize ``data`` with pickle into the binary file ``name``, overwriting it."""
    with open(name, mode="wb") as pickle_file:
        pickle.dump(data, pickle_file)

In [None]:
import pickle

def load_pickle(name):
    """Return the object stored in the pickle file ``name``."""
    # NOTE: unpickling can execute arbitrary code; only load files that this
    # notebook produced itself.
    with open(name, mode="rb") as pickle_file:
        payload = pickle.load(pickle_file)
    return payload

In [None]:
# Persist the cleaned records with pickle: they now hold datetime objects,
# which the JSON writer cannot serialize directly.
save_pickle("wiki_scrap_clean.pickle", movie_info_list)

In [None]:
# Round-trip check: read the cleaned pickle back into a scratch variable.
a=load_pickle("wiki_scrap_clean.pickle")

In [None]:
# Display the second reloaded record to confirm the data survived the round trip.
a[1]

# 4. 3rd party ratings

In [None]:
# Start the ratings section from the cleaned records saved on disk.
movie_info_list=load_pickle("wiki_scrap_clean.pickle")

In [None]:
# Spot check: display the 170th record from the end of the list.
movie_info_list[-170]

In [None]:
# http://www.omdbapi.com/?apikey=[yourkey]&

In [None]:
import requests
import urllib

def get_omdb(title, api_key=None, timeout=None):
    """Look up a film by title on the OMDb API and return the decoded JSON.

    Args:
        title: film title, sent as the ``t`` query parameter.
        api_key: OMDb API key. When omitted, the ``OMDB_API_KEY`` environment
            variable is used, and failing that the key that was previously
            hard-coded here.
        timeout: seconds to wait for the HTTP response, passed through to
            ``requests.get``; None (the default) waits indefinitely, as before.

    Returns:
        The response body parsed as JSON.
    """
    # Imported here so the function does not depend on ``urllib.parse``
    # having been loaded as a side effect of some other import; a bare
    # ``import urllib`` does not guarantee the submodule is available.
    import os
    from urllib.parse import urlencode

    base_url="http://www.omdbapi.com/?"
    # NOTE(review): the fallback key is committed in source; prefer passing
    # api_key or setting OMDB_API_KEY, and consider revoking this key.
    key=api_key or os.environ.get("OMDB_API_KEY") or '9f97ffc'
    params={'t':title,"apikey": key}

    full_url=base_url+urlencode(params)

    return requests.get(full_url, timeout=timeout).json()

def get_rotten_score(omdb_info):
    """Return the Rotten Tomatoes value from an OMDb response, or None.

    Scans the "Ratings" list (treated as empty when the key is absent) and
    returns the "Value" of the first entry whose "Source" is
    'Rotten Tomatoes'.
    """
    return next(
        (
            entry['Value']
            for entry in omdb_info.get("Ratings", [])
            if entry["Source"] == 'Rotten Tomatoes'
        ),
        None,
    )

# Smoke test: query OMDb for one title and display the raw response.
get_omdb('pathaan')

In [None]:
# Enrich every record with third-party ratings from OMDb (one HTTP request
# per film); a field is None when OMDb does not report it.
for movie in movie_info_list:
    omdb_info = get_omdb(movie['title'])
    movie['imdb'] = omdb_info.get('imdbRating')
    movie['rotten_tomatoes'] = get_rotten_score(omdb_info)
    movie['metascore'] = omdb_info.get("Metascore")
    

In [None]:
# Spot check: display the fifth record with its new rating fields.
movie_info_list[4]

In [None]:
# Persist the fully enriched records (datetime objects intact) as a pickle.
save_pickle("wiki_scrap_final.pickle", movie_info_list)

# 5. Save Dataset

In [None]:
# Shallow-copy every record so top-level fields can be reassigned on the
# copies without changing movie_info_list.
movie_info_cp=[movie.copy() for movie in movie_info_list]

In [None]:
# JSON cannot store datetime objects, so render each parsed release date as
# text such as "May 25, 1955"; missing dates are stored as None.
for movie in movie_info_cp:
    release = movie['Release date (datetime)']
    movie['Release date (datetime)'] = release.strftime("%B %d, %Y") if release else None

In [None]:
# Spot check: display the record at index 269 of the JSON-ready copies.
movie_info_cp[269]

In [None]:
# Write the JSON-ready copies (dates already converted to strings) to disk.
save_data("wiki_scrap_final.json", movie_info_cp)

In [None]:
#csv

import pandas as pd

# One row per film; columns are the union of keys across all records.
df=pd.DataFrame(movie_info_list)


In [None]:
# Preview the first five rows.
df.head()

In [None]:
# Export to CSV; with pandas' defaults the row index is written as the first column.
df.to_csv("wiki_scrap_final.csv")

In [None]:
# Films ordered from longest to shortest running time.
running_times=df.sort_values(["Running time (int)"], ascending=False)
# Show the ten longest.
running_times.head(10)