<a href="https://colab.research.google.com/github/ZakriaJanjua/Dataset-Creation/blob/main/DatasetCreation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Task-1: Get info box 

In [None]:
from bs4 import BeautifulSoup as bs
import requests 

In [None]:
# Download the Toy Story 3 article. The timeout stops a stalled connection
# from hanging the cell, and raise_for_status() turns an HTTP error into an
# exception instead of letting an error page be parsed as if it were the article.
r = requests.get('https://en.wikipedia.org/wiki/Toy_Story_3', timeout=10)
r.raise_for_status()

# No parser is named here, so BeautifulSoup picks one of the installed parsers.
soup = bs(r.content)

In [None]:
# First element whose class attribute is "infobox vevent"; None if there is no match.
info_box = soup.find(class_="infobox vevent")

In [None]:
# Every <tr> element nested inside the infobox.
info_rows = info_box.find_all('tr')

In [None]:
def get_content_value(row_data):
    """Return the text of an infobox cell with non-breaking spaces replaced.

    A cell that contains ``<li>`` items yields a list with one string per
    item; any other cell yields a single string.
    """
    def _tidy(node):
        # Join nested text with single spaces and swap \xa0 for a plain space.
        return node.get_text(' ', strip=True).replace('\xa0', ' ')

    if not row_data.find('li'):
        return _tidy(row_data)
    return [_tidy(item) for item in row_data.find_all('li')]

# Build a {header: value} dict from the infobox rows.
movie_info = {}

for index, row in enumerate(info_rows):
    if index == 0:
        # The first row's header cell carries the film title.
        movie_info['title'] = row.find('th').get_text()
        continue
    if index == 1:
        # The second row is skipped deliberately (presumably the poster image).
        continue

    header = row.find('th')
    cell = row.find('td')
    # Rows lacking either a header or a value cell hold no key/value pair;
    # skip them instead of failing on a None lookup.
    if header is None or cell is None:
        continue
    movie_info[header.get_text(' ', strip=True)] = get_content_value(cell)
print(movie_info)

# Task-2: Get info box for all movies

In [None]:
from bs4 import BeautifulSoup as bs
import requests 

# Download the list of Walt Disney Pictures films. The timeout stops a stalled
# connection from hanging the cell, and raise_for_status() turns an HTTP error
# into an exception instead of letting an error page be parsed.
r = requests.get('https://en.wikipedia.org/wiki/List_of_Walt_Disney_Pictures_films', timeout=10)
r.raise_for_status()

# No parser is named here, so BeautifulSoup picks one of the installed parsers.
soup = bs(r.content)

In [None]:
# CSS selector: <a> links inside <i> elements inside any element that carries
# both the "wikitable" and "sortable" classes.
movies = soup.select('.wikitable.sortable i a')
# Peek at the href of the first matching link.
movies[0]['href']

In [None]:
def get_content_value(row_data):
    """Extract the value of an infobox cell as clean text.

    Args:
        row_data: The parsed value cell of an infobox row.

    Returns:
        A list of strings when the cell contains ``<li>`` items or
        ``<br>``-separated lines, otherwise a single string. Non-breaking
        spaces are replaced by ordinary spaces in every case.
    """
    if row_data.find('li'):
        return [
            li.get_text(' ', strip=True).replace('\xa0', ' ')
            for li in row_data.find_all('li')
        ]
    if row_data.find('br'):
        # Normalise non-breaking spaces here as well, so that all three
        # branches hand back text in the same form (later parsing splits
        # on ordinary spaces).
        return [text.replace('\xa0', ' ') for text in row_data.stripped_strings]
    return row_data.get_text(' ', strip=True).replace('\xa0', ' ')

def clean_tags(tags):
    """Remove every ``<sup>`` and ``<span>`` element found under *tags*, in place."""
    unwanted = tags.find_all(["sup", "span"])
    for element in unwanted:
        element.decompose()

def get_info_box(url, timeout=10):
    """Download a film article and return its infobox as a dict.

    Args:
        url: Address of the article to scrape.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A dict holding the film title under ``'title'`` plus one entry per
        infobox row that has both a header cell and a value cell.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        ValueError: If the page has no ``infobox vevent`` element.
    """
    r = requests.get(url, timeout=timeout)
    r.raise_for_status()
    webpage = bs(r.content)
    info_box = webpage.find(class_="infobox vevent")
    if info_box is None:
        raise ValueError(f'no infobox found at {url}')
    info_rows = info_box.find_all('tr')

    # Strip <sup>/<span> elements from the whole tree before any text is
    # read; the rows collected above are nodes of that same tree.
    clean_tags(webpage)

    movie_info = {}
    for index, row in enumerate(info_rows):
        if index == 0:
            # The first row's header cell carries the film title.
            movie_info['title'] = row.find('th').get_text()
            continue

        header = row.find('th')
        cell = row.find('td')
        # A row with only a header (or only a value) has no key/value pair;
        # skip it rather than losing the whole film to a None lookup.
        if header is None or cell is None:
            continue
        movie_info[header.get_text(' ', strip=True)] = get_content_value(cell)

    return movie_info


In [None]:
# Try the scraper on a single film page and display the resulting dict.
get_info_box('https://en.wikipedia.org/wiki/One_Little_Indian_(film)')

In [None]:
from urllib.parse import urljoin

# Scrape the infobox of every film linked from the list page.
movie_info_list = []
base_path = 'https://en.wikipedia.org/'

for index, movie in enumerate(movies):
    # Progress indicator: print every tenth index.
    if index % 10 == 0:
        print(index)
    try:
        # urljoin avoids the doubled slash that plain concatenation produces
        # when the href itself starts with "/". The link's title attribute is
        # no longer read: it was unused, and a missing one skipped the film.
        full_path = urljoin(base_path, movie['href'])
        movie_info_list.append(get_info_box(full_path))
    except Exception as e:
        # Best effort: report the film that failed and carry on with the rest.
        print(movie.get_text())
        print(e)

In [None]:
# Number of infoboxes collected in movie_info_list.
print(len(movie_info_list))

### Save/Load data

In [None]:
import json

def save_data(title, data):
    """Write *data* to the file *title* as UTF-8 JSON.

    The output is indented by four spaces and non-ASCII characters are
    written as-is rather than escaped.
    """
    with open(title, mode='w', encoding='utf-8') as out_file:
        json.dump(data, out_file, indent=4, ensure_ascii=False)

In [None]:
import json

def load_data(title):
    """Read the UTF-8 JSON file *title* and return the decoded object."""
    with open(title, encoding='utf-8') as in_file:
        raw_text = in_file.read()
    return json.loads(raw_text)

In [None]:
# Write the scraped records to disney_data.json.
save_data('disney_data.json', movie_info_list)

In [None]:
# Read back the file written by save_data. The same relative path is used so
# the cell also works outside Colab, where /content does not exist.
movie_data = load_data('disney_data.json')

In [None]:
# Inspect one loaded record (the 11th from the end).
movie_data[-11]

In [None]:
# Show every film's raw 'Running time' value, with 'N/A' where the key is absent.
print([movie.get('Running time', 'N/A') for movie in movie_info_list])

In [None]:
def minutes_to_integer(running_time):
    """Return the leading whole number of a running-time value.

    Args:
        running_time: A string such as ``'90 minutes'``, a list of such
            strings (only the first entry is used), ``'N/A'`` or ``None``.

    Returns:
        The first run of digits in the text as an ``int``, or ``None`` when
        the value is missing, empty, or contains no digits.
    """
    import re  # local import: this cell sits above the notebook's `import re`

    if isinstance(running_time, list):
        if not running_time:
            return None
        running_time = running_time[0]
    if running_time is None or running_time == 'N/A':
        return None

    # Searching for digits (rather than int() on the first space-separated
    # token) also copes with ranges such as "85–90 minutes" and with text
    # that uses a non-breaking space before the unit.
    match = re.search(r'\d+', running_time)
    return int(match.group()) if match else None


print(minutes_to_integer(['90 minutes, 80 minutes']))

In [None]:
# Attach the parsed running time to each record under 'Running time (int)'.
for movie in movie_info_list:
    movie.update({
        'Running time (int)': minutes_to_integer(movie.get('Running time', 'N/A')),
    })

In [None]:
# Show the parsed integer running times (None where the key is absent).
print([movie.get('Running time (int)') for movie in movie_info_list])

In [None]:
# Show every film's raw 'Budget' value, with 'N/A' where the key is absent.
print([movie.get('Budget', 'N/A') for movie in movie_info_list])

In [None]:
import re

# Digits with optional thousands separators and an optional decimal part.
number = r'\d+(,\d{3})*\.*\d*'
# Magnitude words understood by word_to_value.
amounts = r'thousand|million|billion'

# Plain dollar figure, e.g. "$790,000".
value_re = rf'\${number}'
# Dollar figure (or range) followed by a magnitude word, e.g. "$12.2 million",
# "$150-200 million", "$5 to 6 million". The range separator may be an ASCII
# hyphen, an en dash (\u2013) or an em dash (\u2014); the escapes are spelled
# out because the three characters look alike.
word_re = rf'\${number}(-|\u2013|\u2014|\sto\s)?({number})?\s({amounts})'


def word_to_value(word):
    """Return the multiplier for a magnitude word, ignoring letter case.

    Raises:
        KeyError: If *word* is not thousand, million or billion.
    """
    value_dict = {'thousand': 1000, 'million': 1000000, 'billion': 1000000000}
    return value_dict[word.lower()]


def parse_word_syntax(string):
    """Convert text such as ``'$12.2 million'`` to a float.

    For a range, the first number found is the one used.
    """
    value_string = re.search(number, string).group()
    value = float(value_string.replace(',', ''))
    # Matched case-insensitively, so lower-case before the dict lookup.
    word = re.search(amounts, string, flags=re.I).group().lower()
    return value * word_to_value(word)


def parse_value_syntax(string):
    """Convert text such as ``'$790,000'`` to a float."""
    value_string = re.search(number, string).group()
    return float(value_string.replace(',', ''))


def money_conversion(money):
    """Parse an infobox money value into a number of dollars.

    Args:
        money: A string, a list of strings (only the first entry is used),
            ``'N/A'`` or ``None``.

    Returns:
        The amount as a float, or ``None`` when the value is missing, empty,
        or contains no recognisable dollar figure.
    """
    if isinstance(money, list):
        if not money:
            return None
        money = money[0]
    if money is None or money == 'N/A':
        return None

    # Try the "number + magnitude word" form first; a bare dollar figure is
    # the fallback, because it would also match the start of the word form.
    word_syntax = re.search(word_re, money, flags=re.I)
    if word_syntax:
        return parse_word_syntax(word_syntax.group())

    value_syntax = re.search(value_re, money)
    if value_syntax:
        return parse_value_syntax(value_syntax.group())

    return None

In [None]:
# Attach numeric versions of the budget and box-office figures to each record.
for movie in movie_info_list:
    movie.update({'Budget (float)': money_conversion(movie.get('Budget', 'N/A'))})
    movie.update({'Box office (float)': money_conversion(movie.get('Box office', 'N/A'))})
    

In [None]:
# Display every film's raw 'Release date' value, with 'N/A' where the key is absent.
[movie.get('Release date', 'N/A') for movie in movie_info_list]

In [None]:
from datetime import datetime

# Raw 'Release date' values ('N/A' where absent), kept in a list for inspection.
dates = [movie.get('Release date', 'N/A') for movie in movie_info_list]

def clean_date(date):
    """Return *date* up to its first ``(``, with surrounding whitespace removed."""
    return date.split('(')[0].strip()


def date_conversion(date):
    """Parse an infobox release date into a ``datetime``.

    Args:
        date: A string, a list of strings (only the first entry is used),
            ``'N/A'`` or ``None``.

    Returns:
        A ``datetime`` for text shaped like ``'June 18, 2010'`` or
        ``'18 June 2010'`` (anything from the first ``(`` onwards is
        ignored), or ``None`` when the value is missing, empty, or in
        neither format.
    """
    if isinstance(date, list):
        if not date:
            return None
        date = date[0]
    if date is None or date == 'N/A':
        return None

    date_str = clean_date(date)
    for date_format in ('%B %d, %Y', '%d %B %Y'):
        try:
            return datetime.strptime(date_str, date_format)
        except ValueError:
            # Text does not fit this format; try the next one.
            continue
    return None

In [None]:
# Attach the parsed release date to each record under 'Release date (datetime)'.
for movie in movie_info_list:
    movie.update({
        'Release date (datetime)': date_conversion(movie.get('Release date', 'N/A')),
    })

In [None]:
# Inspect one cleaned record (the 94th from the end).
movie_info_list[-94]

## Save/Load using Pickle

In [None]:
import pickle

def save_data_pickle(name, data):
    """Serialise *data* with pickle and write it to the file *name*."""
    with open(name, mode='wb') as sink:
        pickle.dump(data, sink)

In [None]:
def load_data_pickle(name):
    """Read the file *name* and return the object pickled inside it."""
    # NOTE(review): unpickling can run arbitrary code, so only load files
    # that come from a trusted source.
    with open(name, mode='rb') as source:
        restored = pickle.load(source)
    return restored

In [None]:
# Persist the cleaned records to disney_data_cleaned.pickle.
save_data_pickle('disney_data_cleaned.pickle', movie_info_list)

In [None]:
# Reload the cleaned records from disney_data_cleaned.pickle.
movie_info_list = load_data_pickle('disney_data_cleaned.pickle')

# Task-3: IMDB, Ratings, Metascore

In [None]:
import requests
import urllib

# HTTPS so the API key is not sent in clear text.
base_url = 'https://www.omdbapi.com/?'

def get_omdb_info(title):
    """Look up a film by title on the OMDb API and return the decoded JSON.

    Args:
        title: Film title, sent as the ``t`` query parameter.

    Returns:
        The object decoded from the JSON body of the response.
    """
    import os

    # NOTE(review): the fallback key below is committed in the source. Set the
    # OMDB_API_KEY environment variable, then rotate and remove the literal.
    api_key = os.environ.get('OMDB_API_KEY', '7198b474')
    parameters = {'apikey': api_key, 't': title}
    # requests URL-encodes the query itself, so no manual urlencode is needed;
    # the timeout stops a stalled request from hanging the notebook.
    return requests.get(base_url, params=parameters, timeout=10).json()

def get_ratings(omdb_info):
    """Return the ``'Value'`` of the first Metacritic entry under ``'Ratings'``.

    Gives ``None`` when *omdb_info* has no ``'Ratings'`` key or none of its
    entries has ``'Metacritic'`` as its ``'Source'``.
    """
    metacritic_values = (
        entry['Value']
        for entry in omdb_info.get('Ratings', [])
        if entry['Source'] == 'Metacritic'
    )
    return next(metacritic_values, None)

In [None]:
# Sample OMDb lookup for a single title.
info = get_omdb_info('beauty and the beast')

In [None]:
# Display the Metacritic value extracted from the sample lookup.
get_ratings(info)

In [None]:
# Enrich every record with OMDb scores, looked up by the film's title.
for movie in movie_info_list:
    title = movie['title']
    omdb_info = get_omdb_info(title)
    # Fields missing from the OMDb answer are stored as None.
    movie['imdb'] = omdb_info.get('imdbRating')
    movie['metascore'] = omdb_info.get('Metascore')
    # 'ratings' holds whatever get_ratings extracts from the 'Ratings' list.
    movie['ratings'] = get_ratings(omdb_info)

In [None]:
# Inspect one enriched record (the 53rd from the end).
movie_info_list[-53]

In [None]:
# Persist the enriched records to disney_data_final.pickle.
save_data_pickle('disney_data_final.pickle', movie_info_list)

# Task-4: Save data as JSON and CSV

In [None]:
# Shallow-copy each record so that edits to the copies leave movie_info_list untouched.
movie_info_copy = [dict(movie) for movie in movie_info_list]

In [None]:
# Inspect one copied record (the 53rd from the end) before the dates are converted.
movie_info_copy[-53]

In [None]:
# Replace each datetime with its 'Month DD, YYYY' text (None when there is
# no date) so that the copies can be written out as JSON.
for movie in movie_info_copy:
    current_date = movie['Release date (datetime)']
    movie['Release date (datetime)'] = (
        current_date.strftime('%B %d, %Y') if current_date else None
    )

In [None]:
# Inspect the same copied record after its date was turned into text.
movie_info_copy[-53]

In [None]:
# Write the string-dated copies to disney_data_final.json.
save_data('disney_data_final.json', movie_info_copy)

In [None]:
import pandas as pd
# Build a DataFrame from the list of record dicts.
df = pd.DataFrame(movie_info_list)

In [None]:
# Preview the first rows of the DataFrame.
df.head()

In [None]:
# Export the DataFrame to disney_data_final.csv.
df.to_csv('disney_data_final.csv')

In [None]:
# Print the DataFrame's summary.
df.info()

In [None]:
# Order the films from shortest to longest running time and preview the first rows.
running_time = df.sort_values(by='Running time (int)')
running_time.head()