#### STEPS in this notebook
1. import open data in the form of excel and csv and merge them
2. use the titles from merged data, randomly sample 200 and retrieve information using API call in the form of json
3. import json data and merge with intially merged data
4. export to excel file

In [1]:
import requests
import json
import pandas as pd

file_path = 'data/xlsx_data/movies_excel.xlsx'
excel_df = pd.read_excel(file_path, sheet_name='Sheet1')  # Specify the sheet name or index

print(len(excel_df))
excel_df['year'] = excel_df['Release Date'].dt.year.astype(int)
excel_df['Title_year'] = excel_df['Title'] + ' (' + excel_df['year'].astype(str) + ')'
excel_df.head()


3725


Unnamed: 0,Title,Release Date,Color/B&W,Genre,Language,Country,Rating,Lead Actor,Director Name,Lead Actor FB Likes,Cast FB Likes,Director FB Likes,Movie FB Likes,IMDb Score (1-10),Total Reviews,Duration (min),Gross Revenue,Budget,year,Title_year
0,Over the Hill to the Poorhouse,1920-09-15,Black and White,Crime,English,USA,Not Rated,Stephen Carr,Harry F. Millarde,2.0,4,0,0,4.8,1.0,110.0,3000000,100000,1920,Over the Hill to the Poorhouse (1920)
1,Metropolis,1927-01-26,Black and White,Drama,German,Germany,Not Rated,Brigitte Helm,Fritz Lang,136.0,203,756,12000,8.3,260.0,145.0,26435,6000000,1927,Metropolis (1927)
2,The Broadway Melody,1929-11-11,Black and White,Musical,English,USA,Passed,Anita Page,Harry Beaumont,77.0,109,4,167,6.3,36.0,100.0,2808000,379000,1929,The Broadway Melody (1929)
3,42nd Street,1933-08-29,Black and White,Comedy,English,USA,Unrated,Ginger Rogers,Lloyd Bacon,610.0,995,24,439,7.7,65.0,89.0,2300000,439000,1933,42nd Street (1933)
4,Top Hat,1935-04-15,Black and White,Comedy,English,USA,Approved,Ginger Rogers,Mark Sandrich,610.0,824,10,1000,7.8,66.0,81.0,3000000,609000,1935,Top Hat (1935)


In [2]:
csv_df = pd.read_csv("data/csv_data/imdb_top_1000.csv")
print(len(csv_df))
csv_df.rename(columns={'Series_Title': 'Title'}, inplace=True)
csv_df['Title_year'] = csv_df['Title'] + ' (' + csv_df['Released_Year'].astype(str) + ')'
print(csv_df.columns)

1000
Index(['Poster_Link', 'Title', 'Released_Year', 'Certificate', 'Runtime',
       'Genre', 'IMDB_Rating', 'Overview', 'Meta_score', 'Director', 'Star1',
       'Star2', 'Star3', 'Star4', 'No_of_Votes', 'Gross', 'Title_year'],
      dtype='object')


In [3]:
merged_df = pd.merge(excel_df, csv_df, on='Title_year', how='outer')
print(merged_df.columns)
print(len(merged_df))

Index(['Title_x', 'Release Date', 'Color/B&W', 'Genre_x', 'Language',
       'Country', 'Rating', 'Lead Actor', 'Director Name',
       'Lead Actor FB Likes', 'Cast FB Likes', 'Director FB Likes',
       'Movie FB Likes', 'IMDb Score (1-10)', 'Total Reviews',
       'Duration (min)', 'Gross Revenue', 'Budget', 'year', 'Title_year',
       'Poster_Link', 'Title_y', 'Released_Year', 'Certificate', 'Runtime',
       'Genre_y', 'IMDB_Rating', 'Overview', 'Meta_score', 'Director', 'Star1',
       'Star2', 'Star3', 'Star4', 'No_of_Votes', 'Gross'],
      dtype='object')
4353


In [4]:
merged_df['Combined_Title'] = merged_df['Title_x'].combine_first(merged_df['Title_y'])

code to for API call, executed once and saved in data/json_data

In [5]:

# Your API key and base URL
api_key = '8501dc49'
base_url = 'http://www.omdbapi.com/'

# List of movie titles to search
movie_titles = merged_df['Combined_Title'].sample(n=500, random_state=1).to_list()

# Container to hold all results
all_movies = []

# Loop through each movie title
for title in movie_titles:
    
    # Query parameters
    params = {
        'apikey': api_key,
        't': title
    }

    # Make the request
    url = requests.Request('GET', base_url, params=params).prepare().url
    response = requests.get(url)

    # Check if the request was successful
    if response.status_code == 200:
        data = response.json()
        if data['Response'] == 'True':
            all_movies.append(data)
        else: print(title + "not found")

    else:
        print(f"Error: Unable to retrieve data for {title}")

# Save to a JSON file
with open('data/json_data/omdb_movies.json', 'w') as json_file:
    json.dump(all_movies, json_file, indent=4)

print(f"Successfully saved {len(all_movies)} records to omdb_movies.json")


500
X-Men 2not found
Per un pugno di dollarinot found
Cowboy Bebop: Tengoku no tobiranot found
Giù la testanot found
Les triplettes de Bellevillenot found
Mononoke-himenot found
Kari-gurashi no Ariettinot found
Tôkyô goddofâzâzunot found
4 luni, 3 saptamâni si 2 zilenot found
Do lok tin sinot found
Error: Unable to retrieve data for Rabbit Hole
Error: Unable to retrieve data for Beastmaster 2: Through the Portal of Time
Error: Unable to retrieve data for Dragon Wars: D-War
Madeonot found
Lat sau san taamnot found
Error: Unable to retrieve data for Stonewall
Error: Unable to retrieve data for Admission
Error: Unable to retrieve data for Talvar
Error: Unable to retrieve data for From a Whisper to a Scream
Error: Unable to retrieve data for Lady Vengeance
Error: Unable to retrieve data for The Orphanage
Error: Unable to retrieve data for The Nutty Professor
Error: Unable to retrieve data for The Back-up Plan
Error: Unable to retrieve data for Short Term 12
Error: Unable to retrieve data f

In [6]:
with open('data/json_data/omdb_movies.json', 'r') as file:
    data = json.load(file)
json_df = pd.json_normalize(data)


print(len(json_df))
json_df.rename(columns={'title': 'Title'}, inplace=True)
json_df['Title_year'] = json_df['Title'] + ' (' + json_df['Year'].astype(str) + ')'

238


In [7]:
merged_df = pd.merge(merged_df, json_df, on='Title_year', how='outer')
merged_df.to_excel('merged.xlsx', index=False)