# Movie Analytics Project 

## Overview

Overview Here:

### Loading in and Investigating the Data

In [18]:
import pandas as pd
import sqlite3
import zipfile

movie_gross = pd.read_csv('bom.movie_gross.csv.gz', compression='gzip')
print(movie_gross.info())

with zipfile.ZipFile('im.db.zip', 'r') as zip_ref:
    zip_ref.extractall('.')

conn = sqlite3.connect('im.db')

tables = pd.read_sql_query("SELECT name FROM sqlite_master WHERE type='table';", conn)
print(tables)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3387 entries, 0 to 3386
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   title           3387 non-null   object 
 1   studio          3382 non-null   object 
 2   domestic_gross  3359 non-null   float64
 3   foreign_gross   2037 non-null   object 
 4   year            3387 non-null   int64  
dtypes: float64(1), int64(1), object(3)
memory usage: 132.4+ KB
None
            name
0   movie_basics
1      directors
2      known_for
3     movie_akas
4  movie_ratings
5        persons
6     principals
7        writers


In [19]:
movie_gross.head()

Unnamed: 0,title,studio,domestic_gross,foreign_gross,year
0,Toy Story 3,BV,415000000.0,652000000,2010
1,Alice in Wonderland (2010),BV,334200000.0,691300000,2010
2,Harry Potter and the Deathly Hallows Part 1,WB,296000000.0,664300000,2010
3,Inception,WB,292600000.0,535700000,2010
4,Shrek Forever After,P/DW,238700000.0,513900000,2010


### Cleaning Movie Gross Dataset

In [20]:
# Converting 'foreign_gross' to numeric so it can be used in analysis
movie_gross['foreign_gross'] = pd.to_numeric(movie_gross['foreign_gross'].str.replace(',', ''), errors='coerce')

print(movie_gross.dtypes)

title              object
studio             object
domestic_gross    float64
foreign_gross     float64
year                int64
dtype: object


In [22]:
movie_gross.isnull().sum()

title                0
studio               5
domestic_gross      28
foreign_gross     1350
year                 0
dtype: int64

In [23]:
# Checking out the rows that have null values in studio
null_studio = movie_gross[movie_gross['studio'].isnull()]
print("Rows with null 'studio':")
print(null_studio)

Rows with null 'studio':
                              title studio  domestic_gross  foreign_gross  \
210   Outside the Law (Hors-la-loi)    NaN         96900.0      3300000.0   
555         Fireflies in the Garden    NaN         70600.0      3300000.0   
933           Keith Lemon: The Film    NaN             NaN      4000000.0   
1862                 Plot for Peace    NaN          7100.0            NaN   
2825               Secret Superstar    NaN             NaN    122000000.0   

      year  
210   2010  
555   2011  
933   2012  
1862  2014  
2825  2017  


In [25]:
# Manually updating the missing studios since there are only 5

studios = {
    210: 'Tessalit Productions',
    555: 'Senator Distribution',
    933: 'Lionsgate UK',
    1862: 'Indelible Media',
    2825: 'Aamir Khan Productions'
}

for index, studio in studios.items():
    movie_gross.at[index, 'studio'] = studio

updated_rows = movie_gross.loc[list(studios.keys())]
print(updated_rows)


                              title                  studio  domestic_gross  \
210   Outside the Law (Hors-la-loi)    Tessalit Productions         96900.0   
555         Fireflies in the Garden    Senator Distribution         70600.0   
933           Keith Lemon: The Film            Lionsgate UK             NaN   
1862                 Plot for Peace         Indelible Media          7100.0   
2825               Secret Superstar  Aamir Khan Productions             NaN   

      foreign_gross  year  
210       3300000.0  2010  
555       3300000.0  2011  
933       4000000.0  2012  
1862            NaN  2014  
2825    122000000.0  2017  


In [26]:
# Dropping rows where domestic gross is null
movie_gross.dropna(subset=['domestic_gross'], inplace=True)

In [27]:
movie_gross.isnull().sum()

title                0
studio               0
domestic_gross       0
foreign_gross     1350
year                 0
dtype: int64

In [28]:
# Creating another dataset with rows where foreign gross is not null for possible separate analysis later on
foreign_gross_not_null = movie_gross.dropna(subset=['foreign_gross'])

In [29]:
# Standardizing names in studio and title
def standardize_text(text):
    return text.strip().lower()

movie_gross['title'] = movie_gross['title'].apply(standardize_text)
movie_gross['studio'] = movie_gross['studio'].apply(standardize_text)
foreign_gross_not_null['title'] = foreign_gross_not_null['title'].apply(standardize_text)
foreign_gross_not_null['studio'] = foreign_gross_not_null['studio'].apply(standardize_text)

print(movie_gross.head())
print(foreign_gross_not_null.head())


                                         title studio  domestic_gross  \
0                                  toy story 3     bv     415000000.0   
1                   alice in wonderland (2010)     bv     334200000.0   
2  harry potter and the deathly hallows part 1     wb     296000000.0   
3                                    inception     wb     292600000.0   
4                          shrek forever after   p/dw     238700000.0   

   foreign_gross  year  
0    652000000.0  2010  
1    691300000.0  2010  
2    664300000.0  2010  
3    535700000.0  2010  
4    513900000.0  2010  
                                         title studio  domestic_gross  \
0                                  toy story 3     bv     415000000.0   
1                   alice in wonderland (2010)     bv     334200000.0   
2  harry potter and the deathly hallows part 1     wb     296000000.0   
3                                    inception     wb     292600000.0   
4                          shrek forever after

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  foreign_gross_not_null['title'] = foreign_gross_not_null['title'].apply(standardize_text)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  foreign_gross_not_null['studio'] = foreign_gross_not_null['studio'].apply(standardize_text)


In [30]:
#Checking for duplicates in both datasets
duplicates_movie_gross = movie_gross.duplicated().sum()
print(f"Number of duplicate rows in movie_gross: {duplicates_movie_gross}")

if duplicates_movie_gross > 0:
    print("Duplicate rows in movie_gross:")
    print(movie_gross[movie_gross.duplicated()])

duplicates_foreign_gross_not_null = foreign_gross_not_null.duplicated().sum()
print(f"Number of duplicate rows in foreign_gross_not_null: {duplicates_foreign_gross_not_null}")

if duplicates_foreign_gross_not_null > 0:
    print("Duplicate rows in foreign_gross_not_null:")
    print(foreign_gross_not_null[foreign_gross_not_null.duplicated()])


Number of duplicate rows in movie_gross: 0
Number of duplicate rows in foreign_gross_not_null: 0
