# Movies Analysis

## Imports

In [7]:
# Base Imports
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

## Data

In [8]:
#basics_url="https://datasets.imdbws.com/title.basics.tsv.gz"
#akas_url="https://datasets.imdbws.com/title.akas.tsv.gz"
#ratings_url="https://datasets.imdbws.com/title.ratings.tsv.gz"

In [9]:
#basics = pd.read_csv(basics_url,sep='\t', low_memory=False)

In [10]:
#akas = pd.read_csv(akas_url,sep='\t', low_memory=False)

In [11]:
#ratings = pd.read_csv(ratings_url,sep='\t', low_memory=False)

In [12]:

# Load the title basics table from the gzip-compressed CSV in Data/.
basics = pd.read_csv(filepath_or_buffer="Data/title_basics.csv.gz", low_memory=False)

In [13]:
# Load the alternate-titles (akas) table from the gzip-compressed CSV in Data/.
akas = pd.read_csv(filepath_or_buffer="Data/title_akas.csv.gz", low_memory=False)

In [14]:
# Load the ratings table from the gzip-compressed CSV in Data/.
ratings = pd.read_csv(filepath_or_buffer="Data/title_ratings.csv.gz", low_memory=False)

## Basics Preprocessing

In [15]:
# Preview the loaded basics table (the notebook shows a truncated view of the frame).
basics

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,,118,"Comedy,Fantasy,Romance"
1,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El Tango del Viudo y Su Espejo Deformante,0,2020,,70,Drama
2,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,,122,Drama
3,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005,,100,"Comedy,Horror,Sci-Fi"
4,tt0094859,movie,Chief Zabu,Chief Zabu,0,2016,,74,Comedy
...,...,...,...,...,...,...,...,...,...
82145,tt9914942,movie,Life Without Sara Amat,La vida sense la Sara Amat,0,2019,,74,Drama
82146,tt9915872,movie,The Last White Witch,My Girlfriend is a Wizard,0,2019,,97,"Comedy,Drama,Fantasy"
82147,tt9916170,movie,The Rehearsal,O Ensaio,0,2019,,51,Drama
82148,tt9916190,movie,Safeguard,Safeguard,0,2020,,95,"Action,Adventure,Thriller"


Notes:
* Replace "\N" with `np.nan`
* Eliminate movies that are null for `runtimeMinutes`
* Eliminate movies that are null for `genres`
* Keep only `titleType == "movie"`
* Keep `startYear` between 2000 and 2022 (inclusive)
* Eliminate movies that include "Documentary" in `genres` (see tip below)

In [16]:
# Scan each column's dtype and non-null count to spot columns that need cleaning.
basics.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 82150 entries, 0 to 82149
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   tconst          82150 non-null  object 
 1   titleType       82150 non-null  object 
 2   primaryTitle    82150 non-null  object 
 3   originalTitle   82150 non-null  object 
 4   isAdult         82150 non-null  int64  
 5   startYear       82150 non-null  int64  
 6   endYear         0 non-null      float64
 7   runtimeMinutes  82150 non-null  int64  
 8   genres          82150 non-null  object 
dtypes: float64(1), int64(3), object(5)
memory usage: 5.6+ MB


> Noticed an issue with `runtimeMinutes`: some of the records have text values. Because the column holds a runtime in minutes, it should be a numeric dtype (not a datetime); in the `info()` output above it shows as `int64`.

In [None]:
# Swap the literal "\N" placeholder strings for real missing values (NaN),
# modifying the basics frame in place.
basics.replace(to_replace='\\N', value=np.nan, inplace=True)

In [None]:
#dropping NaNs
# Drop every row that is missing runtimeMinutes or genres.
# Only `subset` is passed: the defaults already mean "row-wise, drop if any of
# the subset columns is null", and pandas >= 1.5 raises TypeError when `how`
# and `thresh` are both given explicitly (even as how='any', thresh=None).
basics = basics.dropna(subset=['runtimeMinutes', 'genres'])

In [None]:
#instantiating filters for preproc
# Keep only rows with titleType "movie" and startYear in 2000-2022 (inclusive).
# startYear is coerced to a number before comparing: comparing an int64 column
# against the strings "2000"/"2022" raises TypeError, and comparing a string
# column against them only works lexicographically. Values that cannot be
# parsed become NaN and are excluded by `between`.
basics_filtered = basics[
    (basics['titleType'] == "movie")
    & pd.to_numeric(basics['startYear'], errors='coerce').between(2000, 2022)
]

In [None]:
# Spot-check the first rows of the filtered frame to confirm the titleType and
# startYear filters were applied.
basics_filtered.head()

In [None]:
# Flag every title whose genres string mentions "documentary" (any casing),
# then keep only the rows that are not flagged.
is_documentary = basics_filtered['genres'].str.contains(pat='documentary', case=False)
basics_cleaned = basics_filtered.loc[~is_documentary]

In [None]:
# Display the cleaned frame to eyeball that documentary titles were dropped.
basics_cleaned

## AKAs Preprocessing

In [None]:
# Swap the literal "\N" placeholder strings for real missing values (NaN),
# modifying the akas frame in place.
akas.replace(to_replace='\\N', value=np.nan, inplace=True)

In [17]:
# Display the akas frame to check its contents.
akas

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,6,Carmencita,US,,imdbDisplay,,0.0
1,tt0000002,7,The Clown and His Dogs,US,,,literal English title,0.0
2,tt0000005,10,Blacksmith Scene,US,,imdbDisplay,,0.0
3,tt0000005,1,Blacksmithing Scene,US,,alternative,,0.0
4,tt0000005,6,Blacksmith Scene #1,US,,alternative,,0.0
...,...,...,...,...,...,...,...,...
1342951,tt9916702,1,Loving London: The Playground,US,,,,0.0
1342952,tt9916720,10,The Demonic Nun,US,,tv,,0.0
1342953,tt9916720,12,The Nun 2,US,,imdbDisplay,,0.0
1342954,tt9916756,1,Pretty Pretty Black Girl,US,,imdbDisplay,,0.0


In [None]:
# Boolean mask: True for akas rows whose region is exactly 'US'.
akas_us = akas['region'].eq('US')

In [None]:
# Apply the US-only mask to get the akas rows used for the rest of the analysis.
akas_cleaned = akas[akas_us]

In [None]:
# Display the US-only akas frame to check the region filter.
akas_cleaned

## Ratings Preprocessing

In [None]:
# Swap the literal "\N" placeholder strings for real missing values (NaN),
# modifying the ratings frame in place.
ratings.replace(to_replace='\\N', value=np.nan, inplace=True)

In [18]:
# Display the ratings frame to check its contents.
ratings

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1905
1,tt0000002,5.9,256
2,tt0000003,6.5,1702
3,tt0000004,5.7,168
4,tt0000005,6.2,2517
...,...,...,...
1263577,tt9916690,6.5,6
1263578,tt9916720,5.1,245
1263579,tt9916730,8.4,6
1263580,tt9916766,6.8,21


## Filtering Basics on AKAs

In [None]:
# Boolean mask over basics_cleaned: True where the title's tconst also appears
# as a titleId in the US-only akas frame. Displayed for a quick check.
keepers = basics_cleaned.loc[:, 'tconst'].isin(akas_cleaned.loc[:, 'titleId'])
keepers

In [None]:
# Keep only the titles flagged by the keepers mask, then show the result.
basics_cleaned = basics_cleaned.loc[keepers]
basics_cleaned

In [None]:
#importing filtered dataframes to Data folder
#import os
#os.makedirs('Data/',exist_ok=True) 
# Confirm folder created
#os.listdir("Data/")

In [None]:
#basics compressed to csv
#basics_cleaned.to_csv("Data/title_basics.csv.gz",compression='gzip',index=False)

In [None]:
#akas compressed to csv
#akas_cleaned.to_csv("Data/title_akas.csv.gz",compression='gzip',index=False)

In [None]:
#ratings compressed to csv
#ratings.to_csv("Data/title_ratings.csv.gz",compression='gzip',index=False)