# Project 3

In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

In [2]:
basics = pd.read_csv("Data/title_basics.csv.gz", low_memory=False)
ratings = pd.read_csv("Data/title_ratings.csv.gz", low_memory=False)
akas = pd.read_csv("Data/title_akas.csv.gz", low_memory=False)

basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"


In [3]:
ratings.head()

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1910
1,tt0000002,5.8,256
2,tt0000003,6.5,1714
3,tt0000004,5.6,169
4,tt0000005,6.2,2528


In [4]:
akas.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,1,Карменсіта,UA,\N,imdbDisplay,\N,0
1,tt0000001,2,Carmencita,DE,\N,\N,literal title,0
2,tt0000001,3,Carmencita - spanyol tánc,HU,\N,imdbDisplay,\N,0
3,tt0000001,4,Καρμενσίτα,GR,\N,imdbDisplay,\N,0
4,tt0000001,5,Карменсита,RU,\N,imdbDisplay,\N,0


## Preprocessing

### Replace "\N" with np.nan

In [5]:
basics['runtimeMinutes'] = basics['runtimeMinutes'].replace({'\\N':np.nan})

### Eliminate movies that are null for runtimeMinutes and Eliminate movies that are null for genre

In [6]:
basics['runtimeMinutes'] = basics['runtimeMinutes'].dropna()
basics['genres'] = basics['genres'].dropna()

In [7]:
basics.isna().sum()

tconst                  0
titleType               0
primaryTitle           11
originalTitle          11
isAdult                 0
startYear               0
endYear                 0
runtimeMinutes    6755417
genres                 10
dtype: int64

### keep only titleType==Movie

In [8]:
basics['titleType'] = basics['titleType'].replace({'\\N':np.nan})
basics['titleType'] = basics.drop(basics.loc[basics['titleType']!="movie"].index, inplace=True)
basics['titleType'].value_counts()

Series([], Name: titleType, dtype: int64)

### keep startYear 2000-2022

In [9]:
basics['startYear'].describe()

count     621341
unique       136
top           \N
freq       82833
Name: startYear, dtype: object

In [10]:
basics['startYear'] = basics['startYear'].replace({'\\N':np.nan})
basics['startYear'].describe()
basics['startYear'] = basics['startYear'].dropna()
basics['startYear'] = basics['startYear'].astype("float")

#### There's a problem here, I don't know what

In [11]:
basics['startYear'] = basics.drop(basics.loc[(basics['startYear'] < 2022) & (basics['startYear'] > 1999)].index, inplace=True)
basics['startYear'].describe()

count       0
unique      0
top       NaN
freq      NaN
Name: startYear, dtype: object

In [12]:
basics.dtypes

tconst            object
titleType         object
primaryTitle      object
originalTitle     object
isAdult           object
startYear         object
endYear           object
runtimeMinutes    object
genres            object
dtype: object

### Eliminate movies that include "Documentary" in genre (see tip below)

In [13]:
is_documentary = basics['genres'].str.contains('documentary',case=False)
basics = basics[~is_documentary]

### Keep only US movies

In [14]:
akas['region'] = akas['region'].replace({'\\N':np.nan})
akas['region'] = akas.drop(akas.loc[akas['region']!="us"].index, inplace=True)
akas['region'].value_counts()

Series([], Name: region, dtype: int64)

In [15]:
keepers =basics['tconst'].isin(akas['titleId'])
keepers

8          False
498        False
570        False
587        False
610        False
           ...  
9228378    False
9228393    False
9228414    False
9228418    False
9228627    False
Name: tconst, Length: 321406, dtype: bool

In [16]:
basics = basics[keepers]
basics

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres


## Saving Files
### Creating a "Data" folder.

In [17]:
import os
os.makedirs('Data/',exist_ok=True) 
os.listdir("Data/")

['title_akas.csv.gz', 'title_basics.csv.gz', 'title_ratings.csv.gz']

### Saving Compressed Files

In [18]:
basics.to_csv("Data/title_basics.csv.gz",compression='gzip',index=False)

In [19]:
ratings.to_csv("Data/title_ratings.csv.gz",compression='gzip',index=False)

In [20]:
akas.to_csv("Data/title_akas.csv.gz",compression='gzip',index=False)