## **I) IMPORTS & EXPLORATION**

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

### **1) DATASETS URLS**

In [2]:
namebasics_url = "https://datasets.imdbws.com/name.basics.tsv.gz"
akas_url = "https://datasets.imdbws.com/title.akas.tsv.gz"
basics_url = "https://datasets.imdbws.com/title.basics.tsv.gz"
crew_url = "https://datasets.imdbws.com/title.crew.tsv.gz"
principals_url = "https://datasets.imdbws.com/title.principals.tsv.gz"
ratings_url ="https://datasets.imdbws.com/title.ratings.tsv.gz"

### **2) LOADING DATASETS & CLEANING**

#### **2.1 ratings dataset**

In [3]:
# ratings dataset loading: 
ratings = pd.read_csv(ratings_url, sep='\t', na_values='\\N', dtype={'averageRating': float, 'numVotes': int, 'tconst': str})
ratings.head()

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,2187
1,tt0000002,5.5,307
2,tt0000003,6.5,2275
3,tt0000004,5.1,196
4,tt0000005,6.2,3012


In [None]:
# Exploration of ratings dataset
print(ratings.info())
print(ratings.duplicated().sum())
print(ratings.isnull().sum())
ratings['numVotes'].describe().round(2)
ratings['averageRating'].describe().round(2)

#### **2.2 basics dataset**

In [5]:
# basics dataset loading: 
basics = pd.read_csv(
	basics_url,
	sep='\t',
	na_values='\\N',
	dtype={
		'tconst': str,
		'titleType': str,
		'primaryTitle': str,
		'originalTitle': str,
		'isAdult': 'Int64',
		'startYear': 'Int64',
		'endYear': str,
		'runtimeMinutes': str,
		'genres': str
	}
)
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,,5,"Animation,Short"
2,tt0000003,short,Poor Pierrot,Pauvre Pierrot,0,1892,,5,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,,1,Short


In [6]:
# filter isAdult movies
# filter startYear starting from 1980 to today(year()) (adapting to users' age)
# filter movies include only movies (not documentaries, shorts, tvshows or series)
current_year = pd.Timestamp.now().year
basics = basics[(basics['isAdult'] == 0) &
                (basics['startYear'] >= 1980) & 
                (basics['startYear'] <= current_year) &
                (basics['titleType'] == 'movie')
                ]

#### **2.3 akas dataset**

In [8]:
# akas dataset loading:
akas = pd.read_csv(akas_url, 
                   sep='\t', 
                   na_values='\\N', 
                   dtype={'titleId': str, 
                          'ordering': int, 
                          'title': str, 
                          'region': str, 
                          'language': str, 
                          'types': str, 
                          'attributes': str, 
                          'isOriginalTitle': 'Int64'})
akas.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,1,Carmencita,,,original,,1
1,tt0000001,2,Carmencita,DE,,,literal title,0
2,tt0000001,3,Carmencita,US,,imdbDisplay,,0
3,tt0000001,4,Carmencita - spanyol tánc,HU,,imdbDisplay,,0
4,tt0000001,5,Καρμενσίτα,GR,,imdbDisplay,,0


In [None]:
# filter region 'FR' for France
akas = akas[akas['region'] == 'FR']

#### **2.4 principals dataset**

In [10]:
principals = pd.read_csv(principals_url, 
                         sep='\t',
                         na_values='\\N',
                         dtype={'tconst': str,
                                'ordering': int,
                                'nconst': str,
                                'category': str,
                                'job': str,
                                'characters': str})
principals.head()

Unnamed: 0,tconst,ordering,nconst,category,job,characters
0,tt0000001,1,nm1588970,self,,"[""Self""]"
1,tt0000001,2,nm0005690,director,,
2,tt0000001,3,nm0005690,producer,producer,
3,tt0000001,4,nm0374658,cinematographer,director of photography,
4,tt0000002,1,nm0721526,director,,


In [11]:
# filter principals to keep only actors and actresses and directors
# keep only theses columns : tconst, nconst, category
# remove duplicates if any
# check for nulls (no nulls) 
principals = principals[principals['category'].isin(['actor', 'actress', 'director'])]
principals = principals[['tconst', 'nconst', 'category']]
principals = principals.drop_duplicates()
principals.info()

<class 'pandas.core.frame.DataFrame'>
Index: 46934247 entries, 1 to 96798092
Data columns (total 3 columns):
 #   Column    Dtype 
---  ------    ----- 
 0   tconst    object
 1   nconst    object
 2   category  object
dtypes: object(3)
memory usage: 1.4+ GB


#### **2.4 namebasics dataset**

In [17]:
namebasics = pd.read_csv(namebasics_url, 
                        sep='\t',      
                        na_values='\\N',
                        dtype={'nconst': str,
                               'primaryName': str,
                               'birthYear': 'Int64',    
                               'deathYear': 'Int64',
                               'primaryProfession': str,
                               'knownForTitles': str})
namebasics.head()

Unnamed: 0,nconst,primaryName,birthYear,deathYear,primaryProfession,knownForTitles
0,nm0000001,Fred Astaire,1899,1987.0,"actor,miscellaneous,producer","tt0072308,tt0050419,tt0027125,tt0025164"
1,nm0000002,Lauren Bacall,1924,2014.0,"actress,miscellaneous,soundtrack","tt0037382,tt0075213,tt0038355,tt0117057"
2,nm0000003,Brigitte Bardot,1934,,"actress,music_department,producer","tt0057345,tt0049189,tt0056404,tt0054452"
3,nm0000004,John Belushi,1949,1982.0,"actor,writer,music_department","tt0072562,tt0077975,tt0080455,tt0078723"
4,nm0000005,Ingmar Bergman,1918,2007.0,"writer,director,actor","tt0050986,tt0069467,tt0083922,tt0050976"


In [23]:
namebasics.info()
namebasics= namebasics.dropna(subset=["primaryName"])

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14966884 entries, 0 to 14966883
Data columns (total 6 columns):
 #   Column             Dtype 
---  ------             ----- 
 0   nconst             object
 1   primaryName        object
 2   birthYear          Int64 
 3   deathYear          Int64 
 4   primaryProfession  object
 5   knownForTitles     object
dtypes: Int64(2), object(4)
memory usage: 713.7+ MB


## **II) FINAL DATASETS** 

In [None]:
# We keep and export the cleaned datasets: ratings and principals (distinguish actors/actresses and directors)
# Merge basics and akas into a new dataframe called movies 
movies = pd.merge(basics, akas, left_on='tconst', right_on='titleId', how='inner')

In [14]:
# we keep only useful columns
movies = movies[['tconst', 'primaryTitle', 'originalTitle', 'startYear', 'runtimeMinutes', 'genres', 'region', 'title']]
movies.head()

Unnamed: 0,tconst,primaryTitle,originalTitle,startYear,runtimeMinutes,genres,region,title
0,tt0035423,Kate & Leopold,Kate & Leopold,2001,118,"Comedy,Fantasy,Romance",FR,Kate et Léopold
1,tt0036606,"Another Time, Another Place","Another Time, Another Place",1983,118,"Drama,War",FR,Les Coeurs captifs
2,tt0038687,Let There Be Light,Let There Be Light,1980,58,"Documentary,War",FR,Que la lumière soit
3,tt0048550,Rendez-vous of the Docks,Le rendez-vous des quais,1990,75,Drama,FR,Le rendez-vous des quais
4,tt0059325,Born in '45,Jahrgang 45,1990,100,"Drama,Romance",FR,Génération 45


In [36]:
# we create a directors dataframe by filtering principals an merging with namebasics to get director names
# we merge with movies to keep only directors of our movies dataset
directors = principals[principals['category'] == 'director']
directors = pd.merge(directors, namebasics, on='nconst', how='left')
directors = directors[['tconst', 'nconst', 'primaryName', 'birthYear', 'deathYear']]
directors = pd.merge(directors, movies[['tconst']], on='tconst', how='inner')


In [38]:
# we create a actors_actresses dataframe by filtering principals an merging with namebasics to get actor/actress names
# we merge with movies to keep only actors/actresses of our movies dataset
actors = principals[principals['category'].isin(['actor', 'actress'])]
actors = pd.merge(actors, namebasics, on='nconst', how='left')
actors = actors[['tconst', 'nconst', 'category', 'primaryName', 'birthYear', 'deathYear']]
actors = pd.merge(actors, movies[['tconst']], on='tconst', how='inner')

## **III) EXPORT FOR VISUALIZATION** 

In [40]:
export_path = "C://Users/barba/Case_studies/Cinema_recommender/cleaned_data"
for df, name in zip([movies, ratings, directors, actors], 
                    ['movies', 'ratings', 'directors', 'actors']):
    file_path = os.path.join(export_path, f"{name}.csv")
    df.to_csv(file_path, index=False)  