In [None]:
import pandas as pd
from datetime import date
from pyprojroot import here

In [None]:
def prepare_movie_title_tsv_tocsv(fileName):
    """Load a title-basics style TSV and return only its movie rows, cleaned.

    The file is resolved as ``data/<fileName>`` through ``here``. Every selected
    column is read as text first so the ``\\N`` placeholder survives parsing and
    can be handled explicitly below.

    Args:
        fileName: Name of the tab-separated file inside the ``data`` directory.

    Returns:
        A new DataFrame indexed by ``tconst`` holding rows whose ``titleType``
        is ``"movie"``. ``isAdult`` and ``genres`` have ``\\N`` replaced by an
        empty string, ``runtimeMinutes`` is numeric (unparseable -> missing)
        and ``startYear`` is a datetime (unparseable -> NaT).
    """
    columns = [
        'tconst', 'titleType',
        'originalTitle', 'isAdult',
        'startYear', 'runtimeMinutes',
        'genres',
    ]
    # NOTE(review): default quote handling is kept; if titles in the TSV contain
    # literal double quotes, consider quoting=csv.QUOTE_NONE - confirm against the data.
    df_titles = pd.read_csv(
        here('data/' + fileName),
        sep='\t',
        index_col='tconst',
        usecols=columns,
        dtype={column: 'string' for column in columns},
    )

    # .copy() detaches the filtered rows from the full frame so the column
    # assignments below act on an independent object (no SettingWithCopyWarning).
    df_titles = df_titles[df_titles.titleType == "movie"].copy()

    for column in ("isAdult", "genres", "runtimeMinutes"):
        df_titles[column] = df_titles[column].replace("\\N", "")

    df_titles["runtimeMinutes"] = pd.to_numeric(df_titles["runtimeMinutes"], errors="coerce")
    # An explicit year format keeps parsing independent of pandas' format
    # inference (which is driven by the first value); anything else becomes NaT.
    df_titles["startYear"] = pd.to_datetime(df_titles["startYear"], format="%Y", errors="coerce")

    return df_titles

In [None]:
def save_movie_title_to_csv(movie_title, fileName):
    """Write ``movie_title`` as CSV to ``data/<fileName>`` resolved through ``here``.

    Args:
        movie_title: DataFrame to persist (written with pandas' ``to_csv`` defaults).
        fileName: Target file name inside the ``data`` directory.
    """
    # A forward slash is used so the path resolves the same way on every
    # platform and matches how the loaders in this notebook build their paths;
    # a backslash is only a separator on Windows.
    movie_title.to_csv(here("data/" + fileName))

In [None]:
# Build the movie-only titles frame from the raw TSV and preview its first 10 rows.
df_titles = prepare_movie_title_tsv_tocsv("title-basics.tsv")
df_titles.head(10)

In [None]:
# Print column dtypes, non-null counts and memory usage of the freshly prepared frame.
df_titles.info()

In [None]:
# Persist the prepared movie titles as title-movies-filtered.csv via save_movie_title_to_csv.
save_movie_title_to_csv(df_titles, "title-movies-filtered.csv")

In [None]:
# Reload the filtered titles from the CSV written earlier in this notebook.
# The dtype map lists only the columns that file actually contains (the
# prepare step never selects primaryTitle or endYear). startYear is read as
# text here and cast to datetime in a following cell.
df_titles = pd.read_csv(here('data/title-movies-filtered.csv'), dtype={
    'tconst': 'string',
    'titleType': 'string',
    'originalTitle': 'string',
    'isAdult': 'boolean',
    'startYear': 'string',
    'runtimeMinutes': 'float64',
    'genres': 'string'
})


In [None]:
# Cast the text startYear column to datetime64[ns]; bracket access makes it
# explicit that an existing column is being replaced.
df_titles["startYear"] = df_titles["startYear"].astype("datetime64[ns]")

In [None]:
# Print dtypes and non-null counts again to check the result of the startYear cast.
df_titles.info()

In [None]:
# Order titles by startYear, latest first.
df_titles = df_titles.sort_values(by="startYear", ascending=False)
# info() prints as a side effect, so it runs first; the head() preview has to be
# the cell's last expression to be rendered (placed mid-cell it is discarded).
df_titles.info()
df_titles.head(10)

In [None]:

# Select titles whose start date lies strictly after 1 Jan 2020 and not in the future.
# NOTE(review): the variable name says "2010onwards" but the cutoff used is 2020 - confirm which is intended.
# NOTE(review): if startYear holds year-only values parsed to 1 January, the strict ">"
# drops the cutoff year itself - confirm that is wanted.
start_day = df_titles["startYear"].dt.date
after_cutoff = start_day > date(2020, 1, 1)
not_in_future = start_day <= date.today()
df_titles_2010onwards = df_titles[after_cutoff & not_in_future]
df_titles_2010onwards.info()

In [None]:
# Narrow df_titles (rebinding the same name) to rows with a start date strictly after 1 Jan 2015.
# NOTE(review): the recorded df_titles_EN output further down still lists rows with a
# missing startYear, which suggests this cell had not been run when that output was captured.
df_titles = df_titles.loc[df_titles["startYear"].dt.date > date(2015, 1, 1)]

In [None]:
# Display the current df_titles frame as the cell output.
df_titles

In [None]:
# Load the tab-separated alternate-titles file with an explicit dtype per column.
df_title_akas = pd.read_csv(
    here('data/title-akas.tsv'),
    sep='\t',
    dtype=dict(
        titleId='string',
        ordering='int64',
        title='string',
        region='string',
        language='string',
        types='string',
        attributes='string',
        isOriginalTitle='bool',
    ),
)

In [None]:
# Preview the first 10 alternate-title rows.
df_title_akas.head(10)

In [None]:
# Print dtypes, non-null counts and memory usage of the alternate-titles frame.
df_title_akas.info()

In [None]:
# Keep only the alternate-title rows whose titleId appears in df_titles.tconst.
df_title_akas_movies = df_title_akas.loc[
    df_title_akas["titleId"].isin(df_titles["tconst"])
]

df_title_akas_movies

In [16]:
# Keep alternate titles whose language code is one of the spellings EN / en / En.
df_title_akas_movies_en = df_title_akas_movies[
    df_title_akas_movies["language"].isin(["EN", "en", "En"])
]
df_title_akas_movies_en

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
2323,tt0000574,4,The Story of the Kelly Gang,SG,en,imdbDisplay,\N,False
3121,tt0000838,4,The Cultivation of the Cacao Tree,XWW,en,\N,literal English title,False
3406,tt0000941,3,Love Crazy,XWW,en,\N,informal literal title,False
3736,tt0001051,3,Magical Dream,XWW,en,\N,literal English title,False
3938,tt0001122,4,The Red Inn,XWW,en,\N,\N,False
...,...,...,...,...,...,...,...,...
50193507,tt9916362,13,Coven of Sisters,XWW,en,alternative,\N,False
50193523,tt9916362,3,Coven,CA,en,imdbDisplay,\N,False
50193524,tt9916362,4,Coven,EG,en,imdbDisplay,\N,False
50193636,tt9916428,4,The Secret of China,XWW,en,imdbDisplay,\N,False


In [None]:
# Narrow the English alternate titles to the US and GB regions.
df_title_akas_movies_en_US_GB = df_title_akas_movies_en[
    df_title_akas_movies_en["region"].isin(['US', 'GB'])
]

# Sort by title, attach the number of non-null titles sharing each titleId as
# "Counts", then order by that count (and titleId), largest first.
df_title_US_GB = (
    df_title_akas_movies_en_US_GB
    .sort_values(by="title")
    .assign(Counts=lambda frame: frame.groupby(['titleId'])['title'].transform('count'))
    .sort_values(by=['Counts', 'titleId'], ascending=False)
)

df_title_US_GB.head(50)

In [17]:
# Keep the titles that have at least one English alternate title, then summarise the result.
df_titles_EN = df_titles.loc[
    df_titles["tconst"].isin(df_title_akas_movies_en["titleId"])
]
df_titles_EN.info()

<class 'pandas.core.frame.DataFrame'>
Index: 178238 entries, 604057 to 695308
Data columns (total 7 columns):
 #   Column          Non-Null Count   Dtype         
---  ------          --------------   -----         
 0   tconst          178238 non-null  string        
 1   titleType       178238 non-null  string        
 2   originalTitle   178238 non-null  string        
 3   isAdult         178238 non-null  boolean       
 4   startYear       176221 non-null  datetime64[ns]
 5   runtimeMinutes  155660 non-null  float64       
 6   genres          168921 non-null  string        
dtypes: boolean(1), datetime64[ns](1), float64(1), string(4)
memory usage: 9.9 MB


In [24]:
# Restrict the English-title movies to start dates strictly after 1 Jan 2015
# and no later than today.
en_start_day = df_titles_EN["startYear"].dt.date
started_after_2015 = en_start_day > date(2015, 1, 1)
started_by_today = en_start_day <= date.today()
df_titles_EN_DATEFILTER = df_titles_EN[started_after_2015 & started_by_today]
df_titles_EN_DATEFILTER.info()

<class 'pandas.core.frame.DataFrame'>
Index: 53784 entries, 485223 to 613697
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   tconst          53784 non-null  string        
 1   titleType       53784 non-null  string        
 2   originalTitle   53784 non-null  string        
 3   isAdult         53784 non-null  boolean       
 4   startYear       53784 non-null  datetime64[ns]
 5   runtimeMinutes  49379 non-null  float64       
 6   genres          52963 non-null  string        
dtypes: boolean(1), datetime64[ns](1), float64(1), string(4)
memory usage: 3.0 MB


In [19]:
# Load the tab-separated principals file with explicit dtypes for the listed
# columns; any other column keeps the dtype pandas infers.
df_principals = pd.read_csv(
    here('data/title-principals.tsv'),
    sep='\t',
    dtype=dict(
        tconst='string',
        ordering='int64',
        nconst='string',
        category='string',
        job='string',
    ),
)
# Both names refer to the same frame.
title_principals = df_principals

In [25]:
# Keep the principal rows that belong to the date-filtered English-title movies.
title_principals_ENMovie = title_principals.loc[
    title_principals["tconst"].isin(df_titles_EN_DATEFILTER["tconst"])
]
title_principals_ENMovie.info()

<class 'pandas.core.frame.DataFrame'>
Index: 870160 entries, 836220 to 88714069
Data columns (total 6 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   tconst      870160 non-null  string
 1   ordering    870160 non-null  int64 
 2   nconst      870160 non-null  string
 3   category    870160 non-null  string
 4   job         870160 non-null  string
 5   characters  870160 non-null  object
dtypes: int64(1), object(1), string(4)
memory usage: 46.5+ MB


In [26]:
# List the distinct category values present among the selected principal rows.
title_principals_ENMovie.category.unique()

<StringArray>
[              'actor',             'actress',            'director',
              'writer',            'producer',            'composer',
     'cinematographer',              'editor', 'production_designer',
    'casting_director',                'self',     'archive_footage',
       'archive_sound']
Length: 13, dtype: string

In [28]:
# Keep only principals credited as actor, actress, director, writer or producer.
title_principals_ENMovie_CAT = title_principals_ENMovie.loc[
    title_principals_ENMovie["category"].isin(
        ['actor', 'actress', 'director', 'writer', 'producer']
    )
]
title_principals_ENMovie_CAT.info()

<class 'pandas.core.frame.DataFrame'>
Index: 632967 entries, 836220 to 88714069
Data columns (total 6 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   tconst      632967 non-null  string
 1   ordering    632967 non-null  int64 
 2   nconst      632967 non-null  string
 3   category    632967 non-null  string
 4   job         632967 non-null  string
 5   characters  632967 non-null  object
dtypes: int64(1), object(1), string(4)
memory usage: 33.8+ MB
