In [52]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

## Create DataFrame and CSV of Movie Titles

In [54]:
# Read the tab-separated title-basics file into a DataFrame.
file_path = 'Resources/Data_from_web/title.basics.tsv'
movie_titles_df = pd.read_csv(file_path, delimiter='\t')

# List the available columns, then preview the first two rows.
print(movie_titles_df.columns)
movie_titles_df.head(n=2)

Index(['tconst', 'titleType', 'primaryTitle', 'originalTitle', 'isAdult',
       'startYear', 'endYear', 'runtimeMinutes', 'genres'],
      dtype='object')


Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"


In [55]:
# Build the cleaned movie table from the raw titles in one pipeline.
# Every step returns a new frame, so re-running this cell always starts
# again from movie_titles_df.
movie_titles_clean = (
    movie_titles_df
    # Keep only the columns used downstream.
    .loc[:, ['tconst', 'titleType', 'primaryTitle',
             'isAdult', 'startYear', 'genres']]
    # Drop rows whose startYear holds the '\N' placeholder.
    .loc[lambda titles: titles['startYear'].ne(r'\N')]
    # startYear becomes an integer and titleType a pandas string column.
    .astype({'startYear': int, 'titleType': 'string'})
    # Keep titles whose start year is 2015 through 2019 (inclusive).
    .loc[lambda titles: titles['startYear'].between(2015, 2019)]
    # Keep only rows typed as 'movie' ...
    .loc[lambda titles: titles['titleType'].eq('movie')]
    # ... and exclude rows flagged as adult.
    .loc[lambda titles: titles['isAdult'].eq(0)]
    # Guard against repeated title ids (tconst column).
    .drop_duplicates(subset=['tconst'])
)


In [56]:
# Sanity check of the cleaned frame: column dtypes, the number of
# non-null startYear entries, and a preview of the first rows.
print(movie_titles_clean.dtypes)
movie_count = movie_titles_clean['startYear'].count()
print(f"\nTHIS IS HOW MANY MOVIES: {movie_count}")
movie_titles_clean.head()

tconst          object
titleType       string
primaryTitle    object
isAdult          int64
startYear        int64
genres          object
dtype: object

THIS IS HOW MANY MOVIES: 84294


Unnamed: 0,tconst,titleType,primaryTitle,isAdult,startYear,genres
67687,tt0069049,movie,The Other Side of the Wind,0,2018,Drama
70840,tt0072306,movie,Toula ou Le génie des eaux,0,2017,Drama
76043,tt0077649,movie,Isolation of 1/8800000,0,2018,\N
98079,tt0100275,movie,The Wandering Soap Opera,0,2017,"Comedy,Drama,Fantasy"
109002,tt0111414,movie,A Thin Life,0,2018,Comedy


In [57]:
# Write the cleaned movie titles to CSV. With to_csv's defaults the
# DataFrame index is written out as the first column.
file_outpath = "Resources/movie_title.csv"
movie_titles_clean.to_csv(path_or_buf=file_outpath)