# Handling IMDB Data
- Clean IMDB datasets
- Merge `title.basics` and `title.ratings` to obtain movie titles, genres, and ratings.
- Generate new .csv files

In [None]:
import pandas as pd
import pandas as pd
import numpy as np
from datetime import datetime
import re

In [None]:
import sys
sys.path.append('../utils')
import functions

In [None]:
import csv

# Explicit dtypes for title.basics.tsv. Year/flag columns use the nullable
# 'Int64' dtype so missing values do not force a float column;
# 'runtimeMinutes' is read as text.
dtype_dict = {
    'tconst': str,
    'titleType': str,
    'primaryTitle': str,
    'originalTitle': str,
    'isAdult': 'Int64',
    'startYear': 'Int64',
    'endYear': 'Int64',
    'runtimeMinutes': str,
    'genres': str
}

# The IMDb dumps are plain tab-separated text with no field quoting, and
# titles can contain literal double-quote characters. With the default
# quoting mode a title that starts with '"' is treated as an opening quote,
# which swallows the following tab(s)/line(s) and shifts or drops data.
# QUOTE_NONE makes the parser treat '"' as an ordinary character.
# '\N' is the marker IMDb uses for missing values.
title_basics = pd.read_csv(
    '../data/local/raw/title.basics.tsv',
    sep='\t',
    dtype=dtype_dict,
    na_values=['\\N', ''],
    quoting=csv.QUOTE_NONE,
    low_memory=True,
)

In [None]:
# Preview the first rows of the freshly loaded table.
title_basics.head()

## Handle Columns
- Drop columns
- Standardize and change column names

In [None]:
# Discard the columns that are not needed in the cleaned output.
unused_columns = ['endYear', 'isAdult']
title_basics = title_basics.drop(columns=unused_columns)

In [None]:
# Normalize the header names with the project helper first; the keys used
# below assume it yields snake_case names (e.g. 'title_type') — see
# functions.standardize_column_names.
title_basics = functions.standardize_column_names(title_basics)

# Shorter names used for the rest of the notebook.
# 'original_title' maps to itself, i.e. it is kept unchanged.
new_column_names = dict(
    title_type='type',
    primary_title='title',
    original_title='original_title',
    start_year='year',
    runtime_minutes='runtime',
    genres='genre',
)

title_basics = title_basics.rename(columns=new_column_names)
title_basics.head()

## Handle Rows

Remove the following genres: Talk-Show, Reality-TV, News, Game-Show, Short, Adult

In [None]:
# Genres that disqualify a title from the cleaned dataset.
genres_to_remove = ['Talk-Show', 'Reality-TV', 'News', 'Game-Show', 'Short', 'Adult']

# A single alternation pattern, matched case-insensitively anywhere in the
# genre string; rows with a missing genre are treated as "no match" and kept.
genre_pattern = '|'.join(genres_to_remove)
has_unwanted_genre = title_basics['genre'].str.contains(genre_pattern, case=False, na=False)

rows_before = len(title_basics)
title_basics = title_basics[~has_unwanted_genre]
rows_after = len(title_basics)

rows_deleted = rows_before - rows_after
print(f'Number of rows deleted: {rows_deleted}')

In [None]:
# Preview the first rows after the genre filter.
title_basics.head()

Keep only the 'movie' and 'tvMovie' title types

In [None]:
# Show which title types are present before filtering.
print(title_basics['type'].unique())

# Only feature films and TV movies are kept; every other type is discarded.
valid_title_types = ['movie', 'tvMovie']
is_valid_type = title_basics['type'].isin(valid_title_types)

title_basics = title_basics[is_valid_type]

Generate 'clean_title' column

In [None]:
# Add the 'clean_title' column produced by the project helper.
# assign() returns a new DataFrame instead of writing into the existing one:
# title_basics is a boolean-filtered selection at this point, and adding a
# column to such a selection with item assignment triggers pandas'
# SettingWithCopyWarning.
title_basics = title_basics.assign(
    clean_title=functions.prepare_clean_titles(title_basics, 'title')
)

In [None]:
# Preview the first rows, now including the 'clean_title' column.
title_basics.head()

Find duplicated rows, first based on the 'clean_title' column and then on the 'original_title' column, and drop the repeated instances

In [None]:
# Deduplicate on the 'clean_title' column. Which of the repeated rows is kept
# is decided inside functions.clean_and_remove_duplicates (not shown here).
title_basics = functions.clean_and_remove_duplicates(title_basics, column_name='clean_title')

# Preview the first rows of the result.
title_basics.head()

In [None]:
# Second deduplication pass, this time keyed on 'original_title'
# (same project helper as the 'clean_title' pass).
title_basics = functions.clean_and_remove_duplicates(title_basics, column_name='original_title')

In [None]:
# Preview the first rows after both deduplication passes.
title_basics.head()

Drop 'type' column

In [None]:
# Rows have already been filtered on the title type, so the column is removed.
title_basics = title_basics.drop('type', axis=1)

In [None]:
# Summarize the titles table; the exact output is defined by the project
# helper functions.show_basic_info (not shown here).
functions.show_basic_info(title_basics)

Remove rows with future years

In [None]:
# Apply the project helper to the 'year' column; per the notebook heading it
# removes rows whose year lies in the future (implementation not shown here).
title_basics = functions.filter_future_years(title_basics, 'year')

Convert genres to lowercase and separate them with commas

In [None]:
# Replace the raw genre strings with the helper's cleaned version
# (formatting rules live in functions.clean_genres).
cleaned_genres = functions.clean_genres(title_basics, 'genre')
title_basics['genre'] = cleaned_genres

# Show a larger sample than usual to eyeball the new genre format.
title_basics.head(20)

Sort DF by year, reset index

In [None]:
# Order the titles by year, then renumber the rows from 0 and discard the
# old index instead of keeping it as a column.
sorted_titles = title_basics.sort_values('year')
title_basics = sorted_titles.reset_index(drop=True)

print('\nSorted DataFrame with reset index:', title_basics, sep='\n')

Create titles csv

In [None]:
# Persist the cleaned titles; the positional index is not written.
titles_csv_path = '../data/local/clean/imdb_titles.csv'
title_basics.to_csv(titles_csv_path, index=False)

## Get ratings from `title.ratings.tsv`

In [None]:
# Load the tab-separated ratings table; column dtypes are left to pandas'
# inference.
ratings_path = '../data/local/raw/title.ratings.tsv'
title_ratings = pd.read_csv(ratings_path, sep='\t')

In [None]:
# Preview the first rows of the ratings table.
title_ratings.head()

In [None]:
# Summarize the ratings table; the exact output is defined by the project
# helper functions.show_basic_info (not shown here).
functions.show_basic_info(title_ratings)

Merge on 'tconst'

In [None]:
# Left join on the IMDb identifier: every cleaned title is kept, and titles
# without a matching ratings row get NaN in the rating/vote columns.
rating_columns = title_ratings[['tconst', 'averageRating', 'numVotes']]
merged_df = title_basics.merge(rating_columns, how='left', on='tconst')

merged_df.head()

In [None]:
# Summarize the merged table; the exact output is defined by the project
# helper functions.show_basic_info (not shown here).
functions.show_basic_info(merged_df)

In [None]:
# Titles without a ratings row have NaN votes after the left merge; count
# them as 0 votes so the column can be stored as integers.
vote_counts = merged_df['numVotes'].fillna(0)
merged_df['numVotes'] = vote_counts.astype(int)

Rename columns

In [None]:
# Final, shorter column names for the exported file.
column_renames = dict(
    tconst='title_id',
    averageRating='rating',
    numVotes='votes',
)
merged_df = merged_df.rename(columns=column_renames)

merged_df.head()

Create .csv file

In [None]:
# Persist the titles-with-ratings table; the positional index is not written.
merged_csv_path = '../data/local/clean/imdb_titles_ratings.csv'
merged_df.to_csv(merged_csv_path, index=False)