In [None]:
import pandas as pd
import pandas as pd
import numpy as np
from datetime import datetime
import re

In [None]:
import sys
sys.path.append('../utils')
import functions

In [None]:
# Explicit dtypes so pandas does not have to infer a type per column.
# The integer columns use the nullable 'Int64' dtype so that missing values
# (fields read as '\N' or empty, see na_values below) can be represented.
# runtimeMinutes is deliberately read as text.
dtype_dict = {
    'tconst': str,
    'titleType': str,
    'primaryTitle': str,
    'originalTitle': str,
    'isAdult': 'Int64',
    'startYear': 'Int64',
    'endYear': 'Int64',
    'runtimeMinutes': str,
    'genres': str
}

# quoting=3 is csv.QUOTE_NONE: double quotes are kept as literal characters.
# With the default quote handling, a title that begins with '"' opens a quoted
# field that can swallow the following tab separators (and even line breaks),
# which shifts values into the wrong columns or merges rows.
title_basics = pd.read_csv(
    '../data/local/title.basics.tsv',
    sep='\t',
    dtype=dtype_dict,
    na_values=['\\N', ''],
    quoting=3,
    low_memory=True,
)

In [None]:
title_basics.head()

## Handle Columns
- Drop columns
- Standardize and change column names

In [None]:
# These two columns are not used by the rest of the notebook
unused_columns = ['endYear', 'isAdult']
title_basics = title_basics.drop(columns=unused_columns)

In [None]:
# Normalise the header names with the project helper before shortening them.
# NOTE(review): the keys below assume the helper yields snake_case names
# (e.g. 'titleType' -> 'title_type') — confirm against functions.standardize_column_names.
title_basics = functions.standardize_column_names(title_basics)

# Only columns that actually change are listed; any column not named here
# (such as 'original_title') keeps its current name.
new_column_names = {
    'title_type': 'type',
    'primary_title': 'title',
    'start_year': 'year',
    'runtime_minutes': 'runtime',
    'genres': 'genre'
}

# Assign the renamed frame instead of mutating in place, so re-running the
# cell cannot act on a half-modified object.
title_basics = title_basics.rename(columns=new_column_names)
title_basics.head()

## Handle Rows

Remove the following genres: Talk-Show, Reality-TV, News, Game-Show, Short, Adult

In [None]:
# Genre labels whose titles are excluded from the dataset
genres_to_remove = ['Talk-Show', 'Reality-TV', 'News', 'Game-Show', 'Short', 'Adult']

rows_before = len(title_basics)

# A single alternation pattern is matched case-insensitively against the genre
# text; rows with a missing genre count as "no match" and are therefore kept.
unwanted_pattern = '|'.join(genres_to_remove)
has_unwanted_genre = title_basics['genre'].str.contains(unwanted_pattern, case=False, na=False)
title_basics = title_basics.loc[~has_unwanted_genre]

rows_after = len(title_basics)
rows_deleted = rows_before - rows_after

print(f'Number of rows deleted: {rows_deleted}')

In [None]:
title_basics.head()

Keep only the 'movie' and 'tvMovie' title types

In [None]:
# Show which title types are present before filtering
print(title_basics['type'].unique())

# Only these title types are kept; every other row is dropped
valid_title_types = ['movie', 'tvMovie']

# .copy() makes the filtered frame an independent object, so columns assigned
# to it later in the notebook do not raise SettingWithCopyWarning.
title_basics = title_basics[title_basics['type'].isin(valid_title_types)].copy()

Generate 'clean_title' column

In [None]:
def clean_movie_titles(df, title_col, clean_col):
    """Write a cleaned version of ``df[title_col]`` into ``df[clean_col]``.

    Each string title has bracket characters and other special characters
    removed and its whitespace collapsed to single spaces; letter case is
    left unchanged. Non-string values become ``None``.

    Args:
        df: DataFrame holding the titles. It is modified in place.
        title_col: Name of the column to read titles from.
        clean_col: Name of the column that receives the cleaned titles.

    Returns:
        The same ``df`` object, now containing ``clean_col``.
    """
    def clean_title(title):
        # Anything that is not a string (e.g. a missing value) has no cleaned form
        if isinstance(title, str):
            # Strip every kind of bracket first
            unbracketed = re.sub(r'[\[\]\(\)\{\}]', '', title)
            # Keep only word characters, whitespace and the listed accented letters
            kept = re.sub(r'[^\w\sàáâäãåçèéêëìíîïñòóôöõùúûüýÿÀÁÂÄÃÅÇÈÉÊËÌÍÎÏÑÒÓÔÖÕÙÚÛÜÝ]', '', unbracketed)
            # Squeeze whitespace runs to one space and trim both ends
            return re.sub(r'\s+', ' ', kept).strip()
        return None

    cleaned_values = df[title_col].apply(clean_title)
    df[clean_col] = cleaned_values
    return df

# Adds a 'clean_title' column to title_basics in place; the returned frame is
# the cell's last expression, so it is what the cell displays.
# NOTE(review): 'clean_title' is reassigned by the clean_titles(...) call in a
# later cell, so the values produced here are superseded — confirm whether this
# call is still needed.
clean_movie_titles(title_basics, 'title', 'clean_title')

In [None]:
def clean_titles(df, column_name):
    """Return a normalised, lowercase version of a title column.

    For every string value, special characters are removed, whitespace is
    collapsed to single spaces and trimmed, and the text is lowercased.
    Non-string values (e.g. missing titles) become ``None``.

    Args:
        df: DataFrame containing the titles. It is not modified.
        column_name: Name of the column holding the raw titles.

    Returns:
        A Series of cleaned titles with the same index as ``df``.
    """
    def clean_title(title):
        if not isinstance(title, str):
            return None
        # For str patterns \w is Unicode-aware, so accented letters are kept
        # without having to list them explicitly.
        title = re.sub(r'[^\w\s]', '', title)
        # Collapse whitespace only AFTER punctuation is removed; doing it first
        # leaves double spaces ("Fast & Furious" -> "fast  furious") and
        # leading spaces ("... And" -> " and"), which breaks title matching.
        title = re.sub(r'\s+', ' ', title).strip()
        return title.lower()

    return df[column_name].apply(clean_title)

# Store the cleaned, lowercased form of 'title' in 'clean_title'
# (this replaces any 'clean_title' column that already exists).
title_basics['clean_title'] = clean_titles(title_basics, 'title')

In [None]:
title_basics.head()

Find duplicated rows based on the 'clean_title' column, then on the 'original_title' column, and drop the repeated instances

In [None]:
# De-duplicate using the 'clean_title' column via the project helper.
# NOTE(review): which duplicate is kept, and what extra cleaning the helper
# applies, is defined in functions.clean_and_remove_duplicates — confirm there.
title_basics = functions.clean_and_remove_duplicates(title_basics, column_name='clean_title')

title_basics.head()


In [None]:
# Second de-duplication pass, this time keyed on 'original_title'.
# NOTE(review): presumably catches titles whose 'clean_title' differs but whose
# original title is the same — verify this is the intended rule.
title_basics = functions.clean_and_remove_duplicates(title_basics, column_name='original_title')

In [None]:
title_basics.head()


Drop 'type' column

In [None]:
# 'type' is no longer needed once the rows have been filtered on it
title_basics = title_basics.drop(columns='type')

In [None]:
functions.show_basic_info(title_basics)

Remove rows with future years

In [None]:
# Apply the project helper to the 'year' column.
# NOTE(review): presumably drops rows whose year lies after the current year;
# how missing years are treated is not visible here — confirm in functions.filter_future_years.
title_basics = functions.filter_future_years(title_basics, 'year')

Convert genres to lowercase and separate them with a comma followed by a space

In [None]:
def clean_genres(df, column_name):
    """Return a lowercase genre column with uniform ", " separators.

    Args:
        df: DataFrame containing the genre column. It is not modified.
        column_name: Name of the column holding the genre strings.

    Returns:
        A Series in which every string value is lowercased and every comma
        (with any whitespace after it) is replaced by a comma plus one space.
        Non-string values become ``None``.
    """
    def clean_genre(genre):
        # Missing or otherwise non-text entries have no cleaned form
        if not isinstance(genre, str):
            return None
        lowered = genre.lower()
        # Normalise "a,b" and "a,   b" alike to "a, b"
        return re.sub(r',\s*', ', ', lowered)

    genre_values = df[column_name]
    return genre_values.apply(clean_genre)

# Overwrite 'genre' with its lowercase, ", "-separated form, then preview 20 rows
title_basics['genre'] = clean_genres(title_basics, 'genre')
title_basics.head(20)

Sort DF by year, reset index

In [None]:
# Order the rows by 'year'; ignore_index=True relabels them 0..n-1 in the new order
title_basics = title_basics.sort_values('year', ignore_index=True)

print('\nSorted DataFrame with reset index:')
print(title_basics)

Create titles csv

In [None]:
# Persist the cleaned titles; index=False keeps the row index out of the file
title_basics.to_csv('../data/clean/imdb_titles.csv', index=False)

## Get ratings from `title.ratings.tsv`

In [None]:
# Treat '\N' as a missing value, the same convention used when reading
# title.basics.tsv, so a missing rating or vote count is parsed as NaN instead
# of turning the whole column into text.
title_ratings = pd.read_csv('../data/local/title.ratings.tsv', sep='\t', na_values=['\\N'])

In [None]:
title_ratings.head()

In [None]:
functions.show_basic_info(title_ratings)

Merge on 'tconst'

In [None]:

# Left-join the rating columns onto the cleaned titles by 'tconst';
# titles that have no rating row are kept with missing rating values.
rating_columns = title_ratings[['tconst', 'averageRating', 'numVotes']]
merged_df = title_basics.merge(rating_columns, on='tconst', how='left')

# Preview the merged frame
print(merged_df.head())

In [None]:
functions.show_basic_info(merged_df)

In [None]:
# Rows without a match in the left merge have no vote count: use 0, then cast to int
votes_filled = merged_df['numVotes'].fillna(0)
merged_df['numVotes'] = votes_filled.astype(int)

Rename columns

In [None]:
# Final names for the identifier and the two rating columns
ratings_column_map = {'tconst': 'title_id', 'averageRating': 'rating', 'numVotes': 'votes'}
merged_df = merged_df.rename(columns=ratings_column_map)

# Preview the renamed frame
print(merged_df.head())

Create .csv file

In [None]:
# Persist the titles joined with their ratings; index=False keeps the row index out of the file
merged_df.to_csv('../data/clean/imdb_titles_ratings.csv', index=False)