### Get overviews, taglines and countries from the raw TMDB dataset, add them to the clean df, rename the new column, and generate a new file.

In [1]:
import pandas as pd

In [2]:
import sys
sys.path.append('../utils')
import data_cleaning
import data_inspection
import helpers

In [3]:
# Read the two input tables: the clean film table that will be enriched and
# the raw TMDB export that supplies the extra columns.
clean_csv_path = '../data/clean/clean_films_id.csv'
raw_csv_path = '../data/local/raw/TMDB_released_movies.csv'

tmdb_clean = pd.read_csv(clean_csv_path)
tmdb_raw = pd.read_csv(raw_csv_path)

In [None]:
# Inspect the raw TMDB table with the project's inspection helper.
# NOTE(review): what exactly gets reported is defined in
# ../utils/data_inspection, which is not visible here — confirm there.
data_inspection.show_basic_info(tmdb_raw)

In [None]:
# Preview the first three rows of the clean table (rendered as the cell output).
tmdb_clean.head(3)

In [None]:
# Preview the first three rows of the raw table (rendered as the cell output).
tmdb_raw.head(3)

#### Get overview, tagline and production countries

In [None]:
# Attach 'production_countries', 'overview' and 'tagline' from tmdb_raw to
# tmdb_clean, matching rows on the 'tmdb_id' column present in both frames.


def _join_countries(values):
    """Join the distinct non-null country values of one tmdb_id with ', '.

    Returns None when the id has no country value at all, so the result stays
    a real missing value instead of the literal string 'nan'.
    """
    distinct = values.dropna().astype(str).unique()
    return ', '.join(distinct) if len(distinct) else None


# Only raw rows whose id occurs in tmdb_clean can contribute to a left merge,
# so filter first to keep the per-id aggregation below cheap.
raw_subset = tmdb_raw.loc[
    tmdb_raw['tmdb_id'].isin(tmdb_clean['tmdb_id']),
    ['tmdb_id', 'production_countries', 'overview', 'tagline'],
]

# Collapse the raw data to exactly one row per tmdb_id BEFORE merging, so a
# repeated raw id cannot multiply rows in tmdb_clean. Countries of repeated
# ids are concatenated with a comma; for overview and tagline the first
# non-null value is kept ('first' skips missing values).
raw_extras = raw_subset.groupby('tmdb_id', as_index=False).agg({
    'production_countries': _join_countries,
    'overview': 'first',
    'tagline': 'first',
})

# 'validate' guards the one-row-per-id assumption; the indicator column
# records which tmdb_clean rows found no partner in tmdb_raw.
tmdb_clean = tmdb_clean.merge(
    raw_extras,
    on='tmdb_id',
    how='left',
    validate='many_to_one',
    indicator='_raw_match',
)

# IDs without a match are identified from the merge indicator rather than
# from NaN countries: a film can exist in tmdb_raw and still have no
# production country recorded.
missing_ids = tmdb_clean.loc[tmdb_clean['_raw_match'] == 'left_only', 'tmdb_id']
tmdb_clean = tmdb_clean.drop(columns='_raw_match')

# Print missing IDs (if any)
if not missing_ids.empty:
    print(f'IDs not found in tmdb_raw: {missing_ids.tolist()}')
else:
    print('All IDs from tmdb_clean were found in tmdb_raw.')

print(tmdb_clean.head())


In [None]:
# Report missing values in the enriched clean table via the project helper.
# NOTE(review): the report format is defined in ../utils/data_inspection,
# which is not visible here — confirm there.
data_inspection.show_missing_values(tmdb_clean)

In [9]:
# Shorten the merged column's name: 'production_countries' becomes 'countries'.
column_renames = {'production_countries': 'countries'}
tmdb_clean = tmdb_clean.rename(columns=column_renames)

In [10]:
# Final column layout. Selecting with this list both reorders the columns and
# drops any column that is not named here; a name missing from the frame
# raises a KeyError.
new_column_order = [
    'tmdb_id',
    'imdb_id',
    'doesthedog_id',
    'title',
    'original_title',
    'genres',
    'director',
    'release_year',
    'runtime',
    'budget',
    'revenue',
    'profit',
    'popularity',
    'tmdb_rating',
    'tmdb_votes',
    'imdb_rating',
    'imdb_votes',
    'language',
    'countries',
    'overview',
    'tagline',
    'events',
    'has_warnings',
]

tmdb_clean = tmdb_clean.loc[:, new_column_order]


In [11]:
# Order the rows by 'tmdb_id' and renumber the index from 0, discarding the
# old index values.
tmdb_clean = tmdb_clean.sort_values(by='tmdb_id').reset_index(drop=True)

# The export is currently disabled; uncomment the line below to write the file.
# tmdb_clean.to_csv('../data/clean/tmdb_clean_films.csv', index=False)