# Clean movies metadata


## 1. Setup


In [2]:
import sys
# Make the parent directory importable so the `lib.*` imports in the next cell
# can resolve (assumes the notebook is run from a sub-directory of the project
# root -- TODO confirm). Guarded so that re-running this cell does not keep
# appending duplicate '..' entries to sys.path.
if '..' not in sys.path:
    sys.path.append('..')


In [3]:
import pandas as pd
import ast


from lib.types.dataset_type import DatasetType
from lib.types.source_type import SourceType

# Dataset variant to read from; it is passed to `DatasetType.<name>.path(source)`
# whenever a CSV is loaded in the cells below.
source: SourceType = SourceType.original


In [4]:
# Load the raw movies metadata and cast `id` to str in the same step, so it
# matches the string `id` key used when merging with the ratings later on.
df = pd.read_csv(DatasetType.movies_metadata.path(source)).astype({'id': str})
df.head(1)


  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0


In [5]:
df.columns.tolist()


['adult',
 'belongs_to_collection',
 'budget',
 'genres',
 'homepage',
 'id',
 'imdb_id',
 'original_language',
 'original_title',
 'overview',
 'popularity',
 'poster_path',
 'production_companies',
 'production_countries',
 'release_date',
 'revenue',
 'runtime',
 'spoken_languages',
 'status',
 'tagline',
 'title',
 'video',
 'vote_average',
 'vote_count']

## 2. Extract genres and collection (`belongs_to_collection`)


In [6]:

def apply_genre(row: pd.Series) -> pd.Series:
    """Flatten the ``genres`` and ``belongs_to_collection`` fields of one row.

    Both fields are expected to hold Python-literal strings that are parsed
    with ``ast.literal_eval``: ``genres`` a list of dicts with ``'id'`` and
    ``'name'`` keys, ``belongs_to_collection`` a single such dict.

    Args:
        row: One row of the movies metadata frame; must contain the keys
            ``'genres'`` and ``'belongs_to_collection'``.

    Returns:
        A copy of ``row`` with four additional entries:

        * ``genre_ids`` / ``genre_names`` -- ``'|'``-joined genre ids / names
          (an empty string when the genre list is empty).
        * ``btc_name`` / ``btc_id`` -- collection name and id (id as ``str``),
          or ``None`` for both when the collection is missing (``None``/NaN)
          or cannot be parsed into a dict with ``'id'`` and ``'name'``.

    Raises:
        ValueError, SyntaxError: if ``genres`` is not a valid Python literal
            (this includes a missing/NaN ``genres`` value).
        TypeError, KeyError: if a parsed genre entry is not a mapping with
            ``'id'`` and ``'name'`` keys.
    """
    # Work on a copy: pandas does not support user functions that mutate the
    # object handed to them by DataFrame.apply.
    result = row.copy()

    genres = ast.literal_eval(row['genres'])
    result['genre_ids'] = "|".join(str(genre['id']) for genre in genres)
    result['genre_names'] = "|".join(genre['name'] for genre in genres)

    # Always emit both collection keys so every returned row has the same
    # index. Missing values read by pandas are NaN (a float), not None, so
    # test for "is a string" rather than "is not None".
    btc_name = None
    btc_id = None
    raw_collection = row['belongs_to_collection']
    if isinstance(raw_collection, str):
        try:
            collection = ast.literal_eval(raw_collection)
            btc_name, btc_id = collection['name'], str(collection['id'])
        except (ValueError, SyntaxError, TypeError, KeyError):
            # Malformed literal, or a literal that is not a dict with the
            # expected keys: treat the movie as having no collection.
            btc_name = None
            btc_id = None

    result['btc_name'] = btc_name
    result['btc_id'] = btc_id
    return result


# Derive the genre_* / btc_* columns row by row, then drop the raw metadata
# columns that are no longer needed. Columns deliberately kept: budget, id,
# popularity, revenue, title, vote_average, vote_count.
cleaned_df = (
    df
    .apply(apply_genre, axis=1)
    .drop(columns=[
        'adult',
        'belongs_to_collection',
        'genres',
        'homepage',
        'imdb_id',
        'original_language',
        'original_title',
        'overview',
        'poster_path',
        'production_companies',
        'production_countries',
        'release_date',
        'runtime',
        'spoken_languages',
        'status',
        'tagline',
        'video',
    ])
)


## 3. Find average rating


In [7]:
# Load the ratings and rename the key columns; `movieId` becomes `id` so it
# shares its column name with the movies frame.
rating_df = (
    pd.read_csv(DatasetType.ratings_small.path(source))
    .rename(columns={'userId': 'user_id', 'movieId': 'id'})
)

# One row per `id`: mean of `rating` and count of non-null `user_id` values.
# `id` is cast to str afterwards, matching the str `id` of the movies frame.
avg_rating_df: pd.DataFrame = (
    rating_df
    .groupby('id')
    .agg(avg_rating=('rating', 'mean'), quantity=('user_id', 'count'))
    .reset_index()
    .astype({'id': str})
)
avg_rating_df


Unnamed: 0,id,avg_rating,quantity
0,1,3.872470,247
1,2,3.401869,107
2,3,3.161017,59
3,4,2.384615,13
4,5,3.267857,56
...,...,...,...
9061,161944,5.000000,1
9062,162376,4.500000,1
9063,162542,5.000000,1
9064,162672,3.000000,1


## 4. Merge avg_rating_df with cleaned_df


In [8]:
# Inspect dtypes and null counts before merging; `id` is the join key.
avg_rating_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9066 entries, 0 to 9065
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   id          9066 non-null   object 
 1   avg_rating  9066 non-null   float64
 2   quantity    9066 non-null   int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 212.6+ KB


In [9]:
# Inspect dtypes and null counts before merging; `id` is the join key.
cleaned_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45466 entries, 0 to 45465
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   budget        45466 non-null  object 
 1   id            45466 non-null  object 
 2   popularity    45461 non-null  object 
 3   revenue       45460 non-null  float64
 4   title         45460 non-null  object 
 5   vote_average  45460 non-null  float64
 6   vote_count    45460 non-null  float64
 7   genre_ids     45466 non-null  object 
 8   genre_names   45466 non-null  object 
 9   btc_name      4491 non-null   object 
 10  btc_id        4491 non-null   object 
dtypes: float64(3), object(8)
memory usage: 3.8+ MB


In [10]:
# Join the per-movie rating aggregates with the cleaned metadata on the string
# `id` key. No `how` is given, so this is pandas' default inner join: only ids
# present in both frames survive.
# NOTE(review): if this is the Kaggle "The Movies Dataset", ratings `movieId`
# is a MovieLens id while movies_metadata `id` is a TMDB id, so joining them
# directly would attach ratings to the wrong films (links*.csv holds the
# mapping) -- confirm before relying on this frame.
movie_df_cleaned = avg_rating_df.merge(cleaned_df, on='id')

movie_df_cleaned.head(3)


Unnamed: 0,id,avg_rating,quantity,budget,popularity,revenue,title,vote_average,vote_count,genre_ids,genre_names,btc_name,btc_id
0,2,3.401869,107,0,3.860491,0.0,Ariel,7.1,44.0,18|80,Drama|Crime,,
1,3,3.161017,59,0,2.29211,0.0,Shadows in Paradise,7.1,35.0,18|35,Drama|Comedy,,
2,5,3.267857,56,4000000,9.026586,4300000.0,Four Rooms,6.5,539.0,80|35,Crime|Comedy,,


## 5. Write to CSV


In [11]:
from lib.utils.utils import write_csv

# Hand the cleaned metadata frame to `write_csv`, targeting the cleaned path
# of the movies_metadata dataset.
# NOTE(review): this passes `cleaned_df`, not the merged `movie_df_cleaned`
# built in section 4, so avg_rating/quantity are not part of what is written
# here -- confirm this is intended.
write_csv(DatasetType.movies_metadata.cleaned_path(), cleaned_df)
