In [1]:
# Step 0. Load libraries and custom functions
# Matrices and datasets ------------------------------------------------
import pandas as pd
import numpy as np
# Graphics -------------------------------------------------------------
import matplotlib.pyplot as plt
import seaborn as sns
# Text processors ------------------------------------------------------
import re
import string
#import nltk
#from nltk.corpus import stopwords
#nltk.download('stopwords')
from wordcloud import WordCloud
# Machine Learning -----------------------------------------------------
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.feature_extraction.text import CountVectorizer
# Deep Learning --------------------------------------------------------
import keras
import tensorflow as tf
from keras import layers
from keras.layers import TextVectorization
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [2]:
# Step 1. Load data
# 1.1 Read the TMDB movies csv into `df_raw`, then print its column summary
#     (column names, non-null counts and dtypes).
df_raw = pd.read_csv(filepath_or_buffer='../data/02_TMDB_5000_movies.csv')
df_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4803 entries, 0 to 4802
Data columns (total 20 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   budget                4803 non-null   int64  
 1   genres                4803 non-null   object 
 2   homepage              1712 non-null   object 
 3   id                    4803 non-null   int64  
 4   keywords              4803 non-null   object 
 5   original_language     4803 non-null   object 
 6   original_title        4803 non-null   object 
 7   overview              4800 non-null   object 
 8   popularity            4803 non-null   float64
 9   production_companies  4803 non-null   object 
 10  production_countries  4803 non-null   object 
 11  release_date          4802 non-null   object 
 12  revenue               4803 non-null   int64  
 13  runtime               4801 non-null   float64
 14  spoken_languages      4803 non-null   object 
 15  status               

In [3]:
# 1.2 Preview 10 randomly drawn rows; random_state is fixed at 2024 so the
#     same rows are drawn on every run.
df_raw.sample(n=10, random_state=2024)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
2182,0,"[{""id"": 16, ""name"": ""Animation""}, {""id"": 10751...",,13682,[],en,Pooh's Heffalump Movie,Who or what exactly is a Heffalump? The lovabl...,9.03154,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}]","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2005-02-11,0,68.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,There's something new in the Hundred Acre Wood.,Pooh's Heffalump Movie,6.4,88
3274,8000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 53, ""nam...",,13154,"[{""id"": 1794, ""name"": ""yakuza""}, {""id"": 12670,...",en,Showdown in Little Tokyo,"An American with a Japanese upbringing, Chris ...",8.403859,"[{""name"": ""Original Pictures"", ""id"": 4234}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",1991-08-23,2275557,79.0,"[{""iso_639_1"": ""ja"", ""name"": ""\u65e5\u672c\u8a...",Released,One's a warrior. One's a wise guy. They're two...,Showdown in Little Tokyo,5.7,95
1003,49000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 35, ""nam...",,9548,"[{""id"": 578, ""name"": ""rock and roll""}, {""id"": ...",en,The Adventures of Ford Fairlane,"Ford ""Mr. Rock n' Roll Detective"" Fairlane is ...",2.808428,"[{""name"": ""Twentieth Century Fox Film Corporat...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",1990-07-11,20423389,104.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,Kojak. Columbo. Dirty Harry. Wimps.,The Adventures of Ford Fairlane,6.2,71
1383,32000000,"[{""id"": 18, ""name"": ""Drama""}]",,13920,"[{""id"": 5565, ""name"": ""biography""}, {""id"": 605...",en,Radio,"High school football coach, Harold Jones befri...",9.254647,"[{""name"": ""Revolution Studios"", ""id"": 497}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2003-10-24,52277485,109.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,His courage made them champions.,Radio,6.8,141
2724,18339750,"[{""id"": 18, ""name"": ""Drama""}, {""id"": 36, ""name...",http://www.downfallthefilm.com/,613,"[{""id"": 220, ""name"": ""berlin""}, {""id"": 351, ""n...",de,Der Untergang,"In April of 1945, Germany stands at the brink ...",32.445895,"[{""name"": ""Degeto Film"", ""id"": 986}, {""name"": ...","[{""iso_3166_1"": ""AT"", ""name"": ""Austria""}, {""is...",2004-09-08,92180910,156.0,"[{""iso_639_1"": ""hu"", ""name"": ""Magyar""}, {""iso_...",Released,"April 1945, a nation awaits its...Downfall",Downfall,7.7,1037
3340,7000000,"[{""id"": 18, ""name"": ""Drama""}, {""id"": 10749, ""n...",,713,"[{""id"": 128, ""name"": ""love triangle""}, {""id"": ...",en,The Piano,"After a long voyage from Scotland, pianist Ada...",17.681707,"[{""name"": ""New South Wales Film & Television O...","[{""iso_3166_1"": ""NZ"", ""name"": ""New Zealand""}, ...",1993-05-19,116700000,121.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,,The Piano,7.1,281
463,0,"[{""id"": 10749, ""name"": ""Romance""}, {""id"": 18, ...",,161795,"[{""id"": 9673, ""name"": ""love""}, {""id"": 14638, ""...",en,Déjà Vu,L.A. shop owner Dana and Englishman Sean meet ...,0.605645,"[{""name"": ""Rainbow Film Company, The"", ""id"": 2...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",1998-04-22,0,117.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,Your future is set...,Déjà Vu,8.0,1
4168,0,"[{""id"": 18, ""name"": ""Drama""}, {""id"": 53, ""name...",,356987,"[{""id"": 230912, ""name"": ""supervivencia""}]",en,Abandoned,When their yacht capsizes during a storm; four...,3.068463,"[{""name"": ""Making Movies"", ""id"": 71702}]","[{""iso_3166_1"": ""NZ"", ""name"": ""New Zealand""}]",2015-08-30,0,82.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,,Abandoned,5.8,27
4057,2160000,"[{""id"": 18, ""name"": ""Drama""}]",,43610,[],en,The Valley of Decision,Mary Rafferty comes from a poor family of stee...,0.1813,"[{""name"": ""Metro-Goldwyn-Mayer (MGM)"", ""id"": 8...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",1945-06-01,9132000,119.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,,The Valley of Decision,5.8,4
4456,800000,"[{""id"": 18, ""name"": ""Drama""}, {""id"": 10749, ""n...",http://www.lhp.com.sg/victor/,25461,"[{""id"": 10183, ""name"": ""independent film""}]",en,Raising Victor Vargas,"The film follows Victor, a Lower East Side tee...",3.643662,[],"[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2002-05-16,2816116,88.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,,Raising Victor Vargas,7.8,13


### References
[1] https://github.com/PhilChodrow/PIC16B/blob/7d12d32e070e7ff3840b971c0ce4185ef1911796/discussion/tmdb.ipynb#L758