In [1]:
import pandas as pd
import polars as pl
import os

In [2]:
# Paths
IMDB_PATH = '../data/raw/imdb'
TMDB_PATH = '../data/raw/tmdb'
RAW_PARQUET_PATH = '../data/raw_parquet'
PROCESSED_PATH = '../data/processed'

In [3]:
# Make sure output folders exist
os.makedirs(RAW_PARQUET_PATH, exist_ok=True)
os.makedirs(PROCESSED_PATH, exist_ok=True)

imdb_files = {
    "basics": "title.basics.tsv",
    "akas": "title.akas.tsv",
    "crew": "title.crew.tsv",
    "episode": "title.episode.tsv",
    "principals": "title.principals.tsv",
    "ratings": "title.ratings.tsv",
    "names": "name.basics.tsv"
}

In [5]:
# Read entire files
tmdb_data = pd.read_csv(os.path.join(TMDB_PATH, 'TMDB_movie_dataset_v11.csv'))

Unnamed: 0,id,title,vote_average,vote_count,status,release_date,revenue,runtime,adult,backdrop_path,...,original_title,overview,popularity,poster_path,tagline,genres,production_companies,production_countries,spoken_languages,keywords
0,27205,Inception,8.364,34495,Released,2010-07-15,825532764,148,False,/8ZTVqvKDQ8emSGUEMjsS4yHAwrp.jpg,...,Inception,"Cobb, a skilled thief who commits corporate es...",83.952,/oYuLEt3zVCKq57qu2F8dT7NIa6f.jpg,Your mind is the scene of the crime.,"Action, Science Fiction, Adventure","Legendary Pictures, Syncopy, Warner Bros. Pict...","United Kingdom, United States of America","English, French, Japanese, Swahili","rescue, mission, dream, airplane, paris, franc..."
1,157336,Interstellar,8.417,32571,Released,2014-11-05,701729206,169,False,/pbrkL804c8yAv3zBZR4QPEafpAR.jpg,...,Interstellar,The adventures of a group of explorers who mak...,140.241,/gEU2QniE6E77NI6lCU6MxlNBvIx.jpg,Mankind was born on Earth. It was never meant ...,"Adventure, Drama, Science Fiction","Legendary Pictures, Syncopy, Lynda Obst Produc...","United Kingdom, United States of America",English,"rescue, future, spacecraft, race against time,..."
2,155,The Dark Knight,8.512,30619,Released,2008-07-16,1004558444,152,False,/nMKdUUepR0i5zn0y1T4CsSB5chy.jpg,...,The Dark Knight,Batman raises the stakes in his war on crime. ...,130.643,/qJ2tW6WMUDux911r6m7haRef0WH.jpg,Welcome to a world without rules.,"Drama, Action, Crime, Thriller","DC Comics, Legendary Pictures, Syncopy, Isobel...","United Kingdom, United States of America","English, Mandarin","joker, sadism, chaos, secret identity, crime f..."
3,19995,Avatar,7.573,29815,Released,2009-12-15,2923706026,162,False,/vL5LR6WdxWPjLPFRLe133jXWsh5.jpg,...,Avatar,"In the 22nd century, a paraplegic Marine is di...",79.932,/kyeqWdyUXW608qlYkRqosgbbJyK.jpg,Enter the world of Pandora.,"Action, Adventure, Fantasy, Science Fiction","Dune Entertainment, Lightstorm Entertainment, ...","United States of America, United Kingdom","English, Spanish","future, society, culture clash, space travel, ..."
4,24428,The Avengers,7.71,29166,Released,2012-04-25,1518815515,143,False,/9BBTo63ANSmhC4e6r62OJFuK2GL.jpg,...,The Avengers,When an unexpected enemy emerges and threatens...,98.082,/RYMX2wcKCBAr24UyPD7xwmjaTn.jpg,Some assembly required.,"Science Fiction, Action, Adventure",Marvel Studios,United States of America,"English, Hindi, Russian","new york city, superhero, shield, based on com..."


In [6]:
tmdb_data.head()

Unnamed: 0,id,title,vote_average,vote_count,status,release_date,revenue,runtime,adult,backdrop_path,...,original_title,overview,popularity,poster_path,tagline,genres,production_companies,production_countries,spoken_languages,keywords
0,27205,Inception,8.364,34495,Released,2010-07-15,825532764,148,False,/8ZTVqvKDQ8emSGUEMjsS4yHAwrp.jpg,...,Inception,"Cobb, a skilled thief who commits corporate es...",83.952,/oYuLEt3zVCKq57qu2F8dT7NIa6f.jpg,Your mind is the scene of the crime.,"Action, Science Fiction, Adventure","Legendary Pictures, Syncopy, Warner Bros. Pict...","United Kingdom, United States of America","English, French, Japanese, Swahili","rescue, mission, dream, airplane, paris, franc..."
1,157336,Interstellar,8.417,32571,Released,2014-11-05,701729206,169,False,/pbrkL804c8yAv3zBZR4QPEafpAR.jpg,...,Interstellar,The adventures of a group of explorers who mak...,140.241,/gEU2QniE6E77NI6lCU6MxlNBvIx.jpg,Mankind was born on Earth. It was never meant ...,"Adventure, Drama, Science Fiction","Legendary Pictures, Syncopy, Lynda Obst Produc...","United Kingdom, United States of America",English,"rescue, future, spacecraft, race against time,..."
2,155,The Dark Knight,8.512,30619,Released,2008-07-16,1004558444,152,False,/nMKdUUepR0i5zn0y1T4CsSB5chy.jpg,...,The Dark Knight,Batman raises the stakes in his war on crime. ...,130.643,/qJ2tW6WMUDux911r6m7haRef0WH.jpg,Welcome to a world without rules.,"Drama, Action, Crime, Thriller","DC Comics, Legendary Pictures, Syncopy, Isobel...","United Kingdom, United States of America","English, Mandarin","joker, sadism, chaos, secret identity, crime f..."
3,19995,Avatar,7.573,29815,Released,2009-12-15,2923706026,162,False,/vL5LR6WdxWPjLPFRLe133jXWsh5.jpg,...,Avatar,"In the 22nd century, a paraplegic Marine is di...",79.932,/kyeqWdyUXW608qlYkRqosgbbJyK.jpg,Enter the world of Pandora.,"Action, Adventure, Fantasy, Science Fiction","Dune Entertainment, Lightstorm Entertainment, ...","United States of America, United Kingdom","English, Spanish","future, society, culture clash, space travel, ..."
4,24428,The Avengers,7.71,29166,Released,2012-04-25,1518815515,143,False,/9BBTo63ANSmhC4e6r62OJFuK2GL.jpg,...,The Avengers,When an unexpected enemy emerges and threatens...,98.082,/RYMX2wcKCBAr24UyPD7xwmjaTn.jpg,Some assembly required.,"Science Fiction, Action, Adventure",Marvel Studios,United States of America,"English, Hindi, Russian","new york city, superhero, shield, based on com..."


In [14]:
try:
    imdb_basics = pd.read_csv(
        os.path.join(IMDB_PATH, imdb_files['basics']),
        sep='\t',  # Confirm the delimiter if it's a TSV file
        low_memory=False,
        na_values=['\\N'],  # Handle '\N' as NaN
        on_bad_lines='skip'  # Skip over lines with too many fields
    )
    print(imdb_basics.head())

except Exception as e:
    print(f"An error occurred: {e}")

      tconst titleType            primaryTitle           originalTitle  \
0  tt0000001     short              Carmencita              Carmencita   
1  tt0000002     short  Le clown et ses chiens  Le clown et ses chiens   
2  tt0000003     short            Poor Pierrot          Pauvre Pierrot   
3  tt0000004     short             Un bon bock             Un bon bock   
4  tt0000005     short        Blacksmith Scene        Blacksmith Scene   

   isAdult  startYear  endYear runtimeMinutes                    genres  
0      0.0     1894.0      NaN              1         Documentary,Short  
1      0.0     1892.0      NaN              5           Animation,Short  
2      0.0     1892.0      NaN              5  Animation,Comedy,Romance  
3      0.0     1892.0      NaN             12           Animation,Short  
4      0.0     1893.0      NaN              1                     Short  


In [None]:
imdb_basics.head()

In [None]:
# Function to process each chunk
def process_chunk(chunk):
    # Process or transform data here
    return chunk

# Initialize an empty DataFrame for the results
results = pd.DataFrame()

# Read and process in chunks
for chunk in pd.read_csv(os.path.join(TMDB_PATH, 'TMDB_movie_dataset_v11.csv'), chunksize=50000):
    processed_chunk = process_chunk(chunk)
    results = pd.concat([results, processed_chunk])

# You might join after concatenating all chunks or within the loop depending on the size and nature of the join.

In [8]:
# Read data using Polars
tmdb_data = pl.read_csv(os.path.join(TMDB_PATH, 'TMDB_movie_dataset_v11.csv'), separator='\t', low_memory=False)

In [9]:
tmdb_data.head()

id,title,vote_average,vote_count,status,release_date,revenue,runtime,adult,backdrop_path,budget,homepage,imdb_id,original_language,original_title,overview,popularity,poster_path,tagline,genres,production_companies,production_countries,spoken_languages,keywords
i64,str,f64,i64,str,str,i64,i64,bool,str,i64,str,str,str,str,str,f64,str,str,str,str,str,str,str
27205,"""Inception""",8.364,34495,"""Released""","""2010-07-15""",825532764,148,False,"""/8ZTVqvKDQ8emSGUEMjsS4yHAwrp.j…",160000000,"""https://www.warnerbros.com/mov…","""tt1375666""","""en""","""Inception""","""Cobb, a skilled thief who comm…",83.952,"""/oYuLEt3zVCKq57qu2F8dT7NIa6f.j…","""Your mind is the scene of the …","""Action, Science Fiction, Adven…","""Legendary Pictures, Syncopy, W…","""United Kingdom, United States …","""English, French, Japanese, Swa…","""rescue, mission, dream, airpla…"
157336,"""Interstellar""",8.417,32571,"""Released""","""2014-11-05""",701729206,169,False,"""/pbrkL804c8yAv3zBZR4QPEafpAR.j…",165000000,"""http://www.interstellarmovie.n…","""tt0816692""","""en""","""Interstellar""","""The adventures of a group of e…",140.241,"""/gEU2QniE6E77NI6lCU6MxlNBvIx.j…","""Mankind was born on Earth. It …","""Adventure, Drama, Science Fict…","""Legendary Pictures, Syncopy, L…","""United Kingdom, United States …","""English""","""rescue, future, spacecraft, ra…"
155,"""The Dark Knight""",8.512,30619,"""Released""","""2008-07-16""",1004558444,152,False,"""/nMKdUUepR0i5zn0y1T4CsSB5chy.j…",185000000,"""https://www.warnerbros.com/mov…","""tt0468569""","""en""","""The Dark Knight""","""Batman raises the stakes in hi…",130.643,"""/qJ2tW6WMUDux911r6m7haRef0WH.j…","""Welcome to a world without rul…","""Drama, Action, Crime, Thriller""","""DC Comics, Legendary Pictures,…","""United Kingdom, United States …","""English, Mandarin""","""joker, sadism, chaos, secret i…"
19995,"""Avatar""",7.573,29815,"""Released""","""2009-12-15""",2923706026,162,False,"""/vL5LR6WdxWPjLPFRLe133jXWsh5.j…",237000000,"""https://www.avatar.com/movies/…","""tt0499549""","""en""","""Avatar""","""In the 22nd century, a paraple…",79.932,"""/kyeqWdyUXW608qlYkRqosgbbJyK.j…","""Enter the world of Pandora.""","""Action, Adventure, Fantasy, Sc…","""Dune Entertainment, Lightstorm…","""United States of America, Unit…","""English, Spanish""","""future, society, culture clash…"
24428,"""The Avengers""",7.71,29166,"""Released""","""2012-04-25""",1518815515,143,False,"""/9BBTo63ANSmhC4e6r62OJFuK2GL.j…",220000000,"""https://www.marvel.com/movies/…","""tt0848228""","""en""","""The Avengers""","""When an unexpected enemy emerg…",98.082,"""/RYMX2wcKCBAr24UyPD7xwmjaTn.jp…","""Some assembly required.""","""Science Fiction, Action, Adven…","""Marvel Studios""","""United States of America""","""English, Hindi, Russian""","""new york city, superhero, shie…"


In [16]:
import polars as pl
import os

try:
    imdb_basics = pl.read_csv(
        os.path.join(IMDB_PATH, imdb_files['basics']),
        sep='\t',  # Use sep for consistency with pandas
        null_values='\\N',  # Use a single string if all problematic columns have the same null symbol
        low_memory=False,
        infer_schema=True,  # Let Polars try to infer the schema
        infer_schema_length=10000,  # Optionally increase schema inference rows
        with_columns=['tconst', 'titleType', 'primaryTitle', 'originalTitle', 'isAdult', 'startYear', 'endYear', 'runtimeMinutes', 'genres']  # Only load specific columns if not all are needed
    )
    print(imdb_basics.head())

except Exception as e:
    print(f"An error occurred: {e}")

An error occurred: read_csv() got an unexpected keyword argument 'sep'


In [None]:
imdb_basics

In [None]:
# Joining datasets
joined_data = tmdb_data.join(imdb_basics, left_on='tmdb_id', right_on='imdb_id', how='inner')

In [None]:
imdb_basics = pd.read_csv("")

In [None]:
imdb_basics_filtered = imdb_dfs['basics'].filter(col("titleType") == "movie")

In [17]:
import dask.dataframe as dd

try:
    # Dask uses pandas-like syntax but can handle data in partitions (chunks)
    imdb_basics_dask = dd.read_csv(
        os.path.join(IMDB_PATH, imdb_files['basics']),
        sep='\t',
        assume_missing=True,  # Assume missing values and infer types
        na_values='\\N',
        usecols=['tconst', 'titleType', 'primaryTitle', 'originalTitle', 'isAdult', 'startYear', 'endYear', 'runtimeMinutes', 'genres']  # Specify columns if not all are needed
    )

    # Compute to get the actual data in pandas DataFrame format, only after all manipulations
    imdb_basics_pd = imdb_basics_dask.compute()
    print(imdb_basics_pd.head())

except Exception as e:
    print(f"An error occurred: {e}")

An error occurred: Mismatched dtypes found in `pd.read_csv`/`pd.read_table`.

+----------------+--------+----------+
| Column         | Found  | Expected |
+----------------+--------+----------+
| runtimeMinutes | object | float64  |
+----------------+--------+----------+

The following columns also raised exceptions on conversion:

- runtimeMinutes
  ValueError("could not convert string to float: 'Adult'")

Usually this is due to dask's dtype inference failing, and
*may* be fixed by specifying dtypes manually by adding:

dtype={'runtimeMinutes': 'object'}

to the call to `read_csv`/`read_table`.


  df = reader(bio, **kwargs)
  df = reader(bio, **kwargs)
  df = reader(bio, **kwargs)


In [18]:
import dask.dataframe as dd
import os

try:
    imdb_basics_dask = dd.read_csv(
        os.path.join(IMDB_PATH, imdb_files['basics']),
        sep='\t',
        assume_missing=True,  # Assume missing values for better type inference
        na_values='\\N',
        usecols=['tconst', 'titleType', 'primaryTitle', 'originalTitle', 'isAdult', 'startYear', 'endYear', 'runtimeMinutes', 'genres'],
        dtype={'startYear': 'float64', 'endYear': 'float64', 'runtimeMinutes': 'object', 'isAdult': 'object'}  # Explicitly define data types
    )

    # Compute to get the actual data in pandas DataFrame format, only after all manipulations
    imdb_basics_pd = imdb_basics_dask.compute()
    print(imdb_basics_pd.head())

except Exception as e:
    print(f"An error occurred: {e}")

      tconst titleType            primaryTitle           originalTitle  \
0  tt0000001     short              Carmencita              Carmencita   
1  tt0000002     short  Le clown et ses chiens  Le clown et ses chiens   
2  tt0000003     short            Poor Pierrot          Pauvre Pierrot   
3  tt0000004     short             Un bon bock             Un bon bock   
4  tt0000005     short        Blacksmith Scene        Blacksmith Scene   

  isAdult  startYear  endYear runtimeMinutes                    genres  
0       0     1894.0      NaN              1         Documentary,Short  
1       0     1892.0      NaN              5           Animation,Short  
2       0     1892.0      NaN              5  Animation,Comedy,Romance  
3       0     1892.0      NaN             12           Animation,Short  
4       0     1893.0      NaN              1                     Short  


In [19]:
imdb_basics_pd.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894.0,,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892.0,,5,"Animation,Short"
2,tt0000003,short,Poor Pierrot,Pauvre Pierrot,0,1892.0,,5,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892.0,,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893.0,,1,Short


In [20]:
imdb_basics_pd.shape

(11485855, 9)

In [None]:
import dask.dataframe as dd
import os

try:
    imdb_principals_dask = dd.read_csv(
        os.path.join(IMDB_PATH, imdb_files['principals']),
        sep='\t',
        assume_missing=True,  # Assume missing values for better type inference
        na_values='\\N',
        usecols=['tconst', 'titleType', 'primaryTitle', 'originalTitle', 'isAdult', 'startYear', 'endYear', 'runtimeMinutes', 'genres'],
        dtype={'startYear': 'float64', 'endYear': 'float64', 'runtimeMinutes': 'object', 'isAdult': 'object'}  # Explicitly define data types
    )

    # Compute to get the actual data in pandas DataFrame format, only after all manipulations
    imdb_basics_pd = imdb_basics_dask.compute()
    print(imdb_basics_pd.head())

except Exception as e:
    print(f"An error occurred: {e}")