<a href="https://colab.research.google.com/github/alortiz05/DDDS-Cohort-16-Projects/blob/main/3M_IMBD_Data_Pull_Part_I.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import GroupShuffleSplit, train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer

In [None]:
pip install duckdb



In [None]:
import requests

# The seven IMDb dataset dumps whose download size is reported below.
imdb_files = [
    'https://datasets.imdbws.com/name.basics.tsv.gz',
    'https://datasets.imdbws.com/title.akas.tsv.gz',
    'https://datasets.imdbws.com/title.basics.tsv.gz',
    'https://datasets.imdbws.com/title.crew.tsv.gz',
    'https://datasets.imdbws.com/title.episode.tsv.gz',
    'https://datasets.imdbws.com/title.principals.tsv.gz',
    'https://datasets.imdbws.com/title.ratings.tsv.gz'
]

# Ask the server for headers only (HEAD request) so nothing is downloaded yet.
print("IMDb file sizes (compressed):")
for url in imdb_files:
    file_label = url.split('/')[-1]
    try:
        # A timeout keeps the cell from hanging forever on a stalled connection.
        response = requests.head(url, allow_redirects=True, timeout=30)
        # Turn 4xx/5xx answers into errors instead of reporting a bogus size.
        response.raise_for_status()
        content_length = response.headers.get('Content-Length')
        if content_length is None:
            # Without the header the size is unknown; do not report it as 0 MB.
            print(f"{file_label}: size unknown (no Content-Length header)")
            continue
        size_mb = int(content_length) / (1024 * 1024)
        print(f"{file_label}: {size_mb:.2f} MB")
    except (requests.RequestException, ValueError) as e:
        # Best effort: report the failure for this file and carry on with the rest.
        print(f"Failed to get size for {url}: {e}")

IMDb file sizes (compressed):
name.basics.tsv.gz: 272.99 MB
title.akas.tsv.gz: 434.48 MB
title.basics.tsv.gz: 197.72 MB
title.crew.tsv.gz: 73.12 MB
title.episode.tsv.gz: 47.50 MB
title.principals.tsv.gz: 687.22 MB
title.ratings.tsv.gz: 7.58 MB


In [None]:
import duckdb
import os
import requests

from google.colab import drive
drive.mount('/content/drive')

# Single leading slash, matching the path the query cell uses for the same folder.
download_dir = "/content/drive/MyDrive/Capstone Project/imdb_data"  # Save directly to your Drive
os.makedirs(download_dir, exist_ok=True)

imdb_files = [
    'https://datasets.imdbws.com/name.basics.tsv.gz',
    'https://datasets.imdbws.com/title.akas.tsv.gz',
    'https://datasets.imdbws.com/title.basics.tsv.gz',
    'https://datasets.imdbws.com/title.crew.tsv.gz',
    'https://datasets.imdbws.com/title.episode.tsv.gz',
    'https://datasets.imdbws.com/title.principals.tsv.gz',
    'https://datasets.imdbws.com/title.ratings.tsv.gz'
]

# Download each dump once; files that are already on Drive are skipped.
for url in imdb_files:
    file_name = os.path.join(download_dir, url.split('/')[-1])
    if os.path.exists(file_name):
        print(f"{file_name} already exists.")
        continue
    print(f"Downloading {url} ...")

    # Stream into a temporary ".part" file and rename it only after the download
    # finished. An interrupted run therefore never leaves a truncated file under
    # the final name, which the exists() check above would mistake for complete.
    partial_name = file_name + ".part"
    try:
        with requests.get(url, stream=True, timeout=60) as response:
            # Fail on HTTP errors instead of saving an error page as a .gz file.
            response.raise_for_status()
            with open(partial_name, 'wb') as f:
                for chunk in response.iter_content(chunk_size=1024 * 1024):
                    if chunk:
                        f.write(chunk)
        os.replace(partial_name, file_name)
    finally:
        # After a successful rename the partial file is gone; after a failure
        # this removes the leftover so the next run starts clean.
        if os.path.exists(partial_name):
            os.remove(partial_name)
    print(f"Saved to {file_name}")

Mounted at /content/drive
//content/drive/MyDrive/Capstone Project/imdb_data/name.basics.tsv.gz already exists.
//content/drive/MyDrive/Capstone Project/imdb_data/title.akas.tsv.gz already exists.
//content/drive/MyDrive/Capstone Project/imdb_data/title.basics.tsv.gz already exists.
//content/drive/MyDrive/Capstone Project/imdb_data/title.crew.tsv.gz already exists.
//content/drive/MyDrive/Capstone Project/imdb_data/title.episode.tsv.gz already exists.
//content/drive/MyDrive/Capstone Project/imdb_data/title.principals.tsv.gz already exists.
//content/drive/MyDrive/Capstone Project/imdb_data/title.ratings.tsv.gz already exists.


In [None]:
# Mount Google Drive at /content/drive so files stored there can be read and written.
# NOTE(review): the previous cell already runs this same import and mount, so this
# cell looks redundant — confirm nothing relies on it before removing.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


- `title.episode` is not used because this project only looks at movies.
- Only one `tconst` column (from `title.basics`) is selected, so the joins do not bring in duplicate `tconst` columns.

In [None]:

# Connect to DuckDB in-memory (or specify a file for persistent DB)
con = duckdb.connect()

# Ensure download_dir points to the location where files were saved
# This should match the download_dir used in the previous cell
download_dir = "/content/drive/MyDrive/Capstone Project/imdb_data"

# The query reads the local .tsv.gz files and is built from three CTEs:
#   title_data - movies from title.basics, one row per genre, with rating,
#                region and principal category attached
#   name_data  - people from name.basics, one row per (known-for title, profession)
#   combined   - title_data joined to the people known for each title
# CTEs in a WITH clause must be separated by commas; the comma between
# name_data and combined was missing and caused the parser error at "combined".
query = f"""
WITH title_data AS (
    SELECT
        tb.tconst,
        tb.originalTitle,
        tb.runtimeMinutes,
        tb.startYear,
        y.genre,           -- one row per entry in title.basics.genres
        tr.averageRating,  -- from title.ratings
        tn.region,         -- from title.akas
        tp.category        -- from title.principals; used as a feature by the modeling cell
    FROM read_csv_auto('{download_dir}/title.basics.tsv.gz', compression='gzip', sep='\t', nullstr='\\N') tb

    LEFT JOIN LATERAL (
        SELECT UNNEST(STRING_SPLIT(tb.genres, ',')) AS genre
    ) AS y ON TRUE

    -- title.crew was joined here before, but none of its columns were ever
    -- selected, so the join only added work and has been dropped.
    LEFT JOIN read_csv_auto('{download_dir}/title.principals.tsv.gz', compression='gzip', sep='\t', nullstr='\\N') tp
        ON tb.tconst = tp.tconst
    LEFT JOIN read_csv_auto('{download_dir}/title.ratings.tsv.gz', compression='gzip', sep='\t', nullstr='\\N') tr
        ON tb.tconst = tr.tconst
    LEFT JOIN read_csv_auto('{download_dir}/title.akas.tsv.gz', compression='gzip', sep='\t', nullstr='\\N') tn
        ON tb.tconst = tn.titleID
    WHERE tb.titleType = 'movie'
),

name_data AS (
    SELECT
        nb.primaryName,
        nb.primaryProfession,
        g.tconst2,
        h.profession AS primaryProfessionSplit
    FROM read_csv_auto('{download_dir}/name.basics.tsv.gz', compression='gzip', sep='\t', nullstr='\\N') nb,
    LATERAL (SELECT UNNEST(STRING_SPLIT(nb.knownForTitles, ',')) AS tconst2) AS g,
    -- alias renamed from "Primary", which is a SQL keyword
    LATERAL (SELECT UNNEST(STRING_SPLIT(nb.primaryProfession, ',')) AS profession) AS h
),

combined AS (
    SELECT
        td.tconst,
        td.originalTitle,
        td.runtimeMinutes,
        td.genre,
        td.averageRating,
        td.region,
        td.category,
        nd.primaryName,
        nd.primaryProfessionSplit,
        td.startYear
    FROM title_data td
    LEFT JOIN name_data nd
        ON td.tconst = nd.tconst2
)

SELECT
    tconst,
    originalTitle,
    runtimeMinutes,
    genre,
    averageRating,
    region,
    category,
    primaryName,
    primaryProfessionSplit
FROM combined
ORDER BY startYear

LIMIT 2000000
"""

# Run the query (the LIMIT is applied inside DuckDB) and fetch the result
# into a pandas DataFrame.
df = con.execute(query).fetchdf()

ParserException: Parser Error: syntax error at or near "combined"

In [None]:
# (rows, columns) of the query result; a bare last expression is shown as the cell output.
df.shape

In [None]:
# Column names of the query result, shown as the cell output.
df.columns.tolist()

In [None]:
# First rows of the query result; leaving the frame as the last expression lets
# the notebook render it as a table instead of plain text.
df.head()

In [None]:
# DataFrame.info() writes its report itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
df.info()


In [None]:
# Persist the query result on Google Drive as Parquet, without the pandas index.
parquet_target = "/content/drive/MyDrive/Capstone Project/imdb_ml_data2M.parquet"
df.to_parquet(path=parquet_target, index=False)
print("✅ Data saved as Parquet to Google Drive.")


In [None]:
import pandas as pd

# Reload the extract from the same path it was saved to. The path previously
# started with "//", which did not match the "/content/..." path used when writing.
df = pd.read_parquet("/content/drive/MyDrive/Capstone Project/imdb_ml_data2M.parquet")
df.head(100)

In [None]:
# All rows whose original title is exactly 'The Ring'.
filtered_df2 = df[df['originalTitle'] == 'The Ring']

# Remove the rating column before using these rows to test the model.
# drop() returns a new frame and leaves filtered_df2 untouched, so the result is
# stored under its own name instead of being displayed and then lost.
filtered_features = filtered_df2.drop(columns=['averageRating'])
filtered_features


In [None]:
# Step 1: Take a reproducible random sample. The size is capped at len(df)
# because DataFrame.sample raises ValueError when n exceeds the number of rows.
df_sample = df.sample(n=min(900000, len(df)), random_state=42)

# Step 2: Drop rows where target is missing
df_ml = df_sample[df_sample['averageRating'].notna()]

# Step 3: Define target and features.
# 'tconst' and 'originalTitle' identify a title rather than describe it, so they
# are no longer fed to the model: one-hot encoding them lets the forest memorise
# each title's rating. 'tconst' is kept only as the grouping key for the split.
target = 'averageRating'
group_col = 'tconst'
features = [
    'runtimeMinutes',
    'genre',
    'category',
    'primaryName'
]
df_ml = df_ml[features + [group_col, target]].dropna()

# Step 4: Train/test split by title.
# The same tconst appears on many rows of the extract, all carrying that title's
# rating. A row-level split would put one title on both sides and inflate the
# test score, so whole titles are assigned to either train or test.
X = df_ml[features]
y = df_ml[target]
splitter = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx, test_idx = next(splitter.split(X, y, groups=df_ml[group_col]))
X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

# Step 5: Preprocessing and model pipeline
categorical = ['genre', 'category', 'primaryName']
numerical = ['runtimeMinutes']

preprocessor = ColumnTransformer([
    ('num', SimpleImputer(strategy='mean'), numerical),
    # Categories that only occur in the test rows are encoded as all zeros.
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical)
])

model = Pipeline([
    ('preprocessor', preprocessor),
    # n_jobs=-1 builds the trees on all available cores.
    ('regressor', RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1))
])

# Step 6: Train the model
model.fit(X_train, y_train)

# Step 7: Predict and evaluate
y_pred = model.predict(X_test)
print(y_pred)
# mean_squared_error returns the MSE; take the square root so the printed value
# really is the RMSE its label promises.
print("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred)))
print("R^2 Score:", r2_score(y_test, y_pred))

In [None]:
# Look up the two fitted pipeline steps by the names they were registered under.
regressor_step = model.named_steps['regressor']
preprocessor_step = model.named_steps['preprocessor']

# Pair every transformed feature name with the importance the forest assigned to it.
feature_names = preprocessor_step.get_feature_names_out()
importances = regressor_step.feature_importances_
importance_df = pd.DataFrame({'feature': feature_names, 'importance': importances})

# Print the ten features with the highest importance.
ranked_importances = importance_df.sort_values(by='importance', ascending=False)
print(ranked_importances.head(10))