In [1]:
import pandas as pd
from pathlib import Path

# Directory holding the raw TMDB 5000 CSV exports, relative to this notebook.
RAW = Path("../data/raw")

# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, which avoids mixed-type columns.
movies = pd.read_csv(RAW.joinpath("tmdb_5000_movies.csv"), low_memory=False)
credits = pd.read_csv(RAW.joinpath("tmdb_5000_credits.csv"), low_memory=False)

# Report the size of both tables, then preview the first movie rows.
print("Movies shape:", movies.shape)
print("Credits shape:", credits.shape)
movies.head(3)

Movies shape: (4803, 20)
Credits shape: (4803, 4)


Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2007-05-19,961000000,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500
2,245000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.sonypictures.com/movies/spectre/,206647,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...","[{""iso_3166_1"": ""GB"", ""name"": ""United Kingdom""...",2015-10-26,880674609,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466


In [2]:
def profile(df):
    """Summarise the completeness of every column in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df``, in column order, with the fields
        ``column`` (name), ``dtype`` (dtype rendered as a string),
        ``non_null`` and ``nulls`` (cell counts) and ``null_pct`` (nulls as a
        percentage of the row count, rounded to two decimals).
    """
    row_count = len(df)
    summary = pd.DataFrame({
        "column": df.columns,
        "dtype": list(map(str, df.dtypes)),
        "non_null": df.notna().sum().to_numpy(),
        "nulls": df.isna().sum().to_numpy(),
    })
    # Divide first, then scale to a percentage, then round for display.
    summary["null_pct"] = summary["nulls"].div(row_count).mul(100).round(2)
    return summary

# Build one column profile per table, in the order (movies, credits).
movies_info, credits_info = map(profile, (movies, credits))

# Show both profiles as the cell result.
movies_info, credits_info

(                  column    dtype  non_null  nulls  null_pct
 0                 budget    int64      4803      0      0.00
 1                 genres   object      4803      0      0.00
 2               homepage   object      1712   3091     64.36
 3                     id    int64      4803      0      0.00
 4               keywords   object      4803      0      0.00
 5      original_language   object      4803      0      0.00
 6         original_title   object      4803      0      0.00
 7               overview   object      4800      3      0.06
 8             popularity  float64      4803      0      0.00
 9   production_companies   object      4803      0      0.00
 10  production_countries   object      4803      0      0.00
 11          release_date   object      4802      1      0.02
 12               revenue    int64      4803      0      0.00
 13               runtime  float64      4801      2      0.04
 14      spoken_languages   object      4803      0      0.00
 15     

In [3]:
# Summary statistics for the numeric columns, one row per column.
# NOTE(review): the rendered table shows minimums of 0 for budget, revenue and
# runtime; these are presumably placeholders for missing values. Confirm
# before modelling on them.
movies.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
budget,4803.0,29045040.0,40722390.0,0.0,790000.0,15000000.0,40000000.0,380000000.0
id,4803.0,57165.48,88694.61,5.0,9014.5,14629.0,58610.5,459488.0
popularity,4803.0,21.4923,31.81665,0.0,4.66807,12.92159,28.3135,875.5813
revenue,4803.0,82260640.0,162857100.0,0.0,0.0,19170000.0,92917190.0,2787965000.0
runtime,4801.0,106.8759,22.61193,0.0,94.0,103.0,118.0,338.0
vote_average,4803.0,6.092172,1.194612,0.0,5.6,6.2,6.8,10.0
vote_count,4803.0,690.218,1234.586,0.0,54.0,235.0,737.0,13752.0


In [4]:
# Left-join credits onto movies by TMDB id. Where both tables share a column
# name, the movies column keeps its name and the credits copy is suffixed
# with "_credits". validate="one_to_one" raises if either key has duplicates.
df_merged = pd.merge(
    movies,
    credits,
    how="left",
    left_on="id",
    right_on="movie_id",
    suffixes=("", "_credits"),
    validate="one_to_one",
)

print("Merged shape:", df_merged.shape)

# Some dumps may only carry 'original_title'; prefer 'title' when it is there.
if "title" in df_merged.columns:
    title_col = "title"
else:
    title_col = "original_title"

# Keep just the preview columns that are present (robust to schema quirks),
# preserving the order in which they are listed here.
want = [
    col
    for col in (
        "id", title_col, "budget", "revenue",
        "popularity", "vote_average", "vote_count",
    )
    if col in df_merged.columns
]

df_merged.head(5)[want]

Merged shape: (4803, 24)


Unnamed: 0,id,title,budget,revenue,popularity,vote_average,vote_count
0,19995,Avatar,237000000,2787965087,150.437577,7.2,11800
1,285,Pirates of the Caribbean: At World's End,300000000,961000000,139.082615,6.9,4500
2,206647,Spectre,245000000,880674609,107.376788,6.3,4466
3,49026,The Dark Knight Rises,250000000,1084939099,112.31295,7.6,9106
4,49529,John Carter,260000000,284139100,43.926995,6.1,2124


In [5]:
# Report directory, relative to this notebook; created on first run.
OUT = Path("../reports")
OUT.mkdir(parents=True, exist_ok=True)

# Persist the per-column profiles without the positional index.
movies_info.to_csv(OUT.joinpath("movies_columns_profile.csv"), index=False)
credits_info.to_csv(OUT.joinpath("credits_columns_profile.csv"), index=False)

# Write a short markdown summary of the scan, overwriting any previous one.
with (OUT / "initial_scan.md").open("w", encoding="utf-8") as f:
    f.writelines([
        "# Initial Data Scan (TMDB 5000)\n\n",
        f"- Movies shape: {movies.shape}\n",
        f"- Credits shape: {credits.shape}\n",
        "- Nulls by column: see CSV profiles.\n",
        "- Basic stats exported via notebook.\n",
    ])