In [5]:
# --- 01_quality_checks.ipynb ---
import pandas as pd

tmdb = pd.read_csv("../data/raw/tmdb_movies_100.csv")
rt = pd.read_csv("../data/raw/rt_omdb_sample.csv")

print("✅ Loaded TMDb and Rotten Tomatoes data successfully!")




✅ Loaded TMDb and Rotten Tomatoes data successfully!


In [7]:
for name, df in [("TMDb", tmdb), ("RottenTomatoes", rt)]:
    print(f"\n===== {name} =====")
    print(df.info())
    print("\nMissing values per column:")
    print(df.isna().mean().round(2))




===== TMDb =====
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   id              100 non-null    int64  
 1   title           100 non-null    object 
 2   original_title  100 non-null    object 
 3   release_date    100 non-null    object 
 4   budget          100 non-null    int64  
 5   revenue         100 non-null    float64
 6   runtime         100 non-null    float64
 7   genres          98 non-null     object 
 8   popularity      100 non-null    float64
 9   release_year    100 non-null    int64  
 10  cast_top3       15 non-null     object 
 11  director        5 non-null      object 
dtypes: float64(3), int64(3), object(6)
memory usage: 9.5+ KB
None

Missing values per column:
id                0.00
title             0.00
original_title    0.00
release_date      0.00
budget            0.00
revenue           0.00
runtime        

In [9]:
tmdb.head()
rt.head()



Unnamed: 0,title,year,imdb_rating,rt_score,metascore
0,Toy Story,1995,8.3,100%,96.0
1,Jumanji,1995,7.1,53%,39.0
2,Grumpier Old Men,1995,6.7,20%,46.0
3,Waiting to Exhale,1995,6.0,56%,
4,Father of the Bride Part II,1995,6.1,54%,49.0


In [10]:
# --- Quality checks summary (2 datasets only) ---

print("TMDb budget stats:\n", tmdb["budget"].describe())
print("RT score distribution:\n", rt["rt_score"].value_counts().head())

for name, df in [("TMDb", tmdb), ("RT", rt)]:
    print(f"{name} duplicates: {df.duplicated().sum()}")

summary_text = """
# Data Quality Summary

## TMDb
- {tmdb_rows} rows, {tmdb_cols} columns
- Missing values: {tmdb_na:.2%}
- Common issues: Some movies missing directors or cast.

## Rotten Tomatoes (OMDb)
- {rt_rows} rows, {rt_cols} columns
- Missing values: {rt_na:.2%}
- Notes: Many missing RT scores for older films.
""".format(
    tmdb_rows=tmdb.shape[0],
    tmdb_cols=tmdb.shape[1],
    tmdb_na=tmdb.isna().mean().mean(),
    rt_rows=rt.shape[0],
    rt_cols=rt.shape[1],
    rt_na=rt.isna().mean().mean(),
)

with open("../reports/quality_summary.md", "w") as f:
    f.write(summary_text)

print("✅ Saved summary to reports/quality_summary.md")


TMDb budget stats:
 count    1.000000e+02
mean     1.494419e+07
std      2.106925e+07
min      0.000000e+00
25%      0.000000e+00
50%      3.250000e+06
75%      2.000000e+07
max      9.800000e+07
Name: budget, dtype: float64
RT score distribution:
 rt_score
90%    2
97%    2
36%    2
56%    2
54%    2
Name: count, dtype: int64
TMDb duplicates: 0
RT duplicates: 0
✅ Saved summary to reports/quality_summary.md


In [3]:

import pandas as pd

links = pd.read_csv("../data/intermediate/link_candidates.csv")
print("✅ Loaded link candidates")
print(links.head())
print(f"Total candidate matches: {len(links)}")


avg_similarity = links["similarity"].mean().round(3)
high_conf = links[links["similarity"] >= 0.9]

print("\n📊 Integration Metrics:")
print(f"- Total candidate matches: {len(links)}")
print(f"- High-confidence matches (≥0.9): {len(high_conf)}")
print(f"- Average similarity: {avg_similarity}")



✅ Loaded link candidates
   tmdb_id                   tmdb_title                     rt_title  rt_year  \
0      862                    Toy Story                    Toy Story     1995   
1     8844                      Jumanji                      Jumanji     1995   
2    15602             Grumpier Old Men             Grumpier Old Men     1995   
3    31357            Waiting to Exhale            Waiting to Exhale     1995   
4    11862  Father of the Bride Part II  Father of the Bride Part II     1995   

   similarity  
0         1.0  
1         1.0  
2         1.0  
3         1.0  
4         1.0  
Total candidate matches: 46

📊 Integration Metrics:
- Total candidate matches: 46
- High-confidence matches (≥0.9): 46
- Average similarity: 1.0


In [4]:
tmdb = pd.read_csv("../data/raw/tmdb_movies_100.csv")
rt = pd.read_csv("../data/raw/rt_omdb_sample.csv")

merged = links.merge(tmdb, left_on="tmdb_title", right_on="title", how="left") \
              .merge(rt, left_on="rt_title", right_on="title", how="left", suffixes=("_tmdb", "_rt"))

rating_coverage = merged["rt_score"].notna().mean().round(2) * 100
print(f"Rating coverage (TMDb + RT): {rating_coverage}%")


Rating coverage (TMDb + RT): 93.0%


In [6]:
from pathlib import Path

Path("../reports").mkdir(exist_ok=True)

report = f"""# Integration Metrics (Week 7–9)

| Metric | Value |
|---------|--------|
| Candidate matches found | {len(links)} |
| High-confidence matches (≥ 0.9) | {len(high_conf)} |
| Avg. title similarity | {avg_similarity} |
| Rating coverage (TMDb + RT) | {rating_coverage}% |
| Notes | All 46 pairs are perfect title matches (1.0 similarity). Minor manual check confirms correctness. |
"""

with open("../reports/integration_metrics.md", "w") as f:
    f.write(report)

print("✅ Saved integration metrics to ../reports/integration_metrics.md")


✅ Saved integration metrics to ../reports/integration_metrics.md
