<a href="https://colab.research.google.com/github/ashleynguyen04/DS4002/blob/main/Project1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [20]:
#Creating dataset
#Download "Large Movie Review Dataset v1.0" from https://ai.stanford.edu/~amaas/data/sentiment/
#Download "title.basics.tsv.gz" from https://datasets.imdbws.com/
#Upload aclImd zip file and extracted title.basic.tsv to colab

import os
import re
import tarfile
import pandas as pd
from pathlib import Path
from google.colab import files

#Inputs
ACL_TAR = Path("/content/aclImdb_v1.tar.gz")        # your uploaded archive
TITLE_BASICS_PATH = Path("/content/title.basics.tsv")  # your uploaded TSV
ACL_ROOT_DIR = Path("/content/aclImdb")             # will exist after extract
OUTPUT_CSV = Path("/content/imdb_sentiment_with_genres.csv")

TT_RE = re.compile(r"(tt\d+)")

def extract_if_needed(tar_path: Path, target_dir: Path):
    if target_dir.exists() and (target_dir / "train").exists():
        print(f"Found existing {target_dir}, skipping extract.")
        return
    if not tar_path.exists():
        raise FileNotFoundError(f"Missing tar.gz at {tar_path}. Upload it to /content.")
    print(f"Extracting {tar_path} -> /content/ ...")
    with tarfile.open(tar_path, "r:gz") as tf:
        tf.extractall("/content/")
    print("Extraction complete.")

def find_acl_root(base: Path) -> Path:
    def is_acl_root(p: Path) -> bool:
        return (p / "train" / "pos").exists() and (p / "train" / "neg").exists() \
           and (p / "test" / "pos").exists() and (p / "test" / "neg").exists()
    if is_acl_root(base):
        return base
    for root, dirs, files in os.walk(base):
        p = Path(root)
        if is_acl_root(p):
            return p
    raise RuntimeError("Could not find aclImdb root with train/test pos/neg structure.")

def read_split_sentiment(split_dir: Path, sentiment: str) -> pd.DataFrame:
    reviews_dir = split_dir / sentiment
    urls_file = split_dir / f"urls_{sentiment}.txt"

    review_files = sorted([p for p in reviews_dir.glob("*.txt") if p.is_file()])
    if not review_files:
        print(f"WARNING: No review files in {reviews_dir}")
        return pd.DataFrame(columns=["review_path", "review", "sentiment", "tt_id"])

    if not urls_file.exists():
        raise FileNotFoundError(f"Missing URLs file: {urls_file}")

    with urls_file.open("r", encoding="utf-8") as f:
        urls = [line.strip() for line in f if line.strip()]

    # align counts
    n = min(len(urls), len(review_files))
    if len(urls) != len(review_files):
        print(f"NOTE: {split_dir.name}-{sentiment} count mismatch. "
              f"URLs={len(urls)} reviews={len(review_files)}. Truncating to {n}.")
    review_files = review_files[:n]
    urls = urls[:n]

    # extract tt IDs
    tt_ids = []
    for u in urls:
        m = TT_RE.search(u)
        tt_ids.append(m.group(1) if m else None)

    rows = []
    for fpath, tt in zip(review_files, tt_ids):
        with fpath.open("r", encoding="utf-8") as rf:
            text = rf.read().strip()
        rows.append({
            "review_path": str(fpath),
            "review": text,
            "sentiment": "positive" if sentiment == "pos" else "negative",
            "tt_id": tt
        })
    return pd.DataFrame(rows)

def load_acl_imdb_all(acl_root: Path) -> pd.DataFrame:
    dfs = []
    for split in ["train", "test"]:
        split_dir = acl_root / split
        if not split_dir.exists():
            print(f"WARNING: Missing split dir {split_dir}")
            continue
        for sentiment in ["pos", "neg"]:
            dfs.append(read_split_sentiment(split_dir, sentiment))
    return pd.concat(dfs, ignore_index=True) if dfs else pd.DataFrame(
        columns=["review_path", "review", "sentiment", "tt_id"]
    )

def load_title_basics(path: Path) -> pd.DataFrame:
    if not path.exists():
        raise FileNotFoundError(f"Missing title.basics.tsv at {path}. Upload it to /content.")
    df = pd.read_csv(path, sep="\t", dtype=str, na_values="\\N")
    df = df[["tconst", "primaryTitle", "genres"]]
    df["genres"] = df["genres"].fillna("")
    return df

# ---- Run pipeline ----
try:
    # 1) Extract tar.gz if needed
    extract_if_needed(ACL_TAR, ACL_ROOT_DIR)

    # 2) Locate ACL root
    acl_root = find_acl_root(ACL_ROOT_DIR)
    print(f"Using ACL root: {acl_root}")

    # 3) Load reviews
    print("Loading ACL IMDB reviews…")
    reviews_df = load_acl_imdb_all(acl_root)
    if reviews_df.empty:
        raise RuntimeError("No reviews loaded. Check dataset structure and urls_*.txt files.")

    # 4) Load IMDb titles
    print("Loading IMDb title.basics.tsv…")
    titles_df = load_title_basics(TITLE_BASICS_PATH)

    # 5) Merge on tt_id
    print("Merging on IMDb IDs…")
    reviews_df = reviews_df.dropna(subset=["tt_id"])
    need_ids = reviews_df["tt_id"].unique()
    titles_small = titles_df[titles_df["tconst"].isin(need_ids)].copy()
    merged = reviews_df.merge(
        titles_small,
        left_on="tt_id", right_on="tconst", how="left"
    )

    # 6) Build final columns
    merged["movie_name"] = merged["primaryTitle"].fillna("Unknown Title")
    merged["genres"] = merged["genres"].fillna("").astype(str)
    merged["genres_list"] = merged["genres"].apply(lambda s: [] if s.strip() == "" else s.split(","))
    long_df = merged.explode("genres_list", ignore_index=True)
    long_df = long_df.rename(columns={"genres_list": "genre"})
    long_df.loc[long_df["genre"] == "", "genre"] = None

    final = long_df[["movie_name", "genres", "genre", "review", "sentiment"]].copy()
    final["movie_name"] = final["movie_name"].str.strip()
    if final["genre"].notna().any():
        final["genre"] = final["genre"].str.strip()

    # 7) Save + download
    final.to_csv(OUTPUT_CSV, index=False, encoding="utf-8")
    print(f"Done. Wrote {len(final):,} rows to: {OUTPUT_CSV}")

    files.download(str(OUTPUT_CSV))

except Exception as e:
    print("\nERROR:", e)
    raise


Extracting /content/aclImdb_v1.tar.gz -> /content/ ...


  tf.extractall("/content/")


Extraction complete.
Using ACL root: /content/aclImdb
Loading ACL IMDB reviews…
Loading IMDb title.basics.tsv…
Merging on IMDb IDs…
Done. Wrote 114,139 rows to: /content/imdb_sentiment_with_genres.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [22]:
import pandas as pd

df = pd.read_csv("/content/imdb_sentiment_with_genres.csv")

print(df.shape)
print(df.columns)

df.head()

(114139, 5)
Index(['movie_name', 'genres', 'genre', 'review', 'sentiment'], dtype='object')


Unnamed: 0,movie_name,genres,genre,review,sentiment
0,Bromwell High,"Animation,Comedy",Animation,Bromwell High is a cartoon comedy. It ran at t...,positive
1,Bromwell High,"Animation,Comedy",Comedy,Bromwell High is a cartoon comedy. It ran at t...,positive
2,Bromwell High,"Animation,Comedy",Animation,Homelessness (or Houselessness as George Carli...,positive
3,Bromwell High,"Animation,Comedy",Comedy,Homelessness (or Houselessness as George Carli...,positive
4,Bromwell High,"Animation,Comedy",Animation,Brilliant over-acting by Lesley Ann Warren. Be...,positive
