<a href="https://colab.research.google.com/github/ashleynguyen04/DS4002/blob/main/Project1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [18]:
#Creating dataset
#Download "Large Movie Review Dataset v1.0" from https://ai.stanford.edu/~amaas/data/sentiment/
#Download "title.basics.tsv.gz" from https://datasets.imdbws.com/
#Upload aclImd zip file and extracted title.basic.tsv to colab

import os
import re
import tarfile
import zipfile
import pandas as pd
from pathlib import Path
from google.colab import files

# ------ CONFIG: set paths to your uploaded files in Colab ------
# Use ONE of these: a .tar.gz OR a .zip. Leave the other as None.
ACL_ARCHIVE_TGZ = Path("/aclImdb_v1.tar.gz")   # e.g., "/content/aclImdb_v1.tar.gz"
ACL_ARCHIVE_ZIP = None                                  # or Path("/content/aclImdb.zip") if you uploaded a zip
TITLE_BASICS_PATH = Path("/title.basics.tsv")   # IMDb title.basics.tsv
ACL_ROOT_DIR = Path("/content/aclImdb")                 # will exist after extraction
OUTPUT_CSV = Path("/content/imdb_sentiment_with_genres.csv")
# ---------------------------------------------------------------

TT_RE = re.compile(r"(tt\d+)")

def extract_archive_if_needed(target_dir: Path):
    """
    Extract either a tar.gz or zip archive into /content, creating aclImdb/ with train/test pos/neg.
    """
    if target_dir.exists() and (target_dir / "train").exists() and (target_dir / "test").exists():
        print(f"Found existing {target_dir}, skipping extraction.")
        return

    if ACL_ARCHIVE_TGZ and Path(ACL_ARCHIVE_TGZ).exists():
        print(f"Extracting {ACL_ARCHIVE_TGZ} -> /content/ ...")
        with tarfile.open(ACL_ARCHIVE_TGZ, "r:gz") as tf:
            tf.extractall("/content/")
        print("Extraction (tar.gz) complete.")
    elif ACL_ARCHIVE_ZIP and Path(ACL_ARCHIVE_ZIP).exists():
        print(f"Extracting {ACL_ARCHIVE_ZIP} -> /content/ ...")
        with zipfile.ZipFile(ACL_ARCHIVE_ZIP, "r") as zf:
            zf.extractall("/content/")
        print("Extraction (zip) complete.")
    else:
        raise FileNotFoundError(
            "No archive found. Upload aclImdb_v1.tar.gz (or aclImdb.zip) to /content "
            "and set ACL_ARCHIVE_TGZ or ACL_ARCHIVE_ZIP accordingly."
        )

def find_acl_root(base: Path) -> Path:
    """
    Return a directory that contains 'train/pos','train/neg','test/pos','test/neg'.
    Searches under 'base' if needed.
    """
    def is_acl_root(p: Path) -> bool:
        return (p / "train" / "pos").exists() and (p / "train" / "neg").exists() \
           and (p / "test" / "pos").exists() and (p / "test" / "neg").exists()

    if is_acl_root(base):
        return base

    for root, dirs, files in os.walk(base):
        p = Path(root)
        if is_acl_root(p):
            return p

    raise RuntimeError("Could not find aclImdb root with train/test pos/neg structure under: " + str(base))

def read_split_sentiment(split_dir: Path, sentiment: str) -> pd.DataFrame:
    """
    Read reviews from split ('train'/'test') and sentiment ('pos'/'neg'),
    aligned to urls_{sentiment}.txt to extract IMDb tt IDs.
    Returns columns: review_path, review, rating, tt_id
    """
    reviews_dir = split_dir / sentiment
    urls_file = split_dir / f"urls_{sentiment}.txt"

    review_files = sorted([p for p in reviews_dir.glob("*.txt") if p.is_file()])
    if not review_files:
        print(f"WARNING: No review files in {reviews_dir}")
        return pd.DataFrame(columns=["review_path", "review", "rating", "tt_id"])

    if not urls_file.exists():
        raise FileNotFoundError(f"Missing URLs file: {urls_file}")

    with urls_file.open("r", encoding="utf-8") as f:
        urls = [line.strip() for line in f if line.strip()]

    # Align counts safely
    n = min(len(urls), len(review_files))
    if len(urls) != len(review_files):
        print(f"NOTE: {split_dir.name}-{sentiment} count mismatch. "
              f"URLs={len(urls)} reviews={len(review_files)}. Truncating to {n}.")
    urls = urls[:n]
    review_files = review_files[:n]

    # Extract tt IDs
    tt_ids = []
    for u in urls:
        m = TT_RE.search(u)
        tt_ids.append(m.group(1) if m else None)

    rows = []
    for fpath, tt in zip(review_files, tt_ids):
        with fpath.open("r", encoding="utf-8") as rf:
            text = rf.read().strip()
        rows.append({
            "review_path": str(fpath),
            "review": text,
            "rating": "pos" if sentiment == "pos" else "neg",  # 'pos'/'neg' as requested
            "tt_id": tt
        })
    return pd.DataFrame(rows)

def load_acl_imdb_all(acl_root: Path) -> pd.DataFrame:
    dfs = []
    for split in ["train", "test"]:
        sd = acl_root / split
        if not sd.exists():
            print(f"WARNING: Missing split dir {sd}")
            continue
        for sentiment in ["pos", "neg"]:
            dfs.append(read_split_sentiment(sd, sentiment))
    if not dfs:
        return pd.DataFrame(columns=["review_path", "review", "rating", "tt_id"])
    return pd.concat(dfs, ignore_index=True)

def load_title_basics(path: Path) -> pd.DataFrame:
    if not path.exists():
        raise FileNotFoundError(f"Missing title.basics.tsv at {path}. Upload it to /content.")
    df = pd.read_csv(path, sep="\t", dtype=str, na_values="\\N")
    df = df[["tconst", "primaryTitle", "genres"]]
    df["genres"] = df["genres"].fillna("")
    return df

# --------------- RUN THE PIPELINE ---------------
try:
    # 1) Extract archive (tar.gz or zip)
    extract_archive_if_needed(ACL_ROOT_DIR)

    # 2) Locate the correct aclImdb root
    acl_root = find_acl_root(ACL_ROOT_DIR)
    print(f"Using ACL root: {acl_root}")

    # 3) Load reviews (train+test, pos+neg)
    print("Loading ACL IMDB reviews…")
    reviews_df = load_acl_imdb_all(acl_root)
    if reviews_df.empty:
        raise RuntimeError("No reviews loaded. Check dataset structure and urls_*.txt files.")

    # 4) Load IMDb titles
    print("Loading IMDb title.basics.tsv…")
    titles_df = load_title_basics(TITLE_BASICS_PATH)

    # 5) Merge on tt_id
    print("Merging on IMDb IDs…")
    reviews_df = reviews_df.dropna(subset=["tt_id"])
    need_ids = reviews_df["tt_id"].unique()
    titles_small = titles_df[titles_df["tconst"].isin(need_ids)].copy()

    merged = reviews_df.merge(
        titles_small,
        left_on="tt_id", right_on="tconst", how="left"
    )

    # 6) Build final columns
    merged["movie_name"] = merged["primaryTitle"].fillna("Unknown Title")
    merged["genres"] = merged["genres"].fillna("").astype(str)

    # Long 'genre' by exploding 'genres'
    merged["genres_list"] = merged["genres"].apply(lambda s: [] if s.strip() == "" else s.split(","))
    long_df = merged.explode("genres_list", ignore_index=True)
    long_df = long_df.rename(columns={"genres_list": "genre"})
    long_df.loc[long_df["genre"] == "", "genre"] = None

    # Select and order final columns (renaming tt_id -> imdb_identifier; keep rating)
    final = long_df.rename(columns={"tt_id": "imdb_identifier"})[
        ["imdb_identifier", "movie_name", "genres", "genre", "review", "rating"]
    ].copy()

    # 7) Cast ALL columns to pandas StringDtype (string)
    final = final.astype(pd.StringDtype())

    # Sanity check
    print(final.dtypes)

    # 8) Save and download
    final.to_csv(OUTPUT_CSV, index=False, encoding="utf-8")
    print(f"Done. Wrote {len(final):,} rows to: {OUTPUT_CSV}")

    # Trigger browser download to your laptop
    files.download(str(OUTPUT_CSV))

except Exception as e:
    print("\nERROR:", e)
    raise


Found existing /content/aclImdb, skipping extraction.
Using ACL root: /content/aclImdb
Loading ACL IMDB reviews…
NOTE: train-pos count mismatch. URLs=12500 reviews=7327. Truncating to 7327.
Loading IMDb title.basics.tsv…
Merging on IMDb IDs…
imdb_identifier    string[python]
movie_name         string[python]
genres             string[python]
genre              string[python]
review             string[python]
rating             string[python]
dtype: object
Done. Wrote 102,011 rows to: /content/imdb_sentiment_with_genres.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [19]:
import pandas as pd

df = pd.read_csv("/content/imdb_sentiment_with_genres.csv")

print(df.shape)
print(df.columns)
print(df.dtypes)

df.head(20)

(102011, 6)
Index(['imdb_identifier', 'movie_name', 'genres', 'genre', 'review', 'rating'], dtype='object')
imdb_identifier    object
movie_name         object
genres             object
genre              object
review             object
rating             object
dtype: object


Unnamed: 0,imdb_identifier,movie_name,genres,genre,review,rating
0,tt0453418,Bromwell High,"Animation,Comedy",Animation,Bromwell High is a cartoon comedy. It ran at t...,pos
1,tt0453418,Bromwell High,"Animation,Comedy",Comedy,Bromwell High is a cartoon comedy. It ran at t...,pos
2,tt0453418,Bromwell High,"Animation,Comedy",Animation,I liked the film. Some of the action scenes we...,pos
3,tt0453418,Bromwell High,"Animation,Comedy",Comedy,I liked the film. Some of the action scenes we...,pos
4,tt0453418,Bromwell High,"Animation,Comedy",Animation,Somewhat funny and well-paced action thriller ...,pos
5,tt0453418,Bromwell High,"Animation,Comedy",Comedy,Somewhat funny and well-paced action thriller ...,pos
6,tt0064354,Futz,Comedy,Comedy,Just two comments....SEVEN years apart? Hardly...,pos
7,tt0064354,Futz,Comedy,Comedy,"Another Aussie masterpiece, this delves into t...",pos
8,tt0100680,Stanley & Iris,"Drama,Romance",Drama,After a brief prologue showing a masked man st...,pos
9,tt0100680,Stanley & Iris,"Drama,Romance",Romance,After a brief prologue showing a masked man st...,pos
