<a href="https://colab.research.google.com/github/ashishsingh0069/CODSOFT/blob/main/MOVIE_GENRE_CLASSIFICATION.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import kagglehub

# Fetch the latest published version of the IMDb genre dataset through
# kagglehub and keep the location it reports for the following cells.
dataset_handle = "hijest/genre-classification-dataset-imdb"
path = kagglehub.dataset_download(dataset_handle)

print(f"Path to dataset files: {path}")

Downloading from https://www.kaggle.com/api/v1/datasets/download/hijest/genre-classification-dataset-imdb?dataset_version_number=1...


100%|██████████| 41.7M/41.7M [00:02<00:00, 15.7MB/s]

Extracting files...





Path to dataset files: /root/.cache/kagglehub/datasets/hijest/genre-classification-dataset-imdb/versions/1


In [6]:

import os
import shutil
import zipfile
from glob import glob

import pandas as pd

print("PATH:", path)

# Stage the dataset under /content/dataset (colab default) so the rest of the
# notebook reads from one fixed location regardless of what kagglehub returned.
extract_dir = "/content/dataset"
os.makedirs(extract_dir, exist_ok=True)

if os.path.isfile(path) and path.lower().endswith(".zip"):
    # `path` is an archive: unpack it straight into the staging directory.
    print("Extracting zip...")
    with zipfile.ZipFile(path, "r") as z:
        z.extractall(extract_dir)
else:
    # `path` is a directory: copy its contents into the staging directory.
    print("Processing directory from kagglehub...")

    # Work on a local name so that `path` (defined by the download cell) is
    # never overwritten; this keeps the cell idempotent when it is re-run.
    source_dir = path
    contained_items = os.listdir(source_dir)

    # If the directory holds exactly one sub-directory, the data lives inside it.
    if len(contained_items) == 1 and os.path.isdir(os.path.join(source_dir, contained_items[0])):
        print(f"Detected single subdirectory '{contained_items[0]}'. Using it as the new data path.")
        source_dir = os.path.join(source_dir, contained_items[0])

    for item_name in os.listdir(source_dir):
        src_path = os.path.join(source_dir, item_name)
        dest_path = os.path.join(extract_dir, item_name)

        try:
            if os.path.isfile(src_path):
                shutil.copy(src_path, dest_path)
            elif os.path.isdir(src_path):
                shutil.copytree(src_path, dest_path, dirs_exist_ok=True)
            else:
                # Neither a regular file nor a directory: nothing was copied,
                # so do not report it as copied.
                print(f"Skipped {src_path}: not a regular file or directory")
                continue
            print(f"Copied {src_path} to {dest_path}")
        except OSError as e:
            # Best effort: report the failing entry and continue with the rest.
            # shutil's own errors derive from OSError.
            print(f"Error copying {src_path}: {e}")

print("Files in extracted folder:")
for f in sorted(glob(os.path.join(extract_dir, "*"))):
    print(" ", f)

# Locate candidate data files. Preference order: .csv before .txt, and files
# directly in extract_dir before files in nested folders. Results are sorted
# because glob returns matches in arbitrary order, which would otherwise make
# the "first available" fallback below non-deterministic.
data_files = []
for pattern in ("*.csv", "*.txt"):
    data_files = sorted(glob(os.path.join(extract_dir, pattern)))
    if not data_files:
        data_files = sorted(glob(os.path.join(extract_dir, "**", pattern), recursive=True))
    if data_files:
        break

if not data_files:
    raise FileNotFoundError("No data file (.csv or .txt) found in extracted dataset directory.")

print("Found data files:", data_files)

# Prioritize train_data.txt if it exists; otherwise take the first candidate.
# data_files is guaranteed non-empty here, so a file is always selected.
datafile = next(
    (candidate for candidate in data_files if os.path.basename(candidate) == "train_data.txt"),
    data_files[0],
)

print("Loading:", datafile)

# Choose the parser from the file's own content instead of waiting for the
# default comma parser to raise: a ':::'-delimited file whose rows happen to
# tokenize under the comma parser would otherwise load silently as a
# mis-split frame with the first record used as the header.
try:
    with open(datafile, "r", encoding="utf-8", errors="replace") as fh:
        first_line = fh.readline()

    if ":::" in first_line:
        # ':::'-delimited text with no header row. A separator longer than one
        # character needs the python parsing engine.
        file_format = "':::'-delimited text"
        df = pd.read_csv(datafile, sep=":::", engine="python", header=None)
    else:
        file_format = "default CSV"
        df = pd.read_csv(datafile)
except (OSError, ValueError) as e:
    # pandas parser errors and decoding errors are ValueError subclasses.
    # Keep raising ValueError as before, but chain the original cause.
    raise ValueError(f"Could not read data file '{datafile}'. Error: {e}") from e

print("Detected format:", file_format)
print("Shape:", df.shape)
df.head(5)

PATH: /root/.cache/kagglehub/datasets/hijest/genre-classification-dataset-imdb/versions/1/Genre Classification Dataset
Processing directory from kagglehub...
Copied /root/.cache/kagglehub/datasets/hijest/genre-classification-dataset-imdb/versions/1/Genre Classification Dataset/test_data_solution.txt to /content/dataset/test_data_solution.txt
Copied /root/.cache/kagglehub/datasets/hijest/genre-classification-dataset-imdb/versions/1/Genre Classification Dataset/test_data.txt to /content/dataset/test_data.txt
Copied /root/.cache/kagglehub/datasets/hijest/genre-classification-dataset-imdb/versions/1/Genre Classification Dataset/train_data.txt to /content/dataset/train_data.txt
Copied /root/.cache/kagglehub/datasets/hijest/genre-classification-dataset-imdb/versions/1/Genre Classification Dataset/description.txt to /content/dataset/description.txt
Files in extracted folder:
  /content/dataset/description.txt
  /content/dataset/test_data.txt
  /content/dataset/test_data_solution.txt
  /conten

Unnamed: 0,0,1,2,3
0,1,Oscar et la dame rose (2009),drama,Listening in to a conversation between his do...
1,2,Cupid (1997),thriller,A brother and sister with a past incestuous r...
2,3,"Young, Wild and Wonderful (1980)",adult,As the bus empties the students for their fie...
3,4,The Secret Sin (1915),drama,To help their unemployed father make ends mee...
4,5,The Unrecovered (2007),drama,The film's title refers not only to the un-re...


In [8]:
print("Columns:", list(df.columns))

# Integer labels of the columns to keep: in the frame displayed by the loading
# cell, column 2 holds the genre and column 3 the plot description.
text_col = 3
genre_col = 2

print("Using text column:", text_col)
print("Using genre column:", genre_col)

if all(col in df.columns for col in (text_col, genre_col)):
    # First run on the freshly loaded frame: keep the two columns, drop rows
    # with missing values and give the columns descriptive names.
    df = (
        df[[text_col, genre_col]]
        .dropna()
        .rename(columns={text_col: "text", genre_col: "genre"})
    )
elif not all(col in df.columns for col in ("text", "genre")):
    # Neither the raw integer columns nor the renamed ones are present, so
    # the file was most likely parsed with the wrong layout. Fail clearly.
    raise KeyError(
        f"Expected columns {[text_col, genre_col]} in the loaded frame, "
        f"found {list(df.columns)}."
    )
# Otherwise `df` already has the 'text'/'genre' columns from an earlier run of
# this cell and is left untouched, so re-running the cell is harmless.

print("After dropping NA, shape:", df.shape)
df.head(5)

Columns: [0, 1, 2, 3]
Using text column: 3
Using genre column: 2
After dropping NA, shape: (54214, 2)


Unnamed: 0,text,genre
0,Listening in to a conversation between his do...,drama
1,A brother and sister with a past incestuous r...,thriller
2,As the bus empties the students for their fie...,adult
3,To help their unemployed father make ends mee...,drama
4,The film's title refers not only to the un-re...,drama


In [9]:
# --- Cell 3: handle single-label vs multilabel ---
from sklearn.preprocessing import MultiLabelBinarizer
import numpy as np

# Inspect a few genre entries. The draw is seeded so the displayed sample is
# the same on every run, and capped at the frame length so a small frame does
# not raise.
print("Examples of genre values:")
print(df["genre"].sample(min(8, len(df)), random_state=42).values)

# If genres contain a delimiter like '|', ',' or ';', treat as multilabel
def detect_delimiter(s, delimiters=("|", ",", ";", "/")):
    """Return the first candidate delimiter that occurs in ``s``.

    Parameters
    ----------
    s : str
        Genre string to inspect.
    delimiters : iterable of str, optional
        Candidate delimiters, checked in order; the first one found wins.
        Defaults to ``"|"``, ``","``, ``";"``, ``"/"``.

    Returns
    -------
    str or None
        The matching delimiter, or ``None`` when no candidate occurs in ``s``
        or when ``s`` is not a string (e.g. ``None``).
    """
    if not isinstance(s, str):
        # Nothing to search in; report "no delimiter" instead of raising.
        return None
    return next((d for d in delimiters if d in s), None)

# Decide between single-label and multilabel by scanning every distinct
# non-empty genre value, not just the first one: a multilabel column whose
# first row happens to carry a single genre must still be detected.
sample_val = None   # first non-empty genre value, kept for inspection
delimiter = None
for v in df["genre"].astype(str).unique():
    if not v.strip():
        continue
    if sample_val is None:
        sample_val = v
    delimiter = detect_delimiter(v)
    if delimiter is not None:
        break

is_multilabel = delimiter is not None

print("Detected delimiter:", delimiter)
print("Is multilabel:", is_multilabel)

if not is_multilabel:
    # Single-label case: each row carries one genre string. Trim surrounding
    # whitespace and map the genres to integer codes.
    df["genre_clean"] = df["genre"].astype(str).str.strip()
    from sklearn.preprocessing import LabelEncoder
    le = LabelEncoder()
    y = le.fit_transform(df["genre_clean"])
    Y = y  # single-label target
    class_names = le.classes_
    print("Number of classes (single-label):", len(class_names))
else:
    # Multilabel case: split each entry on the detected delimiter, trim the
    # pieces, discard empty ones, then binarize into one column per genre.
    df["genre_list"] = df["genre"].astype(str).apply(
        lambda s: [piece for piece in map(str.strip, s.split(delimiter)) if piece]
    )
    mlb = MultiLabelBinarizer(sparse_output=False)
    Y = mlb.fit_transform(df["genre_list"])
    class_names = mlb.classes_
    print("Number of classes (multilabel):", len(class_names))

print("Class names (first 20):", list(class_names)[:20])


Examples of genre values:
[' thriller ' ' thriller ' ' drama ' ' drama ' ' comedy ' ' drama '
 ' documentary ' ' comedy ']
Detected delimiter: None
Is multilabel: False
Number of classes (single-label): 27
Class names (first 20): ['action', 'adult', 'adventure', 'animation', 'biography', 'comedy', 'crime', 'documentary', 'drama', 'family', 'fantasy', 'game-show', 'history', 'horror', 'music', 'musical', 'mystery', 'news', 'reality-tv', 'romance']


In [10]:
# --- Cell 4: split + vectorize ---
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

X_text = df["text"].astype(str).values

# Hold out 20% for testing. Stratify on the labels only in the single-label
# case; the multilabel target is split without stratification.
stratify_labels = None if is_multilabel else Y
X_train, X_test, y_train, y_test = train_test_split(
    X_text,
    Y,
    test_size=0.2,
    random_state=42,
    stratify=stratify_labels,
)

print("Train / test sizes:", len(X_train), len(X_test))

# TF-IDF over unigrams and bigrams (ngram_range and max_features are the
# obvious knobs to tune). The vocabulary is fitted on the training split only
# and then applied to the test split.
tfidf_params = {
    "max_features": 40000,
    "ngram_range": (1, 2),
    "stop_words": "english",
}
tfidf = TfidfVectorizer(**tfidf_params)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

print("TF-IDF shapes:", X_train_tfidf.shape, X_test_tfidf.shape)


Train / test sizes: 43371 10843
TF-IDF shapes: (43371, 40000) (10843, 40000)


In [11]:
# --- Cell 5: train & evaluate ---
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import classification_report, f1_score, accuracy_score, hamming_loss

# Base estimators compared in this notebook. LinearSVC is seeded so that any
# random shuffling done by its solver is repeatable between runs.
base_estimators = {
    "MultinomialNB": MultinomialNB(),
    "LogisticRegression": LogisticRegression(max_iter=1000),
    "LinearSVC": LinearSVC(max_iter=5000, random_state=42),
}

if is_multilabel:
    # Multilabel targets: wrap each estimator in one-vs-rest so that one
    # binary classifier is trained per genre column.
    models = {
        f"{est_name}_OVR": OneVsRestClassifier(estimator, n_jobs=-1)
        for est_name, estimator in base_estimators.items()
    }
else:
    models = dict(base_estimators)

results = {}

# Pass the label indices explicitly to the report so it always has one row per
# entry of class_names, even when a class is absent from both the test split
# and the predictions (otherwise target_names would not match the labels).
report_labels = list(range(len(class_names)))

for name, model in models.items():
    print("Training", name)
    model.fit(X_train_tfidf, y_train)
    y_pred = model.predict(X_test_tfidf)

    # zero_division=0 matches the report below: labels with no true or
    # predicted samples score 0 without emitting a warning.
    f1_micro = f1_score(y_test, y_pred, average="micro", zero_division=0)
    f1_macro = f1_score(y_test, y_pred, average="macro", zero_division=0)

    if is_multilabel:
        # multilabel metrics: micro/macro f1, hamming loss
        ham = hamming_loss(y_test, y_pred)
        print(f"{name} -- F1 micro: {f1_micro:.4f}, F1 macro: {f1_macro:.4f}, Hamming loss: {ham:.4f}")
        print("Classification report (per class):")
        results[name] = {"f1_micro": f1_micro, "f1_macro": f1_macro, "hamming_loss": ham}
    else:
        acc = accuracy_score(y_test, y_pred)
        print(f"{name} -- Accuracy: {acc:.4f}, F1 micro: {f1_micro:.4f}, F1 macro: {f1_macro:.4f}")
        print("Classification report:")
        results[name] = {"accuracy": acc, "f1_micro": f1_micro, "f1_macro": f1_macro}

    print(
        classification_report(
            y_test,
            y_pred,
            labels=report_labels,
            target_names=class_names,
            zero_division=0,
        )
    )

print("Done training all models.")


Training MultinomialNB
MultinomialNB -- Accuracy: 0.4749, F1 micro: 0.4749, F1 macro: 0.0705
Classification report:
              precision    recall  f1-score   support

      action       0.00      0.00      0.00       263
       adult       1.00      0.01      0.02       118
   adventure       1.00      0.01      0.01       155
   animation       0.00      0.00      0.00       100
   biography       0.00      0.00      0.00        53
      comedy       0.59      0.23      0.33      1490
       crime       0.00      0.00      0.00       101
 documentary       0.53      0.92      0.68      2619
       drama       0.41      0.86      0.56      2723
      family       0.00      0.00      0.00       157
     fantasy       0.00      0.00      0.00        65
   game-show       0.00      0.00      0.00        39
     history       0.00      0.00      0.00        49
      horror       1.00      0.04      0.07       441
       music       1.00      0.01      0.03       146
     musical       

In [12]:
# --- Cell 6: example predictions and saving ---
import numpy as np
from joblib import dump, load

# choose a model to save (pick LogisticRegression if present, else LinearSVC)
preferred_models = ("LogisticRegression_OVR", "LogisticRegression", "LinearSVC_OVR", "LinearSVC")
pick = next((candidate for candidate in preferred_models if candidate in models), None)
if pick is None:
    # Fail with an explicit message instead of printing "Saving model: None"
    # and then hitting a bare KeyError on models[None].
    raise KeyError(
        f"None of the preferred models {preferred_models} were trained; "
        f"available models: {sorted(models)}"
    )

print("Saving model:", pick)
model_to_save = models[pick]

# Save the TF-IDF vectorizer, the model, the class names and the multilabel
# flag together so predictions can be reproduced from one file.
model_path = "/content/genre_model.joblib"
dump((tfidf, model_to_save, class_names, is_multilabel), model_path)

print("Model saved to", model_path)

# Function to predict
def predict_genre(texts, topk=3):
    """Predict genre labels for one or more plot descriptions.

    Uses the notebook-level ``tfidf`` vectorizer, ``model_to_save``,
    ``class_names`` and ``is_multilabel`` flag.

    Parameters
    ----------
    texts : str or iterable of str
        A single description or a collection of descriptions.
    topk : int, optional
        Accepted for backward compatibility but currently unused: the function
        returns the model's hard predictions and does not rank genres.

    Returns
    -------
    list
        One entry per input text, in input order. In the single-label case
        each entry is a genre name; in the multilabel case each entry is the
        list of genre names predicted for that text. Genre names are plain
        ``str`` objects. An empty input yields an empty list.
    """
    texts = [texts] if isinstance(texts, str) else list(texts)
    if not texts:
        # Nothing to predict; avoid handing an empty batch to the estimator.
        return []

    Xv = tfidf.transform(texts)
    preds = model_to_save.predict(Xv)

    if is_multilabel:
        # preds is a binary indicator matrix: one row per text, one column per genre.
        return [
            [str(class_names[i]) for i, val in enumerate(row) if val == 1]
            for row in preds
        ]
    # preds holds one encoded class index per text. str() turns the NumPy
    # string scalar into a plain Python string so the printed result looks the
    # same regardless of the installed NumPy version.
    return [str(class_names[p]) for p in preds]

# Quick smoke test: run the helper on two hand-written plot summaries.
samples = [
    "A young wizard attends a school of magic and battles dark forces.",
    "A detective investigates a series of murders while dealing with his own demons.",
]
print("Predictions:")
sample_predictions = predict_genre(samples)
print(sample_predictions)


Saving model: LogisticRegression
Model saved to /content/genre_model.joblib
Predictions:
['drama', 'thriller']
