In [3]:
# Load the training data.
# Each record is one line of the form "ID ::: TITLE ::: GENRE ::: PLOT".
train_texts = []
train_labels = []
skipped_train_lines = 0

with open("train_data.txt", "r", encoding="utf-8") as file:
    for line in file:
        record = line.strip()
        if not record:
            # Blank lines carry no record; ignore them without counting.
            continue
        # Stop after three separators so a plot that itself contains " ::: "
        # stays in one piece instead of failing the field-count check and
        # being dropped.
        parts = record.split(" ::: ", 3)
        if len(parts) != 4:
            skipped_train_lines += 1
            continue
        # Only the genre (label) and the plot (text) are used downstream.
        genre, plot = parts[2], parts[3]
        train_labels.append(genre)
        train_texts.append(plot)

print(f"✅ Loaded {len(train_texts)} training plots.")
if skipped_train_lines:
    # Report dropped records instead of discarding them silently.
    print(f"⚠️ Skipped {skipped_train_lines} malformed training lines.")
if train_texts:
    # Guarded so an empty or fully malformed file does not raise IndexError.
    print("Example:\n", train_texts[0], "\n=> Genre:", train_labels[0])


✅ Loaded 54214 training plots.
Example:
 Listening in to a conversation between his doctor and parents, 10-year-old Oscar learns what nobody has the courage to tell him. He only has a few weeks to live. Furious, he refuses to speak to anyone except straight-talking Rose, the lady in pink he meets on the hospital stairs. As Christmas approaches, Rose uses her fantastical experiences as a professional wrestler, her imagination, wit and charm to allow Oscar to live life and love to the full, in the company of his friends Pop Corn, Einstein, Bacon and childhood sweetheart Peggy Blue. 
=> Genre: drama


In [5]:
# Load the test data.
# Each record is one line of the form "ID ::: TITLE ::: PLOT".
test_texts = []
skipped_test_lines = 0

with open("test_data.txt", "r", encoding="utf-8") as file:
    for line in file:
        record = line.strip()
        if not record:
            # Blank lines carry no record; ignore them without counting.
            continue
        # Stop after two separators so a plot that itself contains " ::: "
        # stays in one piece instead of failing the field-count check and
        # being dropped.
        parts = record.split(" ::: ", 2)
        if len(parts) != 3:
            skipped_test_lines += 1
            continue
        # Only the plot text is used downstream.
        test_texts.append(parts[2])

print(f"✅ Loaded {len(test_texts)} test plots.")
if skipped_test_lines:
    # A dropped record shifts every later plot by one position, so later
    # predictions would no longer line up row-for-row with this file.
    print(f"⚠️ Skipped {skipped_test_lines} malformed test lines.")
if test_texts:
    # Guarded so an empty or fully malformed file does not raise IndexError.
    print("Example test plot:\n", test_texts[0])


✅ Loaded 54200 test plots.
Example test plot:
 L.R. Brane loves his life - his car, his apartment, his job, but especially his girlfriend, Vespa. One day while showering, Vespa runs out of shampoo. L.R. runs across the street to a convenience store to buy some more, a quick trip of no more than a few minutes. When he returns, Vespa is gone and every trace of her existence has been wiped out. L.R.'s life becomes a tortured existence as one strange event after another occurs to confirm in his mind that a conspiracy is working against his finding Vespa.


In [6]:
# Turn the plot texts into TF-IDF feature vectors
from sklearn.feature_extraction.text import TfidfVectorizer

# English stop words are removed and at most 5000 features are kept
vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')

# The vectorizer is fitted on the training plots only; the test plots are
# transformed with that already-fitted vectorizer
X_train = vectorizer.fit_transform(train_texts)
X_test = vectorizer.transform(test_texts)

# Report the (rows, features) shape of both matrices
print("✅ TF-IDF Vectorization done.")
for caption, matrix in (("Training data shape:", X_train), ("Test data shape:", X_test)):
    print(caption, matrix.shape)


✅ TF-IDF Vectorization done.
Training data shape: (54214, 5000)
Test data shape: (54200, 5000)


In [7]:
# Encode the genre names as integer class ids
from sklearn.preprocessing import LabelEncoder

# Fit the encoder on the training genres and encode them in one step
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(train_labels)

# List the genre names the encoder learned
genre_classes = list(label_encoder.classes_)
print("✅ Genres converted to numbers.")
print("Genre classes:", genre_classes)


✅ Genres converted to numbers.
Genre classes: ['action', 'adult', 'adventure', 'animation', 'biography', 'comedy', 'crime', 'documentary', 'drama', 'family', 'fantasy', 'game-show', 'history', 'horror', 'music', 'musical', 'mystery', 'news', 'reality-tv', 'romance', 'sci-fi', 'short', 'sport', 'talk-show', 'thriller', 'war', 'western']


In [9]:
# Fit the genre classifier on the TF-IDF features
from sklearn.linear_model import LogisticRegression

# max_iter=1000 raises the cap on solver iterations; it is an upper bound,
# not a guarantee that the solver converges.
# fit() returns the fitted estimator, so construction and training chain.
model = LogisticRegression(max_iter=1000).fit(X_train, y_train)

print("✅ Model training complete.")


✅ Model training complete.


In [10]:

# Run the trained classifier on the test features
y_pred = model.predict(X_test)

# Map the integer class ids back to their genre names
predicted_genres = label_encoder.inverse_transform(y_pred)

# Report completion and preview the first five predictions
first_five = predicted_genres[:5]
print("✅ Prediction complete.")
print("Example predictions:", first_five)


✅ Prediction complete.
Example predictions: ['short' 'drama' 'documentary' 'drama' 'drama']


In [11]:
# Write the predicted genres to predictions.txt, one genre per line
with open("predictions.txt", "w", encoding="utf-8") as out_file:
    out_file.writelines(genre + "\n" for genre in predicted_genres)

print("✅ Predictions saved to predictions.txt")


✅ Predictions saved to predictions.txt


In [12]:
# Read test_data_solution.txt line by line.
# Every line is kept, whitespace-stripped, as one entry; nothing is parsed
# out of it here.
true_labels = []

with open("test_data_solution.txt", "r", encoding="utf-8") as solution_file:
    true_labels.extend(raw_line.strip() for raw_line in solution_file)

print("✅ Loaded", len(true_labels), "true test labels.")


✅ Loaded 54200 true test labels.


In [3]:
# Read the saved predictions back from predictions.txt, ignoring blank lines
with open("predictions.txt", "r", encoding="utf-8") as file:
    stripped_lines = (raw_line.strip() for raw_line in file)
    predicted_genres = [entry for entry in stripped_lines if entry]

print("✅ Loaded", len(predicted_genres), "predicted genres.")
print("Example predictions:", predicted_genres[:5])


✅ Loaded 54200 predicted genres.
Example predictions: ['short', 'drama', 'documentary', 'drama', 'drama']


In [6]:
# Re-read test_data_solution.txt: strip each line and drop the empty ones.
# Each entry is still a whole raw line from the file.
with open("test_data_solution.txt", "r", encoding="utf-8") as file:
    true_labels = list(filter(None, map(str.strip, file)))

print("✅ Loaded", len(true_labels), "true labels.")
print("Example true labels:", true_labels[:5])


✅ Loaded 54200 true labels.
Example true labels: ["1 ::: Edgar's Lunch (1998) ::: thriller ::: L.R. Brane loves his life - his car, his apartment, his job, but especially his girlfriend, Vespa. One day while showering, Vespa runs out of shampoo. L.R. runs across the street to a convenience store to buy some more, a quick trip of no more than a few minutes. When he returns, Vespa is gone and every trace of her existence has been wiped out. L.R.'s life becomes a tortured existence as one strange event after another occurs to confirm in his mind that a conspiracy is working against his finding Vespa.", '2 ::: La guerra de papá (1977) ::: comedy ::: Spain, March 1964: Quico is a very naughty child of three belonging to a wealthy middle-class family. Since Cristina\'s birth, he feels he has lost the privileged position of "prince" of the house for his eight months old sister. So, with his brother Juan, who is eight years old and is quite disobedient, spend their time committing prank after 

In [8]:
# Extract the genre from the raw true label lines.
# The genre is the third ":::"-separated field of each line.
clean_true_labels = []
for line_number, line in enumerate(true_labels, start=1):
    # Only the first three fields are needed, so stop splitting after them.
    parts = line.split(":::", 3)
    if len(parts) < 3:
        # Skipping the line would shift every later label by one position and
        # silently misalign the labels with the predictions they are scored
        # against, so fail loudly instead.
        raise ValueError(
            f"Entry {line_number} of true_labels has no genre field: {line[:80]!r}"
        )
    clean_true_labels.append(parts[2].strip())

print("✅ Extracted", len(clean_true_labels), "clean genres.")
print("Example genres:", clean_true_labels[:5])


✅ Extracted 54200 clean genres.
Example genres: ['thriller', 'comedy', 'documentary', 'drama', 'drama']


In [9]:
#Accuracy Check
from sklearn.metrics import accuracy_score

# Labels and predictions are compared position by position. Slicing both to
# the same prefix length would hide a size mismatch between them, so check
# the full sequences first.
if len(clean_true_labels) != len(predicted_genres):
    raise ValueError(
        f"Label/prediction count mismatch: {len(clean_true_labels)} true labels "
        f"vs {len(predicted_genres)} predictions"
    )

# Sample accuracy on first 5000
sample_true = clean_true_labels[:5000]
sample_pred = predicted_genres[:5000]

accuracy = accuracy_score(sample_true, sample_pred)
print(f"✅ Sample Accuracy (first 5000): {accuracy * 100:.2f}%")

✅ Sample Accuracy (first 5000): 59.26%


In [11]:
# Accuracy on the first 10,000 test plots.
# Labels and predictions are compared position by position. Slicing both to
# the same prefix length would hide a size mismatch between them, so check
# the full sequences first.
if len(clean_true_labels) != len(predicted_genres):
    raise ValueError(
        f"Label/prediction count mismatch: {len(clean_true_labels)} true labels "
        f"vs {len(predicted_genres)} predictions"
    )

sample_true = clean_true_labels[:10000]
sample_pred = predicted_genres[:10000]

accuracy = accuracy_score(sample_true, sample_pred)
print(f"✅ Sample Accuracy (first 10000): {accuracy * 100:.2f}%")


✅ Sample Accuracy (first 10000): 57.98%


In [12]:
# Write the predicted genres to final_predictions.txt, one genre per line
with open("final_predictions.txt", "w", encoding="utf-8") as output:
    # Build the whole file content first, then write it in a single call
    output.write("".join(genre + "\n" for genre in predicted_genres))

print("✅ Predictions saved to final_predictions.txt")


✅ Predictions saved to final_predictions.txt
