In [18]:
import pandas as pd 
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [5]:
try:
    train_df = pd.read_csv("train_data.txt",sep = ':::',header = None,names = ['ID','Title','Genre','Description'],engine = 'python')
    print("train_df loaded successfully.")

    test_df = pd.read_csv('test_data.txt',sep=':::',header = None,names=['ID','Title','Description'],engine = 'python')
    print("test_df loaded successfully.")

    test_sol_df = pd.read_csv("test_data_solution.txt",sep=':::',header = None,names=['ID','Title','Genre','Description'],engine='python')
    print("test_data_sol loaded successfully")

except Error as e:
    print(f"Error")

train_df loaded successfully.
test_df loaded successfully.
test_data_sol loaded successfully


In [19]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Anish\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Anish\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Anish\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [6]:
train_df.head()

Unnamed: 0,ID,Title,Genre,Description
0,1,Oscar et la dame rose (2009),drama,Listening in to a conversation between his do...
1,2,Cupid (1997),thriller,A brother and sister with a past incestuous r...
2,3,"Young, Wild and Wonderful (1980)",adult,As the bus empties the students for their fie...
3,4,The Secret Sin (1915),drama,To help their unemployed father make ends mee...
4,5,The Unrecovered (2007),drama,The film's title refers not only to the un-re...


In [9]:
test_df.head()

Unnamed: 0,ID,Title,Description
0,1,Edgar's Lunch (1998),"L.R. Brane loves his life - his car, his apar..."
1,2,La guerra de papá (1977),"Spain, March 1964: Quico is a very naughty ch..."
2,3,Off the Beaten Track (2010),One year in the life of Albin and his family ...
3,4,Meu Amigo Hindu (2015),"His father has died, he hasn't spoken with hi..."
4,5,Er nu zhai (1955),Before he was known internationally as a mart...


In [11]:
test_sol_df.head()

Unnamed: 0,ID,Title,Genre,Description
0,1,Edgar's Lunch (1998),thriller,"L.R. Brane loves his life - his car, his apar..."
1,2,La guerra de papá (1977),comedy,"Spain, March 1964: Quico is a very naughty ch..."
2,3,Off the Beaten Track (2010),documentary,One year in the life of Albin and his family ...
3,4,Meu Amigo Hindu (2015),drama,"His father has died, he hasn't spoken with hi..."
4,5,Er nu zhai (1955),drama,Before he was known internationally as a mart...


In [8]:
train_df.columns
test_df.columns
test_sol_df.columns

Index(['ID', 'Title', 'Genre', 'Description'], dtype='object')

In [12]:
train_df.isnull().sum()

ID             0
Title          0
Genre          0
Description    0
dtype: int64

In [13]:
test_df.isnull().sum()

ID             0
Title          0
Description    0
dtype: int64

In [14]:
test_sol_df.isnull().sum()

ID             0
Title          0
Genre          0
Description    0
dtype: int64

In [15]:
X_train_text = train_df['Description']
y_test = test_sol_df['Genre']

In [16]:
X_train_text.shape[0]

54214

In [17]:
X_test_text = test_sol_df['Description']
y_train = train_df['Genre']

In [20]:
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

In [21]:
def preprocess_text(text):
    if not isinstance(text, str):
        return ""
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)
    tokens = nltk.word_tokenize(text)
    processed_tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words and len(word) > 2]
    return ' '.join(processed_tokens)

In [22]:
 X_train_processed = X_train_text.apply(preprocess_text)

In [23]:
X_test_processed = X_test_text.apply(preprocess_text)

In [24]:
X_train_processed.head()

0    listening conversation doctor parent yearold o...
1    brother sister past incestuous relationship cu...
2    bus empty student field trip museum natural hi...
3    help unemployed father make end meet edith twi...
4    film title refers unrecovered body ground zero...
Name: Description, dtype: object

In [25]:
from sklearn.feature_extraction.text import TfidfVectorizer
if 'X_train_processed' in locals() and 'X_test_processed' in locals() and X_train_processed is not None and X_test_processed is not None:
    tfidf_vectorizer = TfidfVectorizer(max_features=5000, stop_words='english') 
    X_train_tfidf = tfidf_vectorizer.fit_transform(X_train_processed)
    X_test_tfidf = tfidf_vectorizer.transform(X_test_processed)
    print(f"Shape of TF-IDF matrix for training data (X_train_tfidf): {X_train_tfidf.shape}")
    print(f"Shape of TF-IDF matrix for test data (X_test_tfidf): {X_test_tfidf.shape}")

else:
    print("\nError: Processed text data (X_train_processed or X_test_processed) not found or is None. Cannot apply TF-IDF.")
    X_train_tfidf, X_test_tfidf = None, None 

Shape of TF-IDF matrix for training data (X_train_tfidf): (54214, 5000)
Shape of TF-IDF matrix for test data (X_test_tfidf): (54200, 5000)


In [26]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC 
from sklearn.metrics import accuracy_score, classification_report

if ('X_train_tfidf' in locals() and 'y_train' in locals() and
    'X_test_tfidf' in locals() and 'y_test' in locals() and
    X_train_tfidf is not None and X_test_tfidf is not None):

    print("\nTraining Multinomial Naive Bayes...")
    nb_classifier = MultinomialNB()
    nb_classifier.fit(X_train_tfidf, y_train)

    print("Predicting on test data (Naive Bayes)...")
    y_pred_nb = nb_classifier.predict(X_test_tfidf)

    accuracy_nb = accuracy_score(y_test, y_pred_nb)
    print(f"Multinomial Naive Bayes Accuracy: {accuracy_nb:.4f}")
    print("\nClassification Report (Naive Bayes):\n", classification_report(y_test, y_pred_nb, zero_division=0))

    print("\nTraining Logistic Regression...")
    lr_classifier = LogisticRegression(max_iter=1000, C=1.0, solver='liblinear', random_state=42)
    lr_classifier.fit(X_train_tfidf, y_train)

    print("Predicting on test data (Logistic Regression)...")
    y_pred_lr = lr_classifier.predict(X_test_tfidf)

    accuracy_lr = accuracy_score(y_test, y_pred_lr)
    print(f"Logistic Regression Accuracy: {accuracy_lr:.4f}")
    print("\nClassification Report (Logistic Regression):\n", classification_report(y_test, y_pred_lr, zero_division=0))

    print("\nTraining Linear SVM (LinearSVC)...")
    svm_classifier = LinearSVC(C=1.0, random_state=42, dual=False, max_iter=1000) 
    svm_classifier.fit(X_train_tfidf, y_train)

    print("Predicting on test data (Linear SVM)...")
    y_pred_svm = svm_classifier.predict(X_test_tfidf)

    accuracy_svm = accuracy_score(y_test, y_pred_svm)
    print(f"Linear SVM Accuracy: {accuracy_svm:.4f}")

    print("\n--- Detailed Classification Report for Linear SVM ---")
    print(classification_report(y_test, y_pred_svm, zero_division=0))


else:
    print("\nError: TF-IDF matrices or labels are missing. Cannot train models.")


Training Multinomial Naive Bayes...
Predicting on test data (Naive Bayes)...
Multinomial Naive Bayes Accuracy: 0.5224

Classification Report (Naive Bayes):
                precision    recall  f1-score   support

      action        0.53      0.10      0.17      1314
       adult        0.51      0.08      0.14       590
   adventure        0.79      0.10      0.17       775
   animation        0.00      0.00      0.00       498
   biography        0.00      0.00      0.00       264
      comedy        0.52      0.44      0.48      7446
       crime        0.00      0.00      0.00       505
 documentary        0.57      0.87      0.68     13096
       drama        0.46      0.82      0.59     13612
      family        0.50      0.00      0.00       783
     fantasy        0.00      0.00      0.00       322
   game-show        0.98      0.27      0.43       193
     history        0.00      0.00      0.00       243
      horror        0.69      0.35      0.47      2204
       music    