In [3]:
import pandas as pd
import numpy as np

In [4]:
def load_data(file_path, sep=":::"):
    """Read a delimiter-separated text file into a list of rows.

    Parameters
    ----------
    file_path : str or path-like
        File to read; it is decoded as UTF-8.
    sep : str, optional
        Field delimiter. Defaults to ``":::"``.

    Returns
    -------
    list[list[str]]
        One list of fields per non-blank line. Whitespace surrounding each
        field is removed, so a line written as ``"1 ::: drama"`` yields
        ``["1", "drama"]`` rather than ``["1 ", " drama"]``.
    """
    rows = []
    with open(file_path, "r", encoding="utf-8") as f:
        # Iterate the file lazily instead of materialising it with readlines().
        for line in f:
            line = line.strip()
            if not line:
                # A blank (e.g. trailing) line would otherwise become a
                # one-field row that does not match the other rows.
                continue
            # Strip every field: padding around the delimiter must not leak
            # into IDs, labels or text.
            rows.append([field.strip() for field in line.split(sep)])
    return rows

In [5]:
# Parse the three raw text files into lists of fields.
train_data = load_data("train_data.txt")
test_data = load_data("test_data.txt")
test_sol = load_data("test_data_solution.txt")

# Wrap each parsed file in a DataFrame. The test frame is built without a
# GENRE column; the solution frame carries it.
train_df = pd.DataFrame(data=train_data, columns=["ID", "TITLE", "GENRE", "DESCRIPTION"])
test_df = pd.DataFrame(data=test_data, columns=["ID", "TITLE", "DESCRIPTION"])
test_sol_df = pd.DataFrame(data=test_sol, columns=["ID", "TITLE", "GENRE", "DESCRIPTION"])


In [None]:
# Preview the first five rows of the training frame.
train_df.head(n=5)

Unnamed: 0,ID,TITLE,GENRE,DESCRIPTION
0,1,Oscar et la dame rose (2009),drama,Listening in to a conversation between his do...
1,2,Cupid (1997),thriller,A brother and sister with a past incestuous r...
2,3,"Young, Wild and Wonderful (1980)",adult,As the bus empties the students for their fie...
3,4,The Secret Sin (1915),drama,To help their unemployed father make ends mee...
4,5,The Unrecovered (2007),drama,The film's title refers not only to the un-re...


In [None]:
# Preview the first five rows of the test frame.
test_df.head(n=5)

Unnamed: 0,ID,TITLE,DESCRIPTION
0,1,Edgar's Lunch (1998),"L.R. Brane loves his life - his car, his apar..."
1,2,La guerra de papá (1977),"Spain, March 1964: Quico is a very naughty ch..."
2,3,Off the Beaten Track (2010),One year in the life of Albin and his family ...
3,4,Meu Amigo Hindu (2015),"His father has died, he hasn't spoken with hi..."
4,5,Er nu zhai (1955),Before he was known internationally as a mart...


# Ground-truth test labels and genre classification models

In [None]:
# Preview the first five rows of the test-solution frame.
test_sol_df.head(n=5)

Unnamed: 0,ID,TITLE,GENRE,DESCRIPTION
0,1,Edgar's Lunch (1998),thriller,"L.R. Brane loves his life - his car, his apar..."
1,2,La guerra de papá (1977),comedy,"Spain, March 1964: Quico is a very naughty ch..."
2,3,Off the Beaten Track (2010),documentary,One year in the life of Albin and his family ...
3,4,Meu Amigo Hindu (2015),drama,"His father has died, he hasn't spoken with hi..."
4,5,Er nu zhai (1955),drama,Before he was known internationally as a mart...


In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit the TF-IDF vectorizer (capped at 10,000 features) on the training
# descriptions only, then transform the test descriptions with that same
# fitted vectorizer.
vectorizer = TfidfVectorizer(max_features=10000)

train_descriptions = train_df["DESCRIPTION"]
test_descriptions = test_df["DESCRIPTION"]

X_train_tfidf = vectorizer.fit_transform(train_descriptions)
X_test_tfidf = vectorizer.transform(test_descriptions)


In [None]:
# Display the dimensions of the training TF-IDF matrix.
X_train_tfidf.shape

(54214, 10000)

In [7]:
from sklearn.preprocessing import LabelEncoder

# Learn the genre -> integer mapping from the training labels, then apply
# that same mapping to the training labels and to the test-solution labels.
label_encoder = LabelEncoder()
label_encoder.fit(train_df["GENRE"])

y_train = label_encoder.transform(train_df["GENRE"])
y_test = label_encoder.transform(test_sol_df["GENRE"])

In [None]:
from sklearn.linear_model import LogisticRegression

# Train a logistic-regression classifier on the TF-IDF features
# (fit() returns the fitted estimator itself).
lr_model = LogisticRegression(max_iter=1000).fit(X_train_tfidf, y_train)

# Predict encoded labels for the test features and map them back to genre names.
y_pred = lr_model.predict(X_test_tfidf)
predicted_genres = label_encoder.inverse_transform(y_pred)

# Store the predictions next to the test rows and preview the result.
test_df["Predicted_Genre"] = predicted_genres
test_df.head(n=5)

Unnamed: 0,ID,TITLE,DESCRIPTION,Predicted_Genre
0,1,Edgar's Lunch (1998),"L.R. Brane loves his life - his car, his apar...",drama
1,2,La guerra de papá (1977),"Spain, March 1964: Quico is a very naughty ch...",drama
2,3,Off the Beaten Track (2010),One year in the life of Albin and his family ...,documentary
3,4,Meu Amigo Hindu (2015),"His father has died, he hasn't spoken with hi...",drama
4,5,Er nu zhai (1955),Before he was known internationally as a mart...,drama


In [None]:
# Line up ground-truth and predicted genres by ID.
# validate="one_to_one" makes pandas raise a MergeError if an ID repeats on
# either side; without it a duplicated ID would silently multiply rows and
# distort every metric computed from merge_df.
merge_df = pd.merge(
    test_sol_df[["ID", "GENRE"]],
    test_df[["ID", "Predicted_Genre"]],
    on="ID",
    validate="one_to_one",
)

merge_df.head()

Unnamed: 0,ID,GENRE,Predicted_Genre
0,1,thriller,drama
1,2,comedy,drama
2,3,documentary,documentary
3,4,drama,drama
4,5,drama,drama


In [None]:
from sklearn.metrics import accuracy_score, classification_report

# Score the predictions currently held in merge_df against the true genres.
accuracy = accuracy_score(merge_df["GENRE"], merge_df["Predicted_Genre"])
print(f"Accuracy: {accuracy:.3f}")

# zero_division=0 reports 0.0 for genres that are never predicted (the same
# value the default produces) without emitting an UndefinedMetricWarning
# for each of them.
print(
    "Classification Report:\n",
    classification_report(merge_df["GENRE"], merge_df["Predicted_Genre"], zero_division=0),
)

Accuracy: 0.595


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Classification Report:
                precision    recall  f1-score   support

      action        0.50      0.29      0.37      1314
       adult        0.65      0.24      0.35       590
   adventure        0.67      0.16      0.25       775
   animation        0.61      0.04      0.08       498
   biography        0.00      0.00      0.00       264
      comedy        0.54      0.60      0.57      7446
       crime        0.41      0.03      0.06       505
 documentary        0.68      0.87      0.76     13096
       drama        0.55      0.79      0.65     13612
      family        0.49      0.08      0.14       783
     fantasy        0.61      0.03      0.06       322
   game-show        0.90      0.49      0.64       193
     history        0.00      0.00      0.00       243
      horror        0.66      0.57      0.61      2204
       music        0.68      0.46      0.55       731
     musical        0.44      0.01      0.03       276
     mystery        0.33      0.00      

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
from sklearn.metrics import accuracy_score, classification_report

# NOTE(review): this cell scores the same merge_df as the preceding
# evaluation cell, so accuracy_nb equals that cell's accuracy; no separate
# model is fitted for it in the visible notebook. If "nb" was meant to denote
# another model (e.g. Naive Bayes), its predictions must be written to
# test_df and merged before this cell runs -- confirm.
accuracy_nb = accuracy_score(merge_df["GENRE"], merge_df["Predicted_Genre"])
print(f"Accuracy: {accuracy_nb:.3f}")

# zero_division=0 reports 0.0 for genres that are never predicted (the same
# value the default produces) without emitting an UndefinedMetricWarning
# for each of them.
print(
    "Classification Report:\n",
    classification_report(merge_df["GENRE"], merge_df["Predicted_Genre"], zero_division=0),
)

Accuracy: 0.595


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Classification Report:
                precision    recall  f1-score   support

      action        0.50      0.29      0.37      1314
       adult        0.65      0.24      0.35       590
   adventure        0.67      0.16      0.25       775
   animation        0.61      0.04      0.08       498
   biography        0.00      0.00      0.00       264
      comedy        0.54      0.60      0.57      7446
       crime        0.41      0.03      0.06       505
 documentary        0.68      0.87      0.76     13096
       drama        0.55      0.79      0.65     13612
      family        0.49      0.08      0.14       783
     fantasy        0.61      0.03      0.06       322
   game-show        0.90      0.49      0.64       193
     history        0.00      0.00      0.00       243
      horror        0.66      0.57      0.61      2204
       music        0.68      0.46      0.55       731
     musical        0.44      0.01      0.03       276
     mystery        0.33      0.00      

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [8]:
from sklearn.svm import LinearSVC

# Train a linear SVM on the TF-IDF features.
# random_state pins the pseudo-random data shuffling the solver performs when
# it optimises the dual problem, so a Restart & Run All reproduces the same
# model; it has no effect when the primal problem is solved instead.
model = LinearSVC(random_state=42)
model.fit(X_train_tfidf, y_train)

# Predict encoded labels for the test features and map them back to genre names.
y_pred = model.predict(X_test_tfidf)
predicted_genres = label_encoder.inverse_transform(y_pred)

# Store the predictions in test_df["Predicted_Genre"] (replacing any values
# already in that column) and preview the result.
test_df["Predicted_Genre"] = predicted_genres
test_df.head()


Unnamed: 0,ID,TITLE,DESCRIPTION,Predicted_Genre
0,1,Edgar's Lunch (1998),"L.R. Brane loves his life - his car, his apar...",drama
1,2,La guerra de papá (1977),"Spain, March 1964: Quico is a very naughty ch...",drama
2,3,Off the Beaten Track (2010),One year in the life of Albin and his family ...,documentary
3,4,Meu Amigo Hindu (2015),"His father has died, he hasn't spoken with hi...",drama
4,5,Er nu zhai (1955),Before he was known internationally as a mart...,drama


In [9]:
# Line up ground-truth and predicted genres by ID.
# validate="one_to_one" makes pandas raise a MergeError if an ID repeats on
# either side; without it a duplicated ID would silently multiply rows and
# distort every metric computed from merge_df.
merge_df = pd.merge(
    test_sol_df[["ID", "GENRE"]],
    test_df[["ID", "Predicted_Genre"]],
    on="ID",
    validate="one_to_one",
)

merge_df.head()

Unnamed: 0,ID,GENRE,Predicted_Genre
0,1,thriller,drama
1,2,comedy,drama
2,3,documentary,documentary
3,4,drama,drama
4,5,drama,drama


In [10]:
from sklearn.metrics import accuracy_score, classification_report

# Overall accuracy of the predictions currently held in merge_df.
accuracy = accuracy_score(y_true=merge_df["GENRE"], y_pred=merge_df["Predicted_Genre"])
print(f"Accuracy: {accuracy:.3f}")

# Per-genre precision / recall / F1 for the same predictions.
print(
    "Classification Report:\n",
    classification_report(y_true=merge_df["GENRE"], y_pred=merge_df["Predicted_Genre"]),
)

Accuracy: 0.583
Classification Report:
                precision    recall  f1-score   support

      action        0.41      0.33      0.36      1314
       adult        0.59      0.42      0.49       590
   adventure        0.42      0.20      0.27       775
   animation        0.33      0.15      0.20       498
   biography        0.00      0.00      0.00       264
      comedy        0.55      0.59      0.57      7446
       crime        0.21      0.07      0.10       505
 documentary        0.70      0.83      0.76     13096
       drama        0.57      0.71      0.63     13612
      family        0.33      0.15      0.21       783
     fantasy        0.25      0.08      0.12       322
   game-show        0.79      0.64      0.70       193
     history        0.18      0.02      0.03       243
      horror        0.60      0.61      0.61      2204
       music        0.59      0.50      0.54       731
     musical        0.29      0.07      0.12       276
     mystery        0.21