In [1]:
import tensorflow as tf
import pandas as pd 
import os

In [2]:
# Enable on-demand memory growth for every GPU that TensorFlow reports.
# When no GPU is listed the loop body simply never runs.
# `tf.config.list_physical_devices` is the stable name of the older
# `tf.config.experimental.list_physical_devices` alias; `set_memory_growth`
# is still only exposed under `tf.config.experimental`.
gpus = tf.config.list_physical_devices('GPU')
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)

In [3]:
# Print the full path of every file found under the dataset folder,
# one path per line, descending into sub-folders as well.
dataset_root = "C:/Users/igloo/Documents/deep learning projects/movies/archive/Genre Classification Dataset"
for folder, _subdirs, files_here in os.walk(dataset_root):
    for name in files_here:
        print(os.path.join(folder, name))

C:/Users/igloo/Documents/deep learning projects/movies/archive/Genre Classification Dataset\description.txt
C:/Users/igloo/Documents/deep learning projects/movies/archive/Genre Classification Dataset\test_data.txt
C:/Users/igloo/Documents/deep learning projects/movies/archive/Genre Classification Dataset\test_data_solution.txt
C:/Users/igloo/Documents/deep learning projects/movies/archive/Genre Classification Dataset\train_data.txt


In [4]:
def load_and_preprocess_data(filepath, is_test=False):
    """Read a ``' ::: '``-delimited text file and normalise its descriptions.

    Parameters
    ----------
    filepath : str or path-like
        File to read. It is parsed without a header row.
    is_test : bool, default False
        When True the file is expected to hold three fields per line and the
        columns are named ``ID``, ``TITLE``, ``DESCRIPTION``. Otherwise four
        fields are expected and the columns are named ``ID``, ``TITLE``,
        ``GENRE``, ``DESCRIPTION``.

    Returns
    -------
    pandas.DataFrame
        The parsed rows, with ``DESCRIPTION`` lower-cased and every character
        that is neither a word character nor whitespace removed.

    Raises
    ------
    ValueError
        Raised by pandas when the number of parsed columns does not match the
        number of column names being assigned.
    """
    if is_test:
        column_names = ["ID", "TITLE", "DESCRIPTION"]
    else:
        column_names = ["ID", "TITLE", "GENRE", "DESCRIPTION"]

    # A multi-character separator is treated by pandas as a regular
    # expression, which is why the python parsing engine is requested.
    data = pd.read_csv(filepath, sep=' ::: ', engine='python', header=None)
    data.columns = column_names

    # Raw string on purpose: in a normal string literal "\w" and "\s" are
    # invalid escape sequences, which Python warns about.
    data["DESCRIPTION"] = (
        data["DESCRIPTION"]
        .str.lower()
        .str.replace(r'[^\w\s]', '', regex=True)
    )
    return data

In [5]:
# Single place to change where the dataset lives. The file names below are
# appended with "/" so the resulting paths are exactly the ones used before.
DATASET_DIR = "C:/Users/igloo/Documents/deep learning projects/movies/archive/Genre Classification Dataset"

# Labeled training rows (ID, TITLE, GENRE, DESCRIPTION).
train_data = load_and_preprocess_data(f"{DATASET_DIR}/train_data.txt")
# Test rows are loaded with the three-column layout (no GENRE), hence is_test=True.
test_data = load_and_preprocess_data(f"{DATASET_DIR}/test_data.txt", is_test=True)
# The solution file is loaded with the same four-column layout as the training file.
test_solution_data = load_and_preprocess_data(f"{DATASET_DIR}/test_data_solution.txt")

In [6]:
# Features are the cleaned descriptions; targets are the genre labels.
X_train = train_data['DESCRIPTION']
y_train = train_data['GENRE']
X_test = test_data['DESCRIPTION']
y_test = test_solution_data['GENRE']

# X_test and y_test are read from two different files and are paired purely
# by row position, so fail fast if the two files do not line up by ID.
# Otherwise any accuracy computed from them would be silently meaningless.
if test_data['ID'].tolist() != test_solution_data['ID'].tolist():
    raise ValueError(
        "test_data and test_solution_data do not contain the same IDs in the "
        "same order; y_test would be misaligned with X_test"
    )

In [7]:
# Quick look at the training descriptions after lower-casing and punctuation removal.
X_train

0        listening in to a conversation between his doc...
1        a brother and sister with a past incestuous re...
2        as the bus empties the students for their fiel...
3        to help their unemployed father make ends meet...
4        the films title refers not only to the unrecov...
                               ...                        
54209    this shortlived nbc live sitcom centered on bo...
54210    the next generation of exploitation the sister...
54211    ze bestaan echt is a standup comedy about grow...
54212    walter and vivian live in the country and have...
54213    on labor day weekend 1935 the most intense hur...
Name: DESCRIPTION, Length: 54214, dtype: object

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features with English stop words removed; max_df=0.7 drops terms
# that occur in more than 70% of the training documents.
vectorizer = TfidfVectorizer(
    max_df=0.7,
    stop_words='english',
)

# The vocabulary and IDF weights are learned from the training text only and
# are then reused as-is to encode the test text.
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [9]:
from sklearn.naive_bayes import MultinomialNB

# Baseline classifier: multinomial Naive Bayes with default settings,
# trained on the TF-IDF matrix of the training descriptions.
nb_model = MultinomialNB()
nb_model.fit(X=X_train_tfidf, y=y_train)

In [10]:
from sklearn.metrics import accuracy_score

# Predicted genre for every test description, using the fitted Naive Bayes model.
nb_predictions = nb_model.predict(X=X_test_tfidf)

In [11]:
# Fraction of test rows whose predicted genre equals the reference label,
# reported as a percentage with four decimals.
nb_accuracy = accuracy_score(y_test, nb_predictions)
print(f"Accuracy: {nb_accuracy * 100:.4f}%")

Accuracy: 44.3948%


In [14]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

# NOTE(review): `scaler` is created but never applied in this cell. The model
# below is fit directly on the TF-IDF matrix. It is kept only in case another
# cell relies on the name; confirm and delete if nothing does.
scaler = StandardScaler()

# 'sag' is a stochastic solver, so without a fixed random_state two runs of
# this cell can give slightly different coefficients and accuracy. Pinning the
# seed makes the result reproducible under Restart & Run All.
lr_model = LogisticRegression(
    max_iter=1000,
    solver='sag',
    penalty='l2',
    tol=1e-3,
    random_state=42,
)
lr_model.fit(X_train_tfidf, y_train)

In [15]:
from sklearn.metrics import accuracy_score

# Score the fitted logistic-regression model on the test descriptions and
# report the share of correct genre predictions as a percentage.
lr_predictions = lr_model.predict(X=X_test_tfidf)
lr_accuracy = accuracy_score(y_test, lr_predictions)
print(f"Accuracy: {lr_accuracy * 100:.4f}%")

Accuracy: 58.6513%
