In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import re  # for pattern matching and text manipulation.
import string 
import nltk 
from sklearn.preprocessing import LabelEncoder
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer as CV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split 
from sklearn.metrics import accuracy_score 

In [2]:
train_data_path = 'C:\\Users\\Acer\\Downloads\\Genre Classification Dataset\\train_data.txt'
test_data_path = 'C:\\Users\\Acer\\Downloads\\Genre Classification Dataset\\test_data.txt'

In [3]:
# Reading the training data
train_data = pd.read_csv(train_data_path, delimiter=' ::: ', engine='python', names=["Id", "Movie", "Genre", "Description"])
test_data = pd.read_csv(test_data_path, delimiter=' ::: ', engine='python', names=['Id', 'Movie', 'Description'])

In [4]:
train_data.head()


Unnamed: 0,Id,Movie,Genre,Description
0,1,Oscar et la dame rose (2009),drama,Listening in to a conversation between his doc...
1,2,Cupid (1997),thriller,A brother and sister with a past incestuous re...
2,3,"Young, Wild and Wonderful (1980)",adult,As the bus empties the students for their fiel...
3,4,The Secret Sin (1915),drama,To help their unemployed father make ends meet...
4,5,The Unrecovered (2007),drama,The film's title refers not only to the un-rec...


In [5]:
test_data.head()

Unnamed: 0,Id,Movie,Description
0,1,Edgar's Lunch (1998),"L.R. Brane loves his life - his car, his apart..."
1,2,La guerra de papá (1977),"Spain, March 1964: Quico is a very naughty chi..."
2,3,Off the Beaten Track (2010),One year in the life of Albin and his family o...
3,4,Meu Amigo Hindu (2015),"His father has died, he hasn't spoken with his..."
4,5,Er nu zhai (1955),Before he was known internationally as a marti...


In [6]:
train_data.isnull().sum()

Id             0
Movie          0
Genre          0
Description    0
dtype: int64

In [7]:
test_data.isnull().sum()

Id             0
Movie          0
Description    0
dtype: int64

In [8]:
test_data.shape

(54200, 3)

In [9]:
train_data.duplicated().sum()

0

In [10]:
test_data.duplicated().sum()

0

In [11]:
train_data.drop(columns=['Movie'], inplace = True)
test_data.drop(columns=['Movie'], inplace = True)
train_data.drop(columns=['Id'], inplace = True)
test_data.drop(columns=['Id'], inplace = True)


In [12]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54214 entries, 0 to 54213
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Genre        54214 non-null  object
 1   Description  54214 non-null  object
dtypes: object(2)
memory usage: 847.2+ KB


In [13]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54200 entries, 0 to 54199
Data columns (total 1 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Description  54200 non-null  object
dtypes: object(1)
memory usage: 423.6+ KB


In [14]:
test_data.shape

(54200, 1)

In [15]:
train_data.shape

(54214, 2)

In [16]:
nltk.download('stopwords') 
nltk.download('punkt')     

def clean_description(text):
    text = text.lower()  
    text = re.sub(r'@\S+', '', text)  
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'pic.\S+', '', text) 
    text = re.sub(r"[^a-zA-Z\s']", ' ', text) 
    text = re.sub(r'\s+[a-zA-Z]\s+', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english')) 
    text = ' '.join([word for word in tokens if word not in stop_words and len(word) > 2])  # Remove stopwords and keep words longer than 2 characters
    return text


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Acer\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Acer\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [17]:
train_data['Description_cleaned'] = train_data['Description'].apply(clean_description)
test_data['Description_cleaned'] = test_data['Description'].apply(clean_description)


In [18]:
# Ensure that the 'Genre' column is still present
print(train_data.columns)


Index(['Genre', 'Description', 'Description_cleaned'], dtype='object')


In [19]:
st = PorterStemmer()
train_data['Description_cleaned'] = train_data['Description_cleaned'].apply(
    lambda x: ' '.join([st.stem(word) for word in x.split()]))

test_data['Description_cleaned'] = test_data['Description_cleaned'].apply(
    lambda x: ' '.join([st.stem(word) for word in x.split()]))

In [20]:
train_data['length'] = train_data['Description'].apply(len)
train_data['length_cleaned']=train_data['Description_cleaned'].apply(len)
train_data.head()

Unnamed: 0,Genre,Description,Description_cleaned,length,length_cleaned
0,drama,Listening in to a conversation between his doc...,listen convers doctor parent year old oscar le...,545,339
1,thriller,A brother and sister with a past incestuous re...,brother sister past incestu relationship curre...,183,111
2,adult,As the bus empties the students for their fiel...,bu empti student field trip museum natur histo...,649,343
3,drama,To help their unemployed father make ends meet...,help unemploy father make end meet edith twin ...,1081,678
4,drama,The film's title refers not only to the un-rec...,film titl refer recov bodi ground zero also st...,624,347


In [21]:
X_train,X_test,y_train,y_test = train_test_split(train_data['Description_cleaned'],train_data['Genre'],random_state=42,test_size=0.2)

In [22]:
print(X_train)

19253    sex betray seduct manipul addict love hate van...
12261    autist teen toni friendship cameron boy still ...
37143    one hour special examin social ethnic religi e...
21965    person narr documentari use bicycl ride work v...
7033     easi go villag policeman artturi sakari reinik...
                               ...                        
11284    david cronenberg well known canadian director ...
44732    adan neumann member jewish famili live berlin ...
38158    miranda lesbian photograph guayaquil live buen...
860      enemi profession live chiang chih heng 'melvin...
15795    fashion design malan breton revisit birth rite...
Name: Description_cleaned, Length: 43371, dtype: object


In [23]:
# using TF-IDF
vectorize = TfidfVectorizer(stop_words='english', max_features=100000)
train_tfidf = vectorize.fit_transform(X_train)
test_tfidf = vectorize.transform(X_test)
ectorizer = TfidfVectorizer(stop_words='english', max_features=100000)

In [24]:
lr_reg = LogisticRegression(max_iter=1000, solver='lbfgs')
lr_reg.fit(train_tfidf,y_train)
y_pred_lr = lr_reg.predict(test_tfidf)
lr_accuracy = accuracy_score(y_test, y_pred_lr)
print(lr_accuracy)

0.5795444065295582


In [25]:
NB_model = MultinomialNB()
NB_model.fit(train_tfidf, y_train)
y_pred_nb = NB_model.predict(test_tfidf)
nb_accuracy= accuracy_score(y_test, y_pred_nb)
print(nb_accuracy)

0.44065295582403397


In [26]:
from sklearn.tree import DecisionTreeClassifier
classifier_tree = DecisionTreeClassifier(criterion = 'entropy', random_state = 0)
classifier_tree.fit(train_tfidf, y_train)
y_pred_tree = classifier_tree.predict(test_tfidf)
tree_accuracy = accuracy_score(y_test, y_pred_tree)
print(tree_accuracy)

0.3474130775615605


In [27]:
from sklearn.neighbors import KNeighborsClassifier
classifier_kn = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2)
classifier_kn.fit(train_tfidf, y_train)
y_pred_kn = classifier_kn.predict(test_tfidf)
kn_accuracy= accuracy_score(y_test, y_pred_kn)
print(kn_accuracy)

0.3909434658304897


In [28]:
from sklearn.ensemble import RandomForestClassifier
classifier_rf = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 0)
classifier_rf.fit(train_tfidf, y_train)
y_pred_rf = classifier_rf.predict(test_tfidf)
rf_accuracy = accuracy_score(y_test, y_pred_rf)
print(rf_accuracy)

0.4273724983860555


In [None]:
from sklearn.svm import SVC
classifier_svm = SVC(kernel = 'linear', random_state = 0)
classifier_svm.fit(train_tfidf, y_train)
y_pred_svm = classifier_svm.predict(test_tfidf)
svm_accuracy = accuracy_score(y_test, y_pred_svm)
print(svm_accuracy)

In [None]:
import matplotlib.pyplot as plt

# Assuming you have already computed the accuracies for each classifier
accuracies = {
    'Logistic Regression': lr_accuracy,
    'Naive Bayes': nb_accuracy,
    'Decision Tree': tree_accuracy,
    'K-Nearest Neighbors': kn_accuracy,
    'Random Forest': rf_accuracy,
    'SVM': svm_accuracy
}

# Create a bar plot for the classifier accuracies
fig, ax = plt.subplots()
ax.bar(accuracies.keys(), accuracies.values(), color='orange')
ax.set_xlabel('Classifier')
ax.set_ylabel('Accuracy')
ax.set_title('Classifier Accuracy Comparison')
plt.xticks(rotation=45)
plt.ylim(0, 1)  # Accuracy ranges from 0 to 1

# Show the plot
plt.tight_layout()
plt.show()
