In [1]:
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
train_file = r"C:\Users\USER\Documents\CODSOFT\CODSOFT\Task1_MovieGenreClassification\data\Genre Classification Dataset\train_data.txt"
test_file = r"C:\Users\USER\Documents\CODSOFT\CODSOFT\Task1_MovieGenreClassification\data\Genre Classification Dataset\test_data_solution.txt"
train_data = pd.read_csv(train_file, sep=':::', names=['title', 'genre', 'description'], engine='python')
test_data = pd.read_csv(test_file, sep=':::', names=['title', 'genre', 'description'], engine='python')

#Remove space
train_data['title'] = train_data['title'].astype(str).str.strip()
train_data['genre'] = train_data['genre'].astype(str).str.strip()
train_data['description'] = train_data['description'].astype(str).str.strip()

test_data['title'] = test_data['title'].astype(str).str.strip()
test_data['genre'] = test_data['genre'].astype(str).str.strip()
test_data['description'] = test_data['description'].astype(str).str.strip()

print("Training Data:")
print(train_data.head(10))

print("Testing Data:")
print(test_data.head(10))


Training Data:
                               title        genre  \
1       Oscar et la dame rose (2009)        drama   
2                       Cupid (1997)     thriller   
3   Young, Wild and Wonderful (1980)        adult   
4              The Secret Sin (1915)        drama   
5             The Unrecovered (2007)        drama   
6             Quality Control (2011)  documentary   
7                 "Pink Slip" (2009)       comedy   
8               One Step Away (1985)        crime   
9           "Desperate Hours" (2016)   reality-tv   
10                  Spirits (2014/I)       horror   

                                          description  
1   Listening in to a conversation between his doc...  
2   A brother and sister with a past incestuous re...  
3   As the bus empties the students for their fiel...  
4   To help their unemployed father make ends meet...  
5   The film's title refers not only to the un-rec...  
6   Quality Control consists of a series of 16mm s...  
7   In to

In [3]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 54214 entries, 1 to 54214
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   title        54214 non-null  object
 1   genre        54214 non-null  object
 2   description  54214 non-null  object
dtypes: object(3)
memory usage: 1.7+ MB


In [4]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 54200 entries, 1 to 54200
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   title        54200 non-null  object
 1   genre        54200 non-null  object
 2   description  54200 non-null  object
dtypes: object(3)
memory usage: 1.7+ MB


In [5]:
train_data.isnull().sum()
test_data.isnull().sum()

title          0
genre          0
description    0
dtype: int64

In [6]:
train_data['length']=train_data['description'].apply(len)
train_data.head()

Unnamed: 0,title,genre,description,length
1,Oscar et la dame rose (2009),drama,Listening in to a conversation between his doc...,545
2,Cupid (1997),thriller,A brother and sister with a past incestuous re...,183
3,"Young, Wild and Wonderful (1980)",adult,As the bus empties the students for their fiel...,649
4,The Secret Sin (1915),drama,To help their unemployed father make ends meet...,1081
5,The Unrecovered (2007),drama,The film's title refers not only to the un-rec...,624


In [7]:
train_data.dtypes
test_data.dtypes


title          object
genre          object
description    object
dtype: object

In [8]:
import nltk
nltk.download('stopwords')

def clean_text(text):
    # Convert to lowercase
    text = text.lower()
    
    # Remove extra spaces
    text = re.sub(r'\s+', ' ', text).strip()
    
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    
    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)
    
    # Remove stopwords 
    stop_words = set(stopwords.words('english'))
    text = ' '.join([word for word in text.split() if word not in stop_words])
    
    return text

train_data['title'] = train_data['title'].apply(clean_text)
train_data['description'] = train_data['description'].apply(clean_text)

test_data['title'] = test_data['title'].apply(clean_text)
test_data['description'] = test_data['description'].apply(clean_text)


train_data.dropna(subset=['title', 'genre', 'description'], inplace=True)
test_data.dropna(subset=['title', 'genre', 'description'], inplace=True)


print(train_data.head())
print(test_data.head())


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


                   title     genre  \
1  oscar et la dame rose     drama   
2                  cupid  thriller   
3   young wild wonderful     adult   
4             secret sin     drama   
5            unrecovered     drama   

                                         description  length  
1  listening conversation doctor parents yearold ...     545  
2  brother sister past incestuous relationship cu...     183  
3  bus empties students field trip museum natural...     649  
4  help unemployed father make ends meet edith tw...    1081  
5  films title refers unrecovered bodies ground z...     624  
               title        genre  \
1       edgars lunch     thriller   
2  la guerra de papá       comedy   
3       beaten track  documentary   
4    meu amigo hindu        drama   
5         er nu zhai        drama   

                                         description  
1  lr brane loves life car apartment job especial...  
2  spain march quico naughty child three belongin...  
3  on

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Combine title and description into a single text feature
train_data['text'] = train_data['title'] + ' ' + train_data['description']
test_data['text'] = test_data['title'] + ' ' + test_data['description']

# Vectorize the text data using TF-IDF
tfidf_vectorizer = TfidfVectorizer(max_features=5000)  
X_train = tfidf_vectorizer.fit_transform(train_data['text'])
X_test = tfidf_vectorizer.transform(test_data['text'])

print(X_train)
print(X_test)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 2003977 stored elements and shape (54214, 5000)>
  Coords	Values
  (0, 3163)	0.46680441246150617
  (0, 1535)	0.1405341115498038
  (0, 2535)	0.09962703636907784
  (0, 3835)	0.410895355561455
  (0, 2655)	0.16324421026012875
  (0, 959)	0.13829426524377603
  (0, 1300)	0.11505525457300222
  (0, 3209)	0.09675033116010365
  (0, 4978)	0.11155000768654634
  (0, 2591)	0.10814767972262267
  (0, 3056)	0.1410449308659812
  (0, 1000)	0.13064471239530415
  (0, 4436)	0.10838406133111116
  (0, 4875)	0.12665659810177385
  (0, 2659)	0.1658072631185788
  (0, 1861)	0.15861592044005499
  (0, 3666)	0.1222551995282902
  (0, 4181)	0.1309706790476679
  (0, 206)	0.11984030497857097
  (0, 1569)	0.13675112333539702
  (0, 2543)	0.12000674516983546
  (0, 3313)	0.16420004200124727
  (0, 2835)	0.09066229404246878
  (0, 2164)	0.11712760556547269
  (0, 750)	0.1299544403958376
  :	:
  (54212, 3013)	0.10560382997731643
  (54212, 965)	0.21878502962835478
  (5421

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OrdinalEncoder

vectorizer = TfidfVectorizer(max_features=5000)
X_train = vectorizer.fit_transform(train_data['description'])
X_test = vectorizer.transform(test_data['description'])

#  OrdinalEncoder
encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
y_train = encoder.fit_transform(train_data[['genre']]).ravel()
y_test = encoder.transform(test_data[['genre']]).ravel()

print(y_train)
print(y_test)


[ 8. 24.  1. ...  7.  5. 12.]
[24.  5.  7. ...  1.  8.  8.]


In [11]:

genres_mapping = encoder.categories_[0]

for i, genre in enumerate(genres_mapping):
    print(f"{i}: {genre}")


0: action
1: adult
2: adventure
3: animation
4: biography
5: comedy
6: crime
7: documentary
8: drama
9: family
10: fantasy
11: game-show
12: history
13: horror
14: music
15: musical
16: mystery
17: news
18: reality-tv
19: romance
20: sci-fi
21: short
22: sport
23: talk-show
24: thriller
25: war
26: western


In [12]:
from sklearn.metrics import classification_report, roc_auc_score, accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.preprocessing import label_binarize
from imblearn.over_sampling import SMOTE

#class imbalance using SMOTE
smote = SMOTE()
X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)

# Evaluate model
lr_model = LogisticRegression(max_iter=1000, solver='liblinear')
lr_model.fit(X_train_balanced, y_train_balanced)
lr_predictions = lr_model.predict(X_test)
lr_report = classification_report(y_test, lr_predictions, zero_division=0, output_dict=True)
y_test_binarized = label_binarize(y_test, classes=list(set(y_train)))
lr_probabilities = lr_model.predict_proba(X_test)
lr_roc_auc = roc_auc_score(y_test_binarized, lr_probabilities, multi_class='ovr')
print("Classification Report:")
print(lr_report)
print(f"ROC AUC Score: {lr_roc_auc}")
accuracy = accuracy_score(y_test, lr_predictions)
print(f' Accuracy = {accuracy:.4f}')




Classification Report:
{'0.0': {'precision': 0.2961599158337717, 'recall': 0.4284627092846271, 'f1-score': 0.35023328149300154, 'support': 1314.0}, '1.0': {'precision': 0.33098591549295775, 'recall': 0.5576271186440678, 'f1-score': 0.4154040404040404, 'support': 590.0}, '2.0': {'precision': 0.17906976744186046, 'recall': 0.29806451612903223, 'f1-score': 0.22372881355932203, 'support': 775.0}, '3.0': {'precision': 0.16129032258064516, 'recall': 0.25100401606425704, 'f1-score': 0.19638648860958366, 'support': 498.0}, '4.0': {'precision': 0.0416141235813367, 'recall': 0.125, 'f1-score': 0.06244087038789026, 'support': 264.0}, '5.0': {'precision': 0.5785344683692205, 'recall': 0.46548482406661296, 'f1-score': 0.5158889633102627, 'support': 7446.0}, '6.0': {'precision': 0.11625874125874126, 'recall': 0.2633663366336634, 'f1-score': 0.16130988477865374, 'support': 505.0}, '7.0': {'precision': 0.7890441701519942, 'recall': 0.5906383628588882, 'f1-score': 0.6755753526354863, 'support': 13096.0