### Movie Genre Prediction
This project involves building a text classification model to predict movie genres from plot summaries. By leveraging NLP techniques, the text data is preprocessed and transformed into a format suitable for machine learning algorithms.

## Data Loading

In [26]:
!pip install imbalanced-learn


Collecting imbalanced-learn
  Downloading imbalanced_learn-0.12.2-py3-none-any.whl.metadata (8.2 kB)
Downloading imbalanced_learn-0.12.2-py3-none-any.whl (257 kB)
   ---------------------------------------- 0.0/258.0 kB ? eta -:--:--
   - -------------------------------------- 10.2/258.0 kB ? eta -:--:--
   ---- ---------------------------------- 30.7/258.0 kB 435.7 kB/s eta 0:00:01
   ------------- ------------------------- 92.2/258.0 kB 871.5 kB/s eta 0:00:01
   ------------------------------- -------- 204.8/258.0 kB 1.4 MB/s eta 0:00:01
   ---------------------------------------- 258.0/258.0 kB 1.6 MB/s eta 0:00:00
Installing collected packages: imbalanced-learn
Successfully installed imbalanced-learn-0.12.2


In [27]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE

In [4]:
# Download necessary NLTK data
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\youse\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\youse\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\youse\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [5]:
# Load train and test datasets
train_data = pd.read_parquet(r"C:\Users\youse\Desktop\CodeClause\archive (5)\train-00000-of-00001-b943ea66e0040b18.parquet")
test_data = pd.read_parquet(r"C:\Users\youse\Desktop\CodeClause\archive (5)\test-00000-of-00001-35e9a9274361daed.parquet")

In [6]:
# Display the first few rows of the datasets
train_data.head()

Unnamed: 0,id,movie_name,synopsis,genre
0,44978,Super Me,A young scriptwriter starts bringing valuable ...,fantasy
1,50185,Entity Project,A director and her friends renting a haunted h...,horror
2,34131,Behavioral Family Therapy for Serious Psychiat...,This is an educational video for families and ...,family
3,78522,Blood Glacier,Scientists working in the Austrian Alps discov...,scifi
4,2206,Apat na anino,Buy Day - Four Men Widely - Apart in Life - By...,action


In [7]:
# Display the first few rows of the datasets
test_data.head()

Unnamed: 0,id,movie_name,synopsis,genre
0,16863,A Death Sentence,"12 y.o. Ida's dad'll die without a DKK1,500,00...",action
1,48456,Intermedio,A group of four teenage friends become trapped...,action
2,41383,30 Chua Phai Tet,A guy left his home for 12 years till he came ...,action
3,84007,Paranoiac,A man long believed dead returns to the family...,action
4,40269,Ordinary Happiness,"After a deadly accident, Paolo comes back on E...",action


In [8]:
# Display the columns of the datasets
print(train_data.columns)
print(test_data.columns)

Index(['id', 'movie_name', 'synopsis', 'genre'], dtype='object')
Index(['id', 'movie_name', 'synopsis', 'genre'], dtype='object')


In [9]:
# Check for null values in train_data
null_train = train_data.isnull().sum()
print("Null values in train_data:")
print(null_train)

Null values in train_data:
id            0
movie_name    0
synopsis      0
genre         0
dtype: int64


In [10]:
# Check for null values in test_data
null_test = test_data.isnull().sum()
print("\nNull values in test_data:")
print(null_test)


Null values in test_data:
id            0
movie_name    0
synopsis      0
genre         0
dtype: int64


In [11]:
# Check for duplicates in train_data
duplicate_train = train_data.duplicated().sum()
print("\nDuplicates in train_data:", duplicate_train)



Duplicates in train_data: 0


In [12]:
# Check for duplicates in test_data
duplicate_test = test_data.duplicated().sum()
print("Duplicates in test_data:", duplicate_test)

Duplicates in test_data: 0


##  Text Preprocessing

In [16]:
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

# Text Preprocessing
def preprocess_text(text):
    if isinstance(text, str):
        # Lowercase conversion
        text = text.lower()
        # Remove punctuation
        text = text.translate(str.maketrans('', '', string.punctuation))
        # Tokenization
        tokens = word_tokenize(text)
        # Remove stopwords
        tokens = [word for word in tokens if word not in stopwords.words('english')]
        # Lemmatization
        lemmatizer = WordNetLemmatizer()
        tokens = [lemmatizer.lemmatize(word) for word in tokens]
        return ' '.join(tokens)
    else:
        return ''
    
# Load train and test datasets
train_data = pd.read_parquet(r"C:\Users\youse\Desktop\CodeClause\archive (5)\train-00000-of-00001-b943ea66e0040b18.parquet")
test_data = pd.read_parquet(r"C:\Users\youse\Desktop\CodeClause\archive (5)\test-00000-of-00001-35e9a9274361daed.parquet")

# Apply preprocessing to synopsis text in train and test datasets
train_data['cleaned_synopsis'] = train_data['synopsis'].apply(preprocess_text)
test_data['cleaned_synopsis'] = test_data['synopsis'].apply(preprocess_text)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\youse\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\youse\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\youse\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [28]:
# Vectorization
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(train_data['cleaned_synopsis'])
y = train_data['genre']

# Check class distribution
print(train_data['genre'].value_counts())

# Use SMOTE to handle class imbalance
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

genre
fantasy      5400
horror       5400
family       5400
scifi        5400
action       5400
crime        5400
adventure    5400
mystery      5400
romance      5400
thriller     5400
Name: count, dtype: int64


## Feature Engineering

In [29]:
# Split data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

## Model Building

In [30]:
# Train the model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

## Model Evaluation

In [31]:
# Predictions and evaluation
y_pred = model.predict(X_val)
print('Accuracy:', accuracy_score(y_val, y_pred))
print('Classification Report:\n', classification_report(y_val, y_pred, zero_division=1))
print('Confusion Matrix:\n', confusion_matrix(y_val, y_pred))

Accuracy: 0.34879629629629627
Classification Report:
               precision    recall  f1-score   support

      action       0.28      0.25      0.26      1094
   adventure       0.27      0.23      0.25      1067
       crime       0.37      0.39      0.38      1134
      family       0.39      0.45      0.42      1049
     fantasy       0.30      0.28      0.29      1057
      horror       0.40      0.42      0.41      1116
     mystery       0.29      0.28      0.29      1074
     romance       0.48      0.58      0.52      1075
       scifi       0.39      0.44      0.41      1077
    thriller       0.21      0.16      0.19      1057

    accuracy                           0.35     10800
   macro avg       0.34      0.35      0.34     10800
weighted avg       0.34      0.35      0.34     10800

Confusion Matrix:
 [[271 139 198  73  69  51  50  59 106  78]
 [127 250  57 154 133  40  42  76 142  46]
 [156  47 437  47  21  39 150  92  22 123]
 [ 48 103  37 471 124  21  39 130  52  

In [32]:
# Evaluate on test data
X_test = vectorizer.transform(test_data['cleaned_synopsis'])
y_test = test_data['genre']  # Assuming you have the actual genres for the test set
y_test_pred = model.predict(X_test)

print('Test Accuracy:', accuracy_score(y_test, y_test_pred))
print('Test Classification Report:\n', classification_report(y_test, y_test_pred, zero_division=1))
print('Test Confusion Matrix:\n', confusion_matrix(y_test, y_test_pred))

Test Accuracy: 0.0885
Test Classification Report:
               precision    recall  f1-score   support

      action       1.00      0.09      0.16     36000
   adventure       0.00      1.00      0.00         0
       crime       0.00      1.00      0.00         0
      family       0.00      1.00      0.00         0
     fantasy       0.00      1.00      0.00         0
      horror       0.00      1.00      0.00         0
     mystery       0.00      1.00      0.00         0
     romance       0.00      1.00      0.00         0
       scifi       0.00      1.00      0.00         0
    thriller       0.00      1.00      0.00         0

    accuracy                           0.09     36000
   macro avg       0.10      0.91      0.02     36000
weighted avg       1.00      0.09      0.16     36000

Test Confusion Matrix:
 [[3186 3144 3762 4101 3098 4002 3537 4400 4065 2705]
 [   0    0    0    0    0    0    0    0    0    0]
 [   0    0    0    0    0    0    0    0    0    0]
 [   0 