# **Project**

# 1. Imports

In [40]:
import numpy as np
import pandas as pd
import nltk
import re
import string
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import StratifiedKFold
from sklearn import svm
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.metrics import classification_report, accuracy_score

nltk.download('stopwords')
nltk.download('vader_lexicon')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

# 2. Dataset load

In [41]:
# Add the dataset the sample_data folder (on the left side)

# Load the dataset
train = pd.read_csv('/content/sample_data/train.txt', sep='\t', names=['title', 'from', 'genre', 'director', 'plot'])

X = train['plot']
y = train['genre']
#x_test = test['title', 'form', 'director', 'plot']


# Inspect the first few rows
print(train.head())

                       title       from    genre       director  \
0               Ela Cheppanu     Telugu  romance         Ramana   
1  A Nightmare on Elm Street   American   horror   Samuel Bayer   
2            American Gothic   American   horror     John Hough   
3                       Gang  Bollywood    crime    Mazhar Khan   
4         Intimate Relations    British    drama  Charles Frank   

                                                plot  
0  Sekhar (Tarun) is a graduate from IIM and work...  
1  Kris Fowles (Katie Cassidy) goes to the Spring...  
2  Cynthia is traumatized by the death of her bab...  
3  Four friends, Gangu (Jackie Shroff), Abdul (Na...  
4  Crisis in a middle-class family when the son f...  


# 3. Pre-processing

In [42]:
# Pre-processing elements

stop = stopwords.words('english')
including = ['no', 'nor', 'not', 'but', 'against', 'only']
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

def apply_preprocessing(text):

  lowered = text.lower()
  # Remove everything that is not a word or inside parenthesis
  lowered_re = re.sub(r'\(.*?\)|[^a-zA-Z\s]', '', lowered)
  tokens = word_tokenize(lowered_re, "english")

  # Remove ponctuation
  for token in tokens:
        if(all(char in string.punctuation for char in token)):
            tokens.remove(token)

  filtered_tokens = [word for word in tokens if word not in stop]

  lemmatizer = WordNetLemmatizer()
  processed_tokens = [stemmer.stem(lemmatizer.lemmatize(token)) for token in filtered_tokens]

  return ' '.join(processed_tokens)


# Apply preprocessing
X = X.apply(apply_preprocessing)

# spliting the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.125, random_state=1)



print(X.head())

0    sekhar graduat iim work busi manag firm hydera...
1    kri fowl go springwood diner meet exboyfriend ...
2    cynthia traumat death babi leav bathtub accide...
3    four friend gangu abdul nihal gari form word g...
4    crisi middleclass famili son fall love father ...
Name: plot, dtype: object


# 4. Apply Naive Bayes

In [43]:
# Create a pipeline with custom tokenizer
nb_pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

# Parameters for grid search
parameters = {
    'vect__max_features': [1000, 3000, 5000, 10000, 15000, 20000],
    'tfidf__use_idf': [True, False],
    'clf__alpha': [0.1, 0.2, 0.5, 1.0],
}

# Stratified K-Fold cross-validation
skf = StratifiedKFold(n_splits=3)

# Perform Grid Search
grid_search = GridSearchCV(nb_pipeline, parameters, cv=skf)
grid_search.fit(X_train, y_train)

  _data = np.array(data, dtype=dtype, copy=copy,


{'clf__alpha': 0.1, 'tfidf__use_idf': False, 'vect__max_features': 5000}
Classification Report:
              precision    recall  f1-score   support

      action       0.60      0.56      0.58        95
   animation       0.91      0.70      0.79        57
      comedy       0.63      0.48      0.55       126
       crime       0.61      0.33      0.43        57
       drama       0.45      0.66      0.54       162
      horror       0.74      0.85      0.79       112
     romance       0.61      0.61      0.61       102
      sci-fi       0.73      0.28      0.40        29
     western       0.95      0.95      0.95        95

    accuracy                           0.64       835
   macro avg       0.69      0.60      0.63       835
weighted avg       0.66      0.64      0.64       835

Accuracy Score: 0.6407185628742516


# 5. Evaluation


In [44]:
# Get the best classifier and make predictions
best_classifier = grid_search.best_estimator_
y_test_pred = best_classifier.predict(X_test)

# Display best parameters
print(grid_search.best_params_)

# Classification report and accuracy score
print("Classification Report:")
print(classification_report(y_test, y_test_pred))
print("Accuracy Score:", accuracy_score(y_test, y_test_pred))

{'clf__alpha': 0.1, 'tfidf__use_idf': False, 'vect__max_features': 5000}
Classification Report:
              precision    recall  f1-score   support

      action       0.60      0.56      0.58        95
   animation       0.91      0.70      0.79        57
      comedy       0.63      0.48      0.55       126
       crime       0.61      0.33      0.43        57
       drama       0.45      0.66      0.54       162
      horror       0.74      0.85      0.79       112
     romance       0.61      0.61      0.61       102
      sci-fi       0.73      0.28      0.40        29
     western       0.95      0.95      0.95        95

    accuracy                           0.64       835
   macro avg       0.69      0.60      0.63       835
weighted avg       0.66      0.64      0.64       835

Accuracy Score: 0.6407185628742516
