# Naive Bayes Project
### University of Denver 
## Isabel Osgood

In [1]:
import pandas as pd 
import imdb
import numpy as np

from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import *
from sklearn.metrics import *

import re
import html
from nltk.corpus import stopwords
import string
from IPython.display import Audio
sound_file = "./sounds/CantinaBand60.wav"

### Gather Data 

In [2]:
ia = imdb.IMDb()

df = pd.DataFrame(columns = ['title', 'plot', 'genre'])


In [3]:
r_results = []
r_results += ia.get_top250_movies() # adding the top 250 cause surely some are romantic-comedies
r_results += ia.get_bottom100_movies() # honestly probably even more are in the bottom 100
# none of these have anywhere close to 1000 results, but just to make sure 
# we get them all if they're there
r_results += ia.get_keyword('romantic-comedy', results=1000)
r_results += ia.get_keyword('teen-movie', results=1000)
r_results += ia.get_keyword('falling-in-love', results=1000)
r_results += ia.get_keyword('chick-flick', results=1000)
r_results += ia.get_keyword('romantic-comedy-spoof', results=1000)
r_results += ia.get_keyword('wedding', results=1000)
r_results += ia.get_keyword('opposites-attract', results=1000)
r_results += ia.get_keyword('sexual-comedy', results=1000)
r_results += ia.get_keyword('love', results=1000)
r_results += ia.get_keyword('love-interest', results=1000)
r_results += ia.get_keyword('cliche', results=1000)


for r in r_results:
    movie = ia.get_movie(r.movieID)
    # Make sure no action accidentally fall in here
    genre = None
    if movie.data.get('genres') and 'Action' not in movie.data.get('genres') and 'Romance' in movie.data.get('genres') and 'Comedy' in movie.data.get('genres'):
        genre = 'romantic-comedy'
        
        plot = None
        if movie.get('plot'):
            plot = movie.get('plot')[0]
            
        df.loc[movie.movieID] = [movie.data.get('title'), plot, genre]

In [4]:
a_results = []
a_results += ia.get_top250_movies() # adding the top 250 cause surely some are romantic-comedies
a_results += ia.get_bottom100_movies() # honestly probably even more are in the bottom 100
# none of these have anywhere close to 1000 results, but just to make sure 
# we get them all if they're there
a_results += ia.get_keyword('action', results=1000)
a_results += ia.get_keyword('action-hero', results=2000)
a_results += ia.get_keyword('action-heroine', results=1000)
a_results += ia.get_keyword('good-versus-evil', results=1000)
a_results += ia.get_keyword('action-violence', results=1000)
a_results += ia.get_keyword('monster-movie', results=1000)
a_results += ia.get_keyword('violence', results=1000)
a_results += ia.get_keyword('superhero', results=1000)
a_results += ia.get_keyword('hero', results=1000)
a_results += ia.get_keyword('gunfight', results=1000)
a_results += ia.get_keyword('fight', results=1000)

for a in a_results:
    movie = ia.get_movie(a.movieID)
    # Make sure no romantic comedy accidentally fall in here
    genre = None
    if movie.data.get('genres') and 'Action' in movie.data.get('genres') and 'Romance' not in movie.data.get('genres') and 'Comedy' not in movie.data.get('genres'):
        genre = 'action'
        
        plot = None
        if movie.get('plot'):
            plot = movie.get('plot')[0]
            
        df.loc[movie.movieID] = [movie.data.get('title'), plot, genre]

In [5]:
#check to make sure it worked 
df.head()

Unnamed: 0,title,plot,genre
118799,Life Is Beautiful,When an open-minded Jewish librarian and his s...,romantic-comedy
27977,Modern Times,The Tramp struggles to live in modern industri...,romantic-comedy
21749,City Lights,"With the aid of a wealthy erratic tippler, a d...",romantic-comedy
45152,Singin' in the Rain,A silent film production company and cast make...,romantic-comedy
211915,Amélie,Amélie is an innocent and naive girl in Paris ...,romantic-comedy


#### Data preprocessing 

In [6]:
df_af = df.loc[df['genre'] == 'action']
df_rc = df.loc[df['genre'] == 'romantic-comedy']

print(len(df_af))
print(len(df_rc))
print(df.isna().sum())
df2 = df_af.append(df_rc)
df2 = df2.dropna()
# Shuffle
np.random.seed(22021)
df2 = df2.reindex(np.random.permutation(df2.index))

# print to show success

Audio(filename = sound_file, autoplay=True) #play a sound when finishes 


196
109
title    0
plot     0
genre    0
dtype: int64


In [8]:
def clean(text, stopwords):  
    # remove tags like <tab>
    text = re.sub(r'<[^<>]*>', ' ', text)
     # split text on whitespace
    text_list = text.split()
    text_words = []

    punctuation = set(string.punctuation)

 # keep #tags and @mentions
 ## punctuation.remove("#")
 ## punctuation.remove("@")
    for word in text_list:
 # remove punctuation marks at the beginning
 # of each word
         while len(word) > 0 and word[0] in punctuation:
             word = word[1:]

 # remove punctuation marks at the end of each word
         while len(word) > 0 and word[-1] in punctuation:
             word = word[:-1]

 # a rule to eliminate most urls
         if len(word) > 0 and "/" not in word:
 # eliminate stopwords
             if word.lower() not in stopwords:
 # append the word to the text_words list
                 text_words.append(word.lower())
         cleaner_text = " ".join(text_words)
    return cleaner_text

In [10]:
sw = set(stopwords.words("english"))

In [11]:
print(len(df2)) #original length

df3 = df2[df2["plot"].str.len() > 60] #drop under 60 words 

df3['plot'] = df3['plot'].apply(str)
print(len(df3)) #length after droping shorter than 60
df3['clean_plot'] = df3['plot'].apply(clean, stopwords=sw)
df3.dropna()
print(len(df3)) #length after dropping na after cleaning
df3.head()

305
300
300


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


Unnamed: 0,title,plot,genre,clean_plot
458339,Captain America: The First Avenger,"Steve Rogers, a rejected military soldier, tra...",action,steve rogers rejected military soldier transfo...
1570728,"Crazy, Stupid, Love.",A middle-aged husband's life changes dramatica...,romantic-comedy,middle-aged husband's life changes dramaticall...
118799,Life Is Beautiful,When an open-minded Jewish librarian and his s...,romantic-comedy,open-minded jewish librarian son become victim...
4294052,Attack on Titan Part 2,A young boy name Eren Jaeger has to use his gi...,action,young boy name eren jaeger use gift know myste...
7363104,Khuda Haafiz,A newly married couple's life falls in jeopard...,action,newly married couple's life falls jeopardy wif...


#### Split Data

In [12]:
X, y = df3['clean_plot'], df3['genre']

tfidf = TfidfVectorizer(ngram_range=(1,2), stop_words="english", min_df=10, max_features=None)
X_tfidf= tfidf.fit_transform(X)

X_train,X_test, y_train,y_test = train_test_split(X_tfidf, 
                                                  y,
                                                  test_size = 0.3, random_state=42)

#### Model Building 

In [23]:
#print(MultinomialNB().get_params())

param_grid = [{"alpha":[0.2, 0.4, 0.8, 1.0], 
              "fit_prior":[True, False]}]

grid = GridSearchCV(estimator=MultinomialNB() , param_grid =param_grid, cv=3, n_jobs = 10)
grid.fit(X_train, y_train)

Audio(filename = sound_file, autoplay=True)

#### Model Evaluation

In [24]:
#best estimator of the paramater grid 
print("Best estimator Parameters:")
print(grid.best_params_, "\n")
print("\n Training Accuracy:")
print(grid.score(X_train, y_train))
print("\n Testing Accuracy:")
print(grid.score(X_test, y_test))

Best estimator Parameters:
{'alpha': 0.8, 'fit_prior': True} 


 Training Accuracy:
0.8047619047619048

 Testing Accuracy:
0.8111111111111111
