In [9]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import nltk
import math
from tqdm import tqdm
from collections import Counter
from nltk.corpus import stopwords
from nltk import WordNetLemmatizer
from nltk.stem import PorterStemmer,LancasterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from scipy.sparse import csr_matrix
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score



In [10]:
 #Preprocessing data
    # Load the plots and drop every row that has a missing value in any column.
    data = pd.read_csv('movie-plots-student.csv').dropna(axis=0)

    # Clean the plot text in a single pass instead of one .apply() per character:
    #   * newlines and underscores become a space,
    #   * slashes, carriage returns, punctuation, quotes, brackets and digits
    #     are deleted.
    # The result is identical to the former chain of single-character replaces
    # (none of them produced a character that a later step consumed).  The old
    # replace('/s', '') step is gone because it could never match: every '/'
    # had already been removed by the time it ran.
    chars_to_space = '\n_'
    chars_to_drop = '/\r.,:;!?-\'"()[]' + '0123456789'
    plot_cleanup_table = str.maketrans(
        chars_to_space, ' ' * len(chars_to_space), chars_to_drop
    )
    data["Plot"] = data["Plot"].apply(lambda plot: plot.translate(plot_cleanup_table))



In [11]:
    #preprocessing labels
    # A class id is the genre's position in `genres`
    # (comedy=0, drama=1, horror=2, action=3), so the list is the single
    # source of truth for both the encoding and the report labels.
    genres = ['comedy', 'drama', 'horror', 'action']
    genre_to_id = {genre: class_id for class_id, genre in enumerate(genres)}
    genre_ids = data["Genre"].map(genre_to_id)

    # Previously every unrecognised genre silently stayed at the zero
    # initialiser and was therefore trained as 'comedy'; fail loudly instead.
    unknown_genres = data.loc[genre_ids.isna(), "Genre"].unique().tolist()
    if unknown_genres:
        raise ValueError(f"unexpected genre value(s) in data: {unknown_genres}")

    labels = genre_ids.values.astype('int8')



In [15]:
    #Removing stop words
    new_text_data = []
    text_data = data["Plot"].values.tolist()
    wnl=WordNetLemmatizer()
    # Lower-cased, alphabetic-only, lemmatized tokens for every plot; used
    # only to find the most frequent words of the corpus.
    tokens_full = [
        [
            wnl.lemmatize(token.lower())
            for token in nltk.tokenize.word_tokenize(text.lower())
            if token.isalpha()
        ]
        for text in text_data
    ]

    #STOPWORDS
    # Count tokens straight from the nested lists; going through
    # np.concatenate built one large string array just to iterate over it.
    counter = Counter(token for doc_tokens in tokens_full for token in doc_tokens)
    word_freq = counter.most_common(250)
    my_stop_words = [word for word, _count in word_freq]
    my_stop_words += stopwords.words('english') + ['a', 'the']
    # Set membership is O(1); the list was scanned once per token before.
    stop_word_set = frozenset(my_stop_words)

    # NOTE(review): the frequency list above comes from lemmatized NLTK tokens,
    # while the filter below works on raw space-separated tokens, so inflected
    # forms of a frequent lemma are not removed -- confirm this is intended.
    for doc in tqdm(text_data):
        lowered_tokens = (token.lower() for token in doc.split(' '))
        token_i = [token for token in lowered_tokens if token not in stop_word_set]
        new_text = " ".join(token_i)
        new_text_data.append(new_text)
 

100%|██████████| 10716/10716 [00:14<00:00, 750.35it/s]


In [14]:
    # Vectorize text in documents in three different ways:
    #   binary - 1 if the term occurs in the document, else 0
    #   bow    - raw term counts
    #   tfidf  - tf-idf weighted counts
    # The binary variant uses CountVectorizer: TfidfVectorizer(binary=True)
    # only makes the tf term binary and still outputs tf-idf weights.
    # BernoulliNB thresholds its input at 0 by default, so its predictions do
    # not change with this fix.
    # NOTE(review): the vectorizers are fitted on all documents, i.e. before
    # the train/test split, so vocabulary and idf also see the test rows.
    vectorizers = {
        'binary': CountVectorizer(analyzer='word', binary=True),
        'bow': CountVectorizer(analyzer='word', binary=False),
        'tfidf': TfidfVectorizer(analyzer='word', binary=False),
    }
    X = {}
    for name, vectorizer in vectorizers.items():
        X[name] = vectorizer.fit_transform(new_text_data)
        # One matrix row per label is what the later split relies on.
        assert X[name].shape[0] == len(labels), f'{name}: label and data length do not match'

    assert len(labels) == len(data), 'label and data length do not match'


In [16]:
    # Train/Test split of the data and labels:
    # 80% of the row indices are drawn at random for training, the remaining
    # rows (in ascending order) form the test set.  A fixed seed makes the
    # split, and therefore every score below, reproducible across runs.
    SPLIT_SEED = 42
    split_rng = np.random.RandomState(SPLIT_SEED)
    n_docs = len(labels)

    train = split_rng.choice(n_docs, size=int(0.8*n_docs), replace=False)
    # Set difference replaces the former `i not in train` scan, which walked
    # the whole train array once for every candidate index.
    test = np.setdiff1d(np.arange(n_docs), train)
    assert len(train) + len(test) == n_docs, 'train and test must partition the data'

    train_y = labels[train]
    test_y = labels[test]

    # Same row selection for every vectorized representation held in X.
    vec_train_X = {name: matrix[train] for name, matrix in X.items()}
    vec_test_X = {name: matrix[test] for name, matrix in X.items()}


    # Note the type of vectorization:
    print(type(vec_train_X['binary']))
    print(type(vec_train_X['bow']))
    print(type(vec_train_X['tfidf']))



<class 'scipy.sparse.csr.csr_matrix'>
<class 'scipy.sparse.csr.csr_matrix'>
<class 'scipy.sparse.csr.csr_matrix'>


In [17]:
    # Train one Naive Bayes classifier per text representation and keep its
    # predictions for the held-out rows under the same key.
    models = {}
    models['binary'] = BernoulliNB()
    models['bow'] = MultinomialNB()
    models['tfidf'] = MultinomialNB()

    predictions = {}
    for name in models:
        model = models[name]
        # fit() returns the fitted estimator, so predict() can be chained.
        predictions[name] = model.fit(vec_train_X[name], train_y).predict(vec_test_X[name])



In [18]:
    # Generate confusion matrices:
    # Rows are predicted genres and columns are true genres (hence the .T).
    # Passing labels= pins the matrix to one row/column per entry of `genres`
    # even when a class is missing from this split; without it the matrix
    # shrinks and no longer fits the index/columns given to the DataFrame.
    class_ids = list(range(len(genres)))
    predicted_index = [f"predicted {genre}" for genre in genres]
    true_columns = [f"true {genre}" for genre in genres]

    cms = {}
    f1_scores = {}
    for name, predicted in predictions.items():
        cms[name] = pd.DataFrame(
            confusion_matrix(test_y, predicted, labels=class_ids).T,
            index=predicted_index,
            columns=true_columns,
        )
        f1_scores[name] = f1_score(test_y, predicted, average='macro')

    # Keep the individual names so cells that read them keep working.
    f1_binary = f1_scores['binary']
    f1_bow = f1_scores['bow']
    f1_tfidf = f1_scores['tfidf']

    # Report each representation: confusion matrix first, macro-F1 second.
    for name in ('binary', 'bow', 'tfidf'):
        print(cms[name])
        print(f1_scores[name])

                  true comedy  true drama  true horror  true action
predicted comedy          334          89           18           27
predicted drama           384         899           94          115
predicted horror            8          19          102            3
predicted action            4          11            0           37
0.5449413366596612
                  true comedy  true drama  true horror  true action
predicted comedy          514         241           24           36
predicted drama           170         705           31           42
predicted horror           13          18          153            7
predicted action           33          54            6           97
0.6649063421875117


In [19]:
# Report whichever representation actually reached the highest macro-F1
# instead of assuming it is always the bag-of-words model.
f1_by_name = {'binary': f1_binary, 'bow': f1_bow, 'tfidf': f1_tfidf}
best_name = max(f1_by_name, key=f1_by_name.get)
print("BEST ONE IS %s WITH F1 %f" % (best_name.upper(), f1_by_name[best_name]))

BEST ONE IS BOW WITH F1 0.664906
