In [1]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import sqlite3
import csv
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
#from wordcloud import WordCloud
import re
import os
from sqlalchemy import create_engine # database connection
import datetime as dt
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn import metrics
from sklearn.metrics import f1_score,precision_score,recall_score
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from skmultilearn.adapt import mlknn
from skmultilearn.problem_transform import ClassifierChain
from skmultilearn.problem_transform import BinaryRelevance
from skmultilearn.problem_transform import LabelPowerset
from sklearn.naive_bayes import GaussianNB
from datetime import datetime

In [2]:
data = pd.read_csv('mpst_full_data.csv')
data.head()

Unnamed: 0,imdb_id,title,plot_synopsis,tags,split,synopsis_source
0,tt0057603,I tre volti della paura,Note: this synopsis is for the orginal Italian...,"cult, horror, gothic, murder, atmospheric",train,imdb
1,tt1733125,Dungeons & Dragons: The Book of Vile Darkness,"Two thousand years ago, Nhagruul the Foul, a s...",violence,train,imdb
2,tt0033045,The Shop Around the Corner,"Matuschek's, a gift store in Budapest, is the ...",romantic,test,imdb
3,tt0113862,Mr. Holland's Opus,"Glenn Holland, not a morning person by anyone'...","inspiring, romantic, stupid, feel-good",train,imdb
4,tt0086250,Scarface,"In May 1980, a Cuban man named Tony Montana (A...","cruelty, murder, dramatic, cult, violence, atm...",val,imdb


In [5]:
# function for cleaning the plots of the movies
def clean_text(text):
    text = text.lower()
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "can not ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r"\'scuse", " excuse ", text)
    #text = re.sub('\W', ' ', text)
    #text = re.sub('\s+', ' ', text)
    text = text.strip(' ')
    return text

# function for text cleaning 
def cleaned(text):
    # remove backslash-apostrophe 
    text = re.sub("\'", "", text) 
    # remove everything except alphabets 
    text = re.sub("[^a-zA-Z]"," ",text) 
    # remove whitespaces 
    text = ' '.join(text.split()) 
    # convert text to lowercase 
    text = text.lower() 
    
    return text
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

# function to remove stopwords
def remove_stopwords(text):
    no_stopword_text = [w for w in text.split() if not w in stop_words]
    return ' '.join(no_stopword_text)



[nltk_data] Downloading package stopwords to
[nltk_data]     /home/ankan_rokr/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
data['cleaned'] = list(data['plot_synopsis'].apply(clean_text))

In [7]:
data['cleaned'] = data['cleaned'].apply(lambda x: remove_stopwords(x))

In [8]:
data['cleaned'] = list(data['cleaned'].apply(cleaned))

In [9]:
data.head()

Unnamed: 0,imdb_id,title,plot_synopsis,tags,split,synopsis_source,cleaned
0,tt0057603,I tre volti della paura,Note: this synopsis is for the orginal Italian...,"cult, horror, gothic, murder, atmospheric",train,imdb,note synopsis orginal italian release segments...
1,tt1733125,Dungeons & Dragons: The Book of Vile Darkness,"Two thousand years ago, Nhagruul the Foul, a s...",violence,train,imdb,two thousand years ago nhagruul foul sorcerer ...
2,tt0033045,The Shop Around the Corner,"Matuschek's, a gift store in Budapest, is the ...",romantic,test,imdb,matuschek gift store budapest workplace alfred...
3,tt0113862,Mr. Holland's Opus,"Glenn Holland, not a morning person by anyone'...","inspiring, romantic, stupid, feel-good",train,imdb,glenn holland morning person anyone standards ...
4,tt0086250,Scarface,"In May 1980, a Cuban man named Tony Montana (A...","cruelty, murder, dramatic, cult, violence, atm...",val,imdb,may cuban man named tony montana al pacino cla...


In [10]:
from sklearn.decomposition import LatentDirichletAllocation

cv = CountVectorizer(min_df = 8,
                     max_features = 100000,
                     analyzer = "word",
                     ngram_range = (1, 4),
                     stop_words = "english",
                     token_pattern = '[a-zA-Z]')

count_vectors = cv.fit_transform(data["cleaned"])



In [11]:
count_vectors

<14828x100000 sparse matrix of type '<class 'numpy.int64'>'
	with 46744622 stored elements in Compressed Sparse Row format>

In [None]:
from sklearn.model_selection import GridSearchCV


start = datetime.now()
params = {"n_components": [2, 5, 8, 10, 15, 20, 25, 30, 35, 40, 45, 50]
#           "max_iter": [10,20,30,40,50,60,70,80,90,100],
#           "learning_decay": [0.5,0.6,0.7,0.8,0.9,1]}
# lda_model = LatentDirichletAllocation(n_components = n_topics,                                       
#                                        learning_method = "online",
#                                        batch_size = 128,
#                                        evaluate_every = -1,
#                                        max_iter = 20,
#                                        random_state = 32,
#                                        n_jobs = -1)
lda_model = LatentDirichletAllocation( learning_method = "online",
                                       batch_size = 128,
                                       evaluate_every = -1,
                                       random_state = 32,
                                       n_jobs = -1)

model = GridSearchCV(lda_model, param_grid = params, n_jobs = -1)
model.fit(count_vectors)
          
print("Time taken to run this cell :", datetime.now() - start)

In [None]:
model.best_estimator_

In [12]:
start = datetime.now()

n_topics = 8
lda_model = LatentDirichletAllocation(n_components = n_topics, 
                                      learning_method = "online",
                                      batch_size = 128,
                                      evaluate_every = -1,
                                      max_iter = 20,
                                      random_state = 32,
                                      n_jobs = -1)

question_topics = lda_model.fit_transform(count_vectors)
temp = question_topics

          
print("Time taken to run this cell :", datetime.now() - start)

Time taken to run this cell : 1:47:39.621000


In [13]:
temp

array([[4.21510721e-01, 1.14340920e-01, 5.38535493e-03, ...,
        3.10273182e-02, 1.22439561e-01, 4.40270379e-02],
       [6.56418933e-01, 1.67538105e-01, 5.95324551e-02, ...,
        9.66894901e-03, 3.19041155e-05, 3.19080672e-05],
       [1.74585618e-01, 2.84758397e-02, 1.83389544e-02, ...,
        1.71019435e-05, 1.84483274e-01, 3.55468506e-01],
       ...,
       [9.06105700e-02, 6.31183345e-01, 5.09778096e-03, ...,
        2.37839601e-01, 4.16866247e-03, 3.10505165e-02],
       [2.61427630e-01, 1.51165386e-01, 4.24101554e-03, ...,
        5.72352252e-02, 4.32756316e-01, 3.10020292e-02],
       [1.19717811e-02, 5.08269490e-05, 1.43845035e-05, ...,
        1.43844343e-05, 1.33644461e-01, 4.28095898e-01]])

In [15]:
np.save('topic', temp)

In [16]:
len(temp)

14828

In [17]:
print("Log Likelihood: {} \nPerplexity: {}".format(lda_model.score(count_vectors), 
                                                   lda_model.perplexity(count_vectors)))

Log Likelihood: -1163686349.1415052 
Perplexity: 2651.648460507434


In [19]:
!pip3 install tsne

Collecting tsne
  Downloading https://files.pythonhosted.org/packages/cc/57/87d99c7c3da6e25dc7a34b7d305cf5a4f5850b78edd2a5e12de2254a1155/tsne-0.1.8.tar.gz
    Complete output from command python setup.py egg_info:
    Traceback (most recent call last):
      File "<string>", line 1, in <module>
      File "/tmp/pip-build-n_hg1_eb/tsne/setup.py", line 18, in <module>
        from Cython.Distutils import build_ext
    ImportError: No module named 'Cython'
    
    ----------------------------------------
[31mCommand "python setup.py egg_info" failed with error code 1 in /tmp/pip-build-n_hg1_eb/tsne/[0m


In [24]:
train_data =  data['split']=='train'
train = data[train_data]

   
    
test_data =  data['split']=='test'
test = data[test_data]

  
    
validation_data =  data['split']=='val'
val = data[validation_data]


In [61]:
cv_train = CountVectorizer(min_df = 8,
                     max_features = 100000,
                     analyzer = "word",
                     ngram_range = (1, 4),
                     stop_words = "english",
                     token_pattern = '[a-zA-Z]')

count_vectors_train = cv_train.fit_transform(train["cleaned"])



In [62]:
count_vectors_train 

<9489x99400 sparse matrix of type '<class 'numpy.int64'>'
	with 29729531 stored elements in Compressed Sparse Row format>

In [63]:
cv_test = CountVectorizer(min_df = 8,
                     max_features = 100000,
                     analyzer = "word",
                     ngram_range = (1, 4),
                     stop_words = "english",
                     token_pattern = '[a-zA-Z]')

count_vectors_test = cv_test.transform(test["cleaned"])

NotFittedError: CountVectorizer - Vocabulary wasn't fitted.

In [None]:
count_vectors_test

In [29]:
start = datetime.now()

n_topics = 8
lda_model = LatentDirichletAllocation(n_components = n_topics, 
                                      learning_method = "online",
                                      batch_size = 128,
                                      evaluate_every = -1,
                                      max_iter = 20,
                                      random_state = 32,
                                      n_jobs = -1)

question_topics_train = lda_model.fit_transform(count_vectors_train)

          
print("Time taken to run this cell :", datetime.now() - start)

Time taken to run this cell : 1:11:43.103894


In [33]:
start = datetime.now()

n_topics = 8
lda_model = LatentDirichletAllocation(n_components = n_topics, 
                                      learning_method = "online",
                                      batch_size = 128,
                                      evaluate_every = -1,
                                      max_iter = 20,
                                      random_state = 32,
                                      n_jobs = -1)

question_topics_test = lda_model.fit_transform(count_vectors_test)

          
print("Time taken to run this cell :", datetime.now() - start)

Time taken to run this cell : 0:19:58.543799


In [34]:
np.save('topic', question_topics_test)

In [35]:
np.save('topic_train', question_topics_train)

In [36]:
vectorizer_tags = CountVectorizer(tokenizer = lambda x: x.split(','), binary='true')
y_train = vectorizer_tags.fit_transform(train['tags'])
y_test = vectorizer_tags.transform(test['tags'])

In [58]:
question_topics_train = np.load('topic.npy')

In [64]:
features_train = np.array(cv_train.get_feature_names())

In [None]:
!pip3 install mglear

In [65]:
import mglearn
mglearn.tools.print_topics(topics=range(5), feature_names=features_train,
sorting=sorting, topics_per_chunk=5, n_words=10)

ImportError: No module named 'mglearn'

In [57]:
idx = np.amax(question_topics_train, axis = 1)<0.5
train = temp[idx]

IndexError: boolean index did not match indexed array along dimension 0; dimension is 14828 but corresponding boolean dimension is 2966

In [52]:
idx

array([ True, False,  True, ..., False,  True,  True])

In [53]:
new_insincere2 = train[["tags", "plot_synopsis"]].copy()
new_insincere2 = new_insincere2[idx]

ValueError: Item wrong length 14828 instead of 9489.

In [42]:
question_topics = np.matrix(question_topics)
doc_topics = question_topics/question_topics.sum(axis = 1)

lda_keys = []
for i, tweet in enumerate(new_insincere2["plot_synopsis"]):
    lda_keys += [doc_topics[i].argmax()]
    
tsne_lda_df3 = pd.DataFrame(tsne_lda3, columns = ["x", "y"])
tsne_lda_df3["tags"] = new_insincere2["tags"].values
tsne_lda_df3["plot"] = new_insincere2["plot_synopsis"].values
# tsne_lda_df3["topics"] = lda_keys
# tsne_lda_df3["topics"] = tsne_lda_df2["topics"].map(int)

NameError: name 'new_insincere2' is not defined

In [41]:
start = datetime.now()
classifier = OneVsRestClassifier(SGDClassifier(loss='log', alpha=0.01, penalty='l2', class_weight="balanced"), n_jobs=-1)
classifier.fit(question_topics_train,y_train)
predictions = classifier.predict (question_topics_test)

print("Accuracy :",metrics.accuracy_score(y_test, predictions))
print("Hamming loss ",metrics.hamming_loss(y_test,predictions))


precision = precision_score(y_test, predictions, average='micro')
recall = recall_score(y_test, predictions, average='micro')
f1 = f1_score(y_test, predictions, average='micro')
 
print("Micro-average quality numbers")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))

precision = precision_score(y_test, predictions, average='macro')
recall = recall_score(y_test, predictions, average='macro')
f1 = f1_score(y_test, predictions, average='macro')
 
print("Macro-average quality numbers")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))

print (metrics.classification_report(y_test, predictions))
print("Time taken to run this cell :", datetime.now() - start)

ValueError: Found input variables with inconsistent numbers of samples: [1586, 9489]