In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
import nltk 
import pickle
import swifter 
%matplotlib inline
import gc

import torch
from torch import nn
from torch.autograd import Variable
from torch.utils.data import DataLoader
import torch.nn.functional as F
import time 
import scipy

torch.manual_seed(42)

<torch._C.Generator at 0x7f116fde5ad0>

In [53]:
### CONFIG ###
# WORDS
valid_tags = ['JJ','JJR','JJS','NN','NNS','NNP','NNPS','RB','RBR','RBS','VB','VBD','VBG','VBN','VBP','VBZ','WP','WP$','WRB']

data = pd.read_csv("wiki_movie_plots_deduped.csv")
data = pd.concat([
    data[data['Genre'].isin(['action', 'thriller', 'romance', 'war', 'horror', 'comedy', 'adventure', 'crime', 'sport'])]
    #data[data['Genre'].isin(['comedy'])].head(1000)
])
print(data.shape)
data = data[['Genre', 'Plot']]
data = data[data['Genre'] != 'unknown']
data['Genre'] = data['Genre'].map(lambda x: str(x).lower().strip().split(','))
unique_genres = np.unique([y.strip() for x in data['Genre'].values.tolist() for y in x if y != '' and y != ' '])

def get_genre_array(x, unique_genres):
    cur_array = np.zeros(len(unique_genres))
    for cur_genre in x:
        cur_genre = cur_genre.strip()
        if cur_genre != '' and cur_genre != ' ':
            cur_array[np.where(unique_genres==cur_genre)[0][0]]=1
    return cur_array
data['genre_array'] = data['Genre'].map(lambda x: get_genre_array(x, unique_genres))

data['Plot'] = data['Plot'].map(lambda x: str(x).lower())
print(data.shape)

def refine_tokens(cur_plot, valid_tags, stopwords):
    sentences = cur_plot.split('.')
    filtered_sentences = []
    for cur_sentence in sentences:
        words_list = nltk.word_tokenize(cur_sentence)
        words_list = [w for w in words_list if w not in stopwords.words()] 
        words_list = nltk.pos_tag(words_list)
        words_list = [x[0] for x in words_list if x[1] in valid_tags]
        filtered_sentences.append(' '.join(words_list))
    return ' '.join(filtered_sentences)
print("Data Created")

(9375, 8)
(9375, 3)
Data Created


In [54]:
########################
## DATA 1
########################

%time data['plot_array'] = data.swifter.apply(lambda x: refine_tokens(x['Plot'], valid_tags, stopwords), axis=1)

print("Write the data")
pickle.dump(data, open("processed_data.pickle", "wb"))

vectorizer = CountVectorizer(max_features=10000, ngram_range=(1,3), max_df=50, min_df=1)
data['plot_array'] = vectorizer.fit_transform(data['plot_array'].values.tolist()).todense().tolist()
temp_arr = np.array(data['plot_array'].values.tolist())
temp_arr[temp_arr > 1] = 1
data['plot_array'] = temp_arr.tolist()

print("Write the data")
pickle.dump(data, open("processed_data.pickle", "wb"))
pickle.dump(vectorizer, open("vectorizer.pickle", "wb"))

del(temp_arr)
gc.collect()

print("Features prepared")

Dask Apply:   0%|          | 0/8 [00:00<?, ?it/s]

CPU times: user 4min 57s, sys: 22.7 s, total: 5min 19s
Wall time: 1h 9min 4s
Write the data
Write the data
Features prepared


In [38]:
# RESTART
data = pickle.load(open("processed_data.pickle", "rb"))

In [58]:
'''
########################
## DATA 2
########################
def found(search_word, cur_list):
    for cur_word in cur_list:
        if search_word in cur_word:
            return 1
    return 0

data = pickle.load(open("processed_data.pickle", "rb"))
data['is_action_genre'] = data['Genre'].map(lambda x: found('action', x))
print(data['is_action_genre'].value_counts())
gc.collect()
'''


['comedy',
 'comedy',
 'comedy',
 'comedy',
 'comedy',
 'horror',
 'comedy',
 'comedy',
 'comedy',
 'comedy',
 'horror',
 'crime',
 'comedy',
 'comedy',
 'comedy',
 'comedy',
 'comedy',
 'comedy',
 'comedy',
 'comedy',
 'comedy',
 'comedy',
 'comedy',
 'comedy',
 'comedy',
 'comedy',
 'comedy',
 'comedy',
 'comedy',
 'comedy',
 'comedy',
 'comedy',
 'comedy',
 'comedy',
 'comedy',
 'comedy',
 'comedy',
 'comedy',
 'comedy',
 'comedy',
 'comedy',
 'romance',
 'comedy',
 'comedy',
 'comedy',
 'comedy',
 'comedy',
 'comedy',
 'comedy',
 'romance',
 'comedy',
 'comedy',
 'comedy',
 'comedy',
 'comedy',
 'comedy',
 'comedy',
 'comedy',
 'comedy',
 'comedy',
 'comedy',
 'comedy',
 'comedy',
 'comedy',
 'comedy',
 'comedy',
 'comedy',
 'comedy',
 'comedy',
 'comedy',
 'comedy',
 'romance',
 'comedy',
 'comedy',
 'comedy',
 'comedy',
 'comedy',
 'horror',
 'comedy',
 'comedy',
 'romance',
 'comedy',
 'comedy',
 'comedy',
 'comedy',
 'crime',
 'comedy',
 'romance',
 'comedy',
 'comedy',
 'comed

In [63]:
# Model Architecture

class classifier_model(nn.Module):
    def __init__(self, label_count, initial_features):
        super(classifier_model, self).__init__()
        self.l1 = nn.Linear(initial_features, 512)
        self.l2 = nn.Linear(512, 256)
        self.l3 = nn.Linear(256, 128)
        self.l4 = nn.Linear(128, 64)
        self.l5 = nn.Linear(64, label_count)
        self.b1 = nn.BatchNorm1d(512)
        self.b2 = nn.BatchNorm1d(256)
        self.b3 = nn.BatchNorm1d(128)
        self.b4 = nn.BatchNorm1d(64)
        self.b5 = nn.BatchNorm1d(label_count)
        #self.sigm = nn.Sigmoid()
        self.smax = nn.Softmax()
        
    def forward(self, x):
        self.l1d = self.l1(x)
        self.b1d = self.b1(self.l1d)
        self.l2d = self.l2(F.leaky_relu(self.b1d))
        self.b2d = self.b2(self.l2d)
        self.l3d = self.l3(F.leaky_relu(self.b2d))
        self.b3d = self.b3(self.l3d)
        self.l4d = self.l4(F.leaky_relu(self.b3d))
        self.b4d = self.b4(self.l4d)
        self.l5d = self.l5(F.leaky_relu(self.b4d))
        self.b5d = self.b5(self.l5d)
        self.smaxd = self.smax(self.b5d)
        return self.smaxd
    
print("Model created")

Model created


In [64]:
#######################
# MODEL
#######################

# Config    
num_epochs = 2000
learning_rate = 1e-3
batch_size = 16
label_count = 8
model = classifier_model(label_count, 10000).cuda()
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

data_input = Variable(torch.from_numpy(np.array(data['plot_array'].values.tolist()).astype(np.float32))).cuda()
#data_output = Variable(torch.from_numpy(np.array(data['is_action_genre'].values.tolist()).astype(np.float32))).cuda()
data_output = Variable(torch.from_numpy(np.array(data['genre_array'].values.tolist()).astype(np.float32))).cuda()
batches = int(data_input.size()[0]/batch_size)

print("Model object created and config also created")

Model object created and config also created


In [66]:
learning_rate = 1e-3
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
start_time = time.time()
for epoch in range(100000):
    for cur_batch in range(batches):
        model.zero_grad()
        model_output = model(data_input[batch_size*cur_batch : batch_size * (cur_batch+1)]).view(batch_size, label_count)
        loss = criterion(model_output, data_output[batch_size*cur_batch : batch_size * (cur_batch+1)])
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    if epoch % 10 == 0:
        print('epoch [{}/{}], loss:{:.4f} time_taken: {}'.format(epoch, num_epochs, loss, time.time() - start_time))
    if epoch % 100 == 0:
        print("Writing the mode at epoch {0}".format(epoch))
        torch.save(model, "self_model_action_noaction")
torch.save(model, "self_model_action_noaction")

  self.smaxd = self.smax(self.b5d)


epoch [0/2000], loss:0.2908 time_taken: 5.230108022689819
Writing the mode at epoch 0
epoch [10/2000], loss:0.0045 time_taken: 31.350057363510132
epoch [20/2000], loss:0.0006 time_taken: 56.38586926460266
epoch [30/2000], loss:0.0003 time_taken: 81.49875998497009
epoch [40/2000], loss:0.0001 time_taken: 106.5484082698822
epoch [50/2000], loss:0.0027 time_taken: 131.61134839057922
epoch [60/2000], loss:0.0002 time_taken: 156.69063663482666
epoch [70/2000], loss:0.0001 time_taken: 181.82100105285645
epoch [80/2000], loss:0.0000 time_taken: 206.96115589141846
epoch [90/2000], loss:0.0000 time_taken: 232.20207691192627
epoch [100/2000], loss:0.0000 time_taken: 259.6808650493622
Writing the mode at epoch 100


KeyboardInterrupt: 

In [96]:
# RESTART
model = torch.load("self_model_action_noaction")

# Analysis of concepts within the triggers
data_input = Variable(torch.from_numpy(np.array(data['plot_array'].values.tolist()).astype(np.float32))).cuda()
model_output = model(data_input)
temp_data = np.concatenate(
    (model.l1d.cpu().detach().numpy(),
     model.b1d.cpu().detach().numpy(),
     model.l2d.cpu().detach().numpy(),
     model.b2d.cpu().detach().numpy(),
     model.l3d.cpu().detach().numpy(),
     model.b3d.cpu().detach().numpy(),
     model.l4d.cpu().detach().numpy(),
     model.b4d.cpu().detach().numpy(),
     model.l5d.cpu().detach().numpy(),
     model.b5d.cpu().detach().numpy()
    ), axis=1
)

temp_data = pd.DataFrame(temp_data)
temp_data = temp_data.fillna(0)
temp_data = round(temp_data, 2)
temp_data.to_csv("activation_data_20211115.csv", index=False)

# Same output, earlier layers
temp_data = np.concatenate(
    (model.l1d.cpu().detach().numpy(),
     model.b1d.cpu().detach().numpy(),
     model.l2d.cpu().detach().numpy(),
     model.b2d.cpu().detach().numpy(),
     model.l3d.cpu().detach().numpy(),
     model.b3d.cpu().detach().numpy()
    ), axis=1
)

temp_data = pd.DataFrame(temp_data)
temp_data = temp_data.fillna(0)
temp_data = round(temp_data, 2)
temp_data.to_csv("activation_data_20211115_earlier_layers.csv", index=False)

  self.smaxd = self.smax(self.b5d)


In [87]:
# Simple Kmeans Clustering
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=20, random_state=42).fit(temp_data.drop(columns=[x for x in temp_data.columns.values if 'cluster' in str(x)]).values)
temp_data['cluster_kmeans'] = kmeans.labels_
print(temp_data['cluster_kmeans'].value_counts())

0     1702
17    1276
8     1093
3      564
18     520
11     485
15     475
2      467
4      460
9      459
14     297
12     260
16     212
13     189
19     188
5      185
7      185
6      176
1       96
10      86
Name: cluster_kmeans, dtype: int64


In [98]:
# We might have to do some mixing and matching not all columns will be useful.  For a group of columns we will try to find the combinations


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1783,1784,1785,1786,1787,1788,1789,1790,1791,cluster_dbscan
0,-0.29,0.03,0.13,0.49,0.13,0.17,-0.06,-0.16,-0.06,-0.44,...,-0.47,0.80,-0.25,-0.25,-0.01,0.15,0.18,-0.38,0.10,-1
1,0.56,-0.53,-0.71,0.09,0.05,-0.56,1.23,0.06,0.20,-0.30,...,-0.11,-0.25,-0.19,-0.59,0.27,0.29,-0.01,0.26,-0.37,-1
2,0.18,-0.23,0.11,0.14,0.32,0.99,0.59,0.12,-0.46,0.41,...,-0.24,0.80,-0.31,-0.58,0.10,0.22,-0.47,0.10,-0.33,-1
3,-0.03,-0.48,0.09,0.04,0.16,0.11,0.08,-0.05,-0.45,0.12,...,-0.29,0.77,-0.16,-0.15,0.08,0.14,-0.12,-0.51,0.24,-1
4,1.11,-0.78,0.63,0.04,-0.30,0.28,0.93,0.12,0.02,-0.10,...,0.01,0.14,-0.23,-0.25,0.01,0.37,0.78,0.21,0.03,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9370,-0.96,0.21,0.14,-0.02,-0.21,-0.32,0.06,-0.45,-0.09,-0.38,...,0.23,-0.44,0.52,0.54,0.71,-0.26,-0.22,0.25,0.62,-1
9371,-1.04,0.25,0.08,-0.02,0.05,-0.00,-0.29,0.08,-0.07,-0.37,...,0.44,0.64,-0.07,-0.30,0.17,0.40,-0.71,-0.32,0.25,-1
9372,-0.30,-0.98,-0.37,0.23,-0.32,-0.05,-0.38,-0.15,0.09,-0.58,...,0.54,-0.48,0.28,0.41,0.46,-0.06,0.54,-0.11,0.72,-1
9373,-0.32,-0.19,0.22,0.14,-0.04,-0.32,0.33,0.09,-0.36,0.14,...,-0.63,0.54,-0.04,-0.22,0.34,-0.01,-0.15,0.10,0.46,-1


In [100]:
from sklearn.cluster import DBSCAN
def perform_dbscan(temp_data, cluster_name)
    clustering = DBSCAN(eps=1, min_samples=10).fit(temp_data.drop(columns=[x for x in temp_data.columns.values if 'cluster' in str(x)]).values)
    temp_data[cluster_name] = clustering.labels_
    return temp_data

-1    9295
 0      80
Name: cluster_dbscan, dtype: int64


In [95]:
temp_data[temp_data['cluster_dbscan']==63]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1784,1785,1786,1787,1788,1789,1790,1791,cluster_kmeans,cluster_dbscan
8768,-0.73,0.5,-0.11,-0.2,0.15,0.18,0.13,-0.27,0.11,-0.24,...,0.63,0.15,0.49,0.25,0.17,-0.18,0.07,-0.8,0,63
9152,-0.73,0.5,-0.11,-0.2,0.15,0.18,0.13,-0.27,0.11,-0.24,...,0.63,0.15,0.49,0.25,0.17,-0.18,0.07,-0.8,0,63
