In [150]:
#!pip install contextualized-topic-models
#!pip install pyldavis
from transformers import pipeline
from os import listdir
import pandas as pd
from tqdm import tqdm
from sklearn.metrics import f1_score
from sklearn.preprocessing import MinMaxScaler, StandardScaler, Normalizer
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import numpy as np
from contextualized_topic_models.models.ctm import ZeroShotTM, CombinedTM
from contextualized_topic_models.utils.data_preparation import TopicModelDataPreparation
from contextualized_topic_models.utils.preprocessing import WhiteSpacePreprocessing
import nltk
import string

In [80]:
path_screenplays_scenes='coref/csi-corpus/screenplay_summarization/scene_level_n_aspects'
eps=[]


for ep in listdir(path_screenplays_scenes):
    annotated_scenes=pd.read_csv(path_screenplays_scenes+'/'+ep)
    eps.append(ep.split('.csv')[0])
    
    
print(eps)

['s05e10', 's03e03', 's01e19', 's01e23', 's02e15', 's02e01', 's05e22', 's02e09', 's05e21', 's04e23', 's04e09', 's05e12', 's03e12', 's05e03', 's04e22', 's03e21', 's02e04', 's03e19', 's04e14', 's04e06', 's01e07', 's04e21', 's05e17', 's03e08', 's04e10', 's04e05', 's05e05', 's05e06', 's01e20', 's04e15', 's02e06', 's01e08', 's04e12', 's05e08', 's03e05', 's02e10', 's05e13', 's03e11', 's01e13']


In [14]:
nltk.download('stopwords')
tp = TopicModelDataPreparation("paraphrase-distilroberta-base-v1")


[nltk_data] Downloading package stopwords to /home/reboud/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [116]:
documents=[]
for ep in eps:
    annotated_scenes=pd.read_csv(path_screenplays_scenes+'/'+ep+'.csv')
    
    # This paragraph separates dialogue and non dialogue
    dialogue_matches=annotated_scenes.scene_text.str.extractall(r"(?P<dialogue_matches>\[\[.*?\"\,)")
    annotated_scenes['dialogue']=annotated_scenes.scene_text.str.extract(r"(?P<dialogue>\[\[.*?\"\,)")
    annotated_scenes['non_dialogue']=annotated_scenes.scene_text.str.replace(r"(?P<non_dialogue>\[\[.*?\"\,)",'')
    annotated_scenes['dialogue']=annotated_scenes.dialogue.astype(str).str.replace('[^\w\s]','')
    annotated_scenes.non_dialogue = annotated_scenes.non_dialogue.str.replace('"', '')
    dialogue_matches.dialogue=dialogue_matches.dialogue_matches.groupby(level=0).apply(list)
    annotated_scenes.loc[annotated_scenes['dialogue'].notna(), 'dialogue'] = dialogue_matches.dialogue
    annotated_scenes.loc[annotated_scenes['dialogue'].isna(), 'dialogue'] = 'no dialogue'
    annotated_scenes.loc[annotated_scenes['non_dialogue'].isna(), 'non_dialogue'] = 'only dialogue'
    sentences_list=annotated_scenes['scene_text'].tolist()
    #print(sentences_list)
    episodes=''.join(sentences_list)
    episodes=episodes.translate(str.maketrans('','',string.punctuation))
    episodes=episodes.replace("  ", " ")
    episodes=episodes.replace("   ", " ")
    episodes=episodes.replace("  ", " ")
    #print(episodes)
    #print(annotated_scenes[])
    documents.append(episodes)
    #documents.extend(annotated_scenes['non_dialogue'])
    
    

    


  annotated_scenes['dialogue']=annotated_scenes.dialogue.astype(str).str.replace('[^\w\s]','')
  # This is added back by InteractiveShellApp.init_path()


In [156]:

sp = WhiteSpacePreprocessing(documents, stopwords_language='english')
preprocessed_documents, unpreprocessed_corpus, vocab = sp.preprocess()
#print(preprocessed_documents[:2])
training_dataset = tp.fit(text_for_contextual=unpreprocessed_corpus, text_for_bow=preprocessed_documents)
ctm = CombinedTM(bow_size=len(tp.vocab), contextual_size=768, n_components=10, num_epochs=20)
ctm.fit(training_dataset) 
    

I0804 17:01:00.791814 139824218896192 SentenceTransformer.py:41] Load pretrained SentenceTransformer: paraphrase-distilroberta-base-v1
I0804 17:01:00.792692 139824218896192 SentenceTransformer.py:45] Did not find folder paraphrase-distilroberta-base-v1
I0804 17:01:00.793036 139824218896192 SentenceTransformer.py:51] Search model on server: http://sbert.net/models/paraphrase-distilroberta-base-v1.zip
I0804 17:01:00.793786 139824218896192 SentenceTransformer.py:107] Load SentenceTransformer from folder: /home/reboud/.cache/torch/sentence_transformers/sbert.net_models_paraphrase-distilroberta-base-v1
I0804 17:01:02.539580 139824218896192 SentenceTransformer.py:131] Use pytorch device: cpu


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Epoch: [20/20]	 Seen Samples: [780/780]	Train Loss: 30018.79487179487	Time: 0:00:00.412341: : 20it [00:08,  2.42it/s] 


In [157]:
topics_predictions = ctm.get_thetas(training_dataset, n_samples=5)

Sampling: [5/5]: : 5it [00:01,  2.53it/s]


In [158]:
import numpy as np
topic_number = np.argmax(topics_predictions[13])
ctm.get_topic_lists(5)[topic_number]

['anderson', 'sybilperez', 'gwen', 'dralbertrobbins', 'alicia']

In [159]:
lda_vis_data = ctm.get_ldavis_data_format(tp.vocab, training_dataset,39)

Sampling: [39/39]: : 39it [00:15,  2.56it/s]


In [160]:
import pyLDAvis as vis
movies_pd = vis.prepare(**lda_vis_data)
vis.display(movies_pd)