# Проект 693: 
# Извлечение графовых представлений сценариев фильмов и их анализ

## Аннотация

Предполагается построить динамический граф сюжета: выявить персонажей, а также определить тип и тональность отношений между ними. Базовая задача — сравнение и кластеризация графов как способ анализа построенных данных. Также предполагается исследовать подходы к генерации новых сюжетов и построить архитектуру нейросети для генерации динамических атрибутивных графов сюжетов с применением GAN на графах.

## Задачи

* **Собрать датасет по динамическим графам, описывающим эволюцию сюжетов**
* **Построить эффективное векторное представление, позволяющее сравнивать структурную и атрибутивную схожесть сюжетов в графовом представлении**
* Провести анализ кластеризации сюжетов
* Рассмотреть возможности генерации графового представления сюжетов и оценить качество практического применения подобной модели

### Собрать персонажей

Важно:

* пол персонажа (определять либо по словарю имён, либо собственной реализацией; готовые решения для этого уже встречались)

* выделение перво- и второстепенных персонажей (обычно топ-5 персонажей)

### Графы

Вершины двух типов:
* локации
* персонажи

In [None]:
# This notebook runs in Colab, so Google Drive has to be mounted to access its files.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import json
import os
import networkx as nx
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
from sklearn.cluster import KMeans
import json
import re
from collections import Counter

In [None]:
'''
!ls /content/drive/My\ Drive/Colab\ Notebooks/*.py
'''

'\n!ls /content/drive/My\\ Drive/Colab\\ Notebooks/*.py\n'

In [None]:
'''
!cat '/content/drive/My Drive/Colab Notebooks/json_helper.py'
'''

"\n!cat '/content/drive/My Drive/Colab Notebooks/json_helper.py'\n"

In [None]:
'''
import sys
sys.path.append('/content/drive/My Drive')
'''

"\nimport sys\nsys.path.append('/content/drive/My Drive')\n"

In [None]:
'''
!python3 "/content/drive/My Drive/Colab Notebooks/json_helper.py"
'''

'\n!python3 "/content/drive/My Drive/Colab Notebooks/json_helper.py"\n'

In [None]:
'''
import json_helper as jhlp
'''

'\nimport json_helper as jhlp\n'

In [None]:
# Upload json_helper.py through the Colab file picker and store a copy on Drive.
from google.colab import files
src = list(files.upload().values())[0]  # content of the first uploaded file
# Write through a context manager so the handle is flushed and closed right
# away; the previous `open(...).write(src)` never closed the file.
with open('/content/drive/My Drive/Colab Notebooks/json_helper.py', 'wb') as helper_file:
    helper_file.write(src)
import json_helper

Saving json_helper.py to json_helper (1).py


In [None]:
import json_helper as jhlp

## Персонажи (пробный вариант для одного сценария)

### Работа со сценарием

парсер

``` parser.py ```

In [None]:
'''
import json
import re

def script_by_scenes(path_to_script):
    """Split a screenplay text file into scenes built from header-led portions.

    Every line is stripped of ``<...>`` tags and surrounding whitespace.
    A new portion starts at each line whose letters are all upper-case,
    unless the line contains a continuation marker or starts with ``(``.
    The first line of a portion (with ``(...)`` removed) is its header.
    Headers seen more than 10 times (and not containing ``CUT``) are
    treated as character names; every other header is a scene name.
    A new scene starts at each portion whose header is a scene name.

    Args:
        path_to_script: path of the screenplay text file.

    Returns:
        A tuple ``(scenes, characters, scene_names)`` where ``scenes`` is a
        list of ``(scene_id, portions)`` pairs (ids start at 1, each portion
        is a list of lines), and the other two are lists of header strings.
    """
    from collections import Counter

    tag_re = re.compile(r'<.*?>')
    paren_re = re.compile(r'\(.*?\)')
    continuation_markers = ('Cont', 'cont', 'CONTINUED', "CONT'D")

    # The context manager closes the file; it used to be left open.
    with open(path_to_script, 'r') as fp:
        script = [tag_re.sub('', raw_line).strip() for raw_line in fp]

    def extra_conditions(string):
        # True when the line may start a new portion.
        # The old test `(find(a) or find(b) ...) != -1` short-circuited on the
        # first result (-1 is truthy), so only the first marker was checked.
        if any(marker in string for marker in continuation_markers):
            return False
        return not string.startswith('(')

    portions = []
    portion_text = []
    last_line_id = len(script) - 1
    for line_id, line in enumerate(script):
        portion_text.append(line)
        if line_id == last_line_id:
            portions.append(portion_text)
            break
        next_line = script[line_id + 1]
        letters = ''.join(ch for ch in next_line if ch.isalpha())
        # Close the current portion when the next line looks like a header.
        if letters.isupper() and extra_conditions(next_line):
            portions.append(portion_text)
            portion_text = []

    # Drop portions whose header line is empty (e.g. leading blank lines).
    portions = [p for p in portions if p[0] != '']

    # Count headers once; the previous zip() iterator was exhausted after
    # building `characters`, leaving `scene_names` empty on Python 3.
    header_counts = Counter(paren_re.sub('', p[0]).strip() for p in portions)
    characters = [name for name, count in header_counts.items()
                  if count > 10 and 'CUT' not in name]
    scene_names = [name for name in header_counts if name not in characters]
    scene_name_set = set(scene_names)

    scenes = []
    scene = []
    scene_id = 1
    last_index = len(portions) - 1
    # Look ahead by list position; the old code indexed with portion ids that
    # no longer matched positions after the filtering above.
    for index, portion in enumerate(portions):
        scene.append(portion)
        if index < last_index:
            next_name = paren_re.sub('', portions[index + 1][0]).strip()
            if next_name not in scene_name_set:
                continue
        # Reached a scene boundary or the end of the script: emit the scene
        # (the trailing scene used to be discarded).
        scenes.append((scene_id, scene))
        scene_id += 1
        scene = []

    return scenes, characters, scene_names
    
def parse_data(path_to_script):
    """Parse one screenplay into per-scene descriptions and dialogues.

    Calls ``script_by_scenes(path_to_script)`` and, for every scene, builds a
    dict with two lists:

    * ``'scene_descriptons_list'``: ``[part_id, text]`` entries for the text
      under the scene heading and for text following a dialogue;
    * ``'char_dialogues'``: ``[part_id, character_name, dialogue_text]``.

    Returns:
        A tuple ``(all_scenes, ratio)``. ``all_scenes`` maps scene id to the
        dict above; ``ratio`` is the fraction of scenes that contain at least
        one dialogue, or ``-1`` when no scenes were found.
    """
    scenes, list_of_chars, list_of_scenes = script_by_scenes(path_to_script)
    all_scenes = dict()
    cntr = 0  # number of scenes with at least one dialogue

    for scene_content in scenes:
        # The entries are collected directly into the two result lists. The
        # previous version stored them in an intermediate dict and read it
        # back with `v.values()[0]` / `v.keys()[0]`, which raises TypeError on
        # Python 3 (dict views are not subscriptable).
        scene_desc_list = list()
        char_dialogues = list()
        scene_part_id = 1

        for scene_part in scene_content[1]:

            if scene_part_id == 1:
                # First part of a scene: heading line followed by description.
                scene_part_content = "".join(sp + " " for sp in scene_part[1:])
                scene_desc_list.append([scene_part_id, scene_part_content])
                scene_part_id += 1
                continue

            scene_part_name = re.sub(r'\(.*?\)', '', scene_part[0]).strip()
            if scene_part_name not in list_of_chars:
                # NOTE(review): this abandons the rest of the scene as soon as
                # a non-character header appears; confirm `break` (rather than
                # `continue`) is intended.
                break

            dialogue = ""
            inside_parenthetical = False
            index = -1  # position in scene_part of the first blank line
            for idx, sp in enumerate(scene_part[1:]):
                if sp == '':
                    index = idx + 1
                    break
                if sp[0] == '(' or inside_parenthetical:
                    # A parenthetical may span several lines; it ends (with a
                    # newline) on the line containing the closing bracket.
                    closed = ')' in sp
                    inside_parenthetical = not closed
                    dialogue += sp + ('\n' if closed else ' ')
                else:
                    dialogue += sp + ' '

            char_dialogues.append([scene_part_id, scene_part_name, dialogue])
            scene_part_id += 1

            if index + 1 >= len(scene_part):
                # NOTE(review): this also stops processing the remaining parts
                # of the scene; confirm that is the intended behavior.
                break

            # Everything after the blank line is kept as a description.
            # NOTE(review): when no blank line was found (index == -1) this
            # covers the whole part, header and dialogue included; confirm.
            desc_following_dialogue = "".join(sp + " " for sp in scene_part[index + 1:])
            scene_desc_list.append([scene_part_id, desc_following_dialogue])
            scene_part_id += 1

        scene_formatted = dict()
        # Key spelling kept as is: it is part of the emitted JSON.
        scene_formatted['scene_descriptons_list'] = scene_desc_list
        scene_formatted['char_dialogues'] = char_dialogues
        if char_dialogues:
            cntr += 1
        all_scenes[scene_content[0]] = scene_formatted

    print("All Scenes Processed")

    if len(all_scenes) == 0:
        return all_scenes, -1

    print(cntr*1.0/len(all_scenes))

    return all_scenes, cntr*1.0/len(all_scenes)

import os

# Scripts are read from, and the JSON results written to, this one folder.
json_dir = '/content/drive/My Drive/Colab Notebooks/json'

# Create the folder before listing it; listing first raised when the folder
# did not exist yet.
os.makedirs(json_dir, exist_ok=True)
scripts = os.listdir(json_dir)

for cntr, script in enumerate(scripts):
    # Remove only a trailing ".txt". str.strip('.txt') strips any leading or
    # trailing '.', 't' and 'x' characters, so "taxi.txt" became "axi".
    script_name = script[:-len('.txt')] if script.endswith('.txt') else script
    print("Parsing Script %s : %d/%d" % (script_name, cntr+1, len(scripts)))
    processed_scenes, flag = parse_data('%s/%s' % (json_dir, script))
    # Keep a script only when at least 75% of its scenes contain dialogue.
    if flag >= 0.75:
        with open('%s/%s.json' % (json_dir, script_name), 'w') as fp:
            json.dump(processed_scenes, fp)

'''

'\nimport json\nimport re\n\ndef script_by_scenes(path_to_script):\n    \n    fp=open(path_to_script, \'r\')\n    script=fp.readlines()\n    \n    script=[re.sub(\'<.*?>\', \'\', s).rstrip(\'\n\').rstrip(\'\r\').strip() for s in script]\n    \n    def extra_conditions(string):\n        if( (string.find(\'Cont\') or string.find(\'cont\') or string.find(\'CONTINUED\') or string.find("CONT\'D")) != -1):\n            flag=0\n        elif(string[0] == \'(\' != -1):\n            flag=0\n        else:\n            flag=1\n        return flag\n    \n    portions=list()\n    portion_id=0\n    portion_text=list()\n    \n    for line_id, line in enumerate(script):\n        portion_text.append(line)\n        \n        if(line_id == len(script)-1):\n            portions.append((portion_id, portion_text))\n            portion_text=list()\n            portion_id += 1\n            break\n        \n        tmp_line=\'\'.join(e for e in script[line_id+1] if e.isalpha())\n\n        if(tmp_line.isupper() 

``` chars_scenes.py ```

In [None]:
'''
import json
import re
from collections import Counter

def script_by_scenes(path_to_script):
    """Return the sequence of speaking characters for each scene of a script.

    Lines are cleaned of ``<...>`` tags and of ``(...)`` / ``[...]`` groups.
    Only upper-case lines are kept. An upper-case line that occurs at least
    5 times and contains none of ``CUT``, ``INT.``, ``EXT.`` is treated as a
    character name; any other upper-case line closes the current scene.

    Args:
        path_to_script: path of the screenplay text file.

    Returns:
        A dict mapping scene number (starting at 1) to the list of character
        names in order of appearance. Empty scenes are not included.
    """
    tag_re = re.compile(r'<.*?>')
    bracket_re = re.compile(r'[\(\[].*?[\)\]]')

    # The context manager closes the file; it used to be left open.
    with open(path_to_script, 'r') as fp:
        script = [bracket_re.sub('', tag_re.sub('', raw_line).strip()).strip()
                  for raw_line in fp]

    scene_break = [line for line in script if line.isupper()]

    # A set gives constant-time membership tests in the loop below.
    characters = {
        name for name, count in Counter(scene_break).items()
        if count >= 5
        and 'CUT' not in name and 'INT.' not in name and 'EXT.' not in name
    }

    scenes = dict()
    cntr = 1
    scenes[cntr] = list()
    for line in scene_break:
        if line in characters:
            scenes[cntr].append(line)
        elif scenes[cntr]:
            # Non-character header: open a new scene unless the current one
            # is still empty.
            cntr += 1
            scenes[cntr] = list()
    if not scenes[cntr]:
        del scenes[cntr]

    return scenes
   

import os
scripts = os.listdir('...')

# Make sure the output folder exists (no error if it is already there).
os.makedirs('/content/drive/My Drive/Colab Notebooks/json', exist_ok=True)

for cntr, script in enumerate(scripts):
    # Remove only a trailing ".txt". str.strip('.txt') strips any leading or
    # trailing '.', 't' and 'x' characters, so "taxi.txt" became "axi".
    script_name = script[:-len('.txt')] if script.endswith('.txt') else script
    print("Parsing Script %s : %d/%d" % (script_name, cntr+1, len(scripts)))
    # Parse before opening the output file so a parsing error does not leave
    # an empty JSON file behind.
    scenes = script_by_scenes('.../%s' % script)
    with open('/content/drive/My Drive/Colab Notebooks/json/%s.json' % script_name, 'w') as fp:
        json.dump(scenes, fp)
'''

'\nimport json\nimport re\nfrom collections import Counter\n\ndef script_by_scenes(path_to_script):\n    \n    fp=open(path_to_script, \'r\')\n    script=fp.readlines()\n    script=[re.sub("[\\(\\[].*?[\\)\\]]", "", re.sub(\'<.*?>\', \'\', s).rstrip(\'\n\').rstrip(\'\r\').strip()).rstrip(\'\n\').rstrip(\'\r\').strip() for s in script]\n    scene_break=list()\n    scene_break=[l for l in script if l.isupper()]\n    \n    list_of_chars=dict(Counter(scene_break))\n    list_of_chars=[k for k,v in list_of_chars.items() if (v>=5 and (k.find(\'CUT\') == -1 and k.find(\'INT.\') == -1 and k.find(\'EXT.\') == -1 ))]\n    scenes = dict()\n    cntr=1\n    scenes[1]=list()\n    for l in scene_break:\n        if l in list_of_chars:\n            scenes[cntr].append(l)\n        else:\n            if len(scenes[cntr])!=0:\n                cntr += 1\n                scenes[cntr]=list()\n    if len(scenes[cntr])==0:\n        del scenes[cntr]\n            \n    return scenes\n   \n\nimport os\nscripts=os.

Персонажи

In [None]:
# Load every parsed screenplay stored as JSON in the Drive folder.
json_path = "/content/drive/My Drive/Colab Notebooks/json"
# Only *.json entries are loaded, in sorted order so runs are reproducible.
# NOTE(review): this cell previously failed with UnicodeDecodeError; the
# folder presumably also holds non-JSON files (raw scripts) - confirm.
movie_jsons = [
    os.path.join(json_path, json_name)
    for json_name in sorted(os.listdir(json_path))
    if json_name.endswith('.json')
]
script_dicts = []
for movie_json in movie_jsons:
    # JSON text is decoded explicitly as UTF-8 instead of the locale default.
    with open(movie_json, encoding='utf-8') as f:
        script_dicts.append(json.load(f))

UnicodeDecodeError: ignored

In [None]:
def get_chars_for_scene(scene, all_char_list):
    """Find the known characters present in one scene.

    Returns two lists: the entries of ``all_char_list`` contained in the
    scene description (as returned by ``jhlp.get_description_for_scene``),
    and the entries found at index 1 of the ``scene['char_dialogues']``
    records.
    """
    description = jhlp.get_description_for_scene(scene)
    mentioned = {name for name in all_char_list if name in description}
    speakers = {
        record[1]
        for record in scene['char_dialogues']
        if record[1] in all_char_list
    }
    return list(mentioned), list(speakers)

In [None]:
# Graph formation method I.
# A scene interaction is registered when two characters co-occur in a scene's
# dialogues; the edge weight counts the scenes in which the pair co-occurs.
# A better approach is used later; this one gives great results too.
from itertools import combinations

graphs_I = []
for script_dict in script_dicts:
    all_char_list = jhlp.get_all_char_from_script(script_dict)
    graph = nx.Graph()
    for key in script_dict:
        scene = script_dict[key]
        a, b = get_chars_for_scene(scene, all_char_list)
        # Visit each unordered pair once. The previous nested loop over b x b
        # saw every pair in both orders, so each scene added 2 to the weight
        # of the (undirected) edge instead of 1.
        for char1, char2 in combinations(b, 2):
            if graph.has_edge(char1, char2):
                graph[char1][char2]['weight'] += 1
            else:
                graph.add_edge(char1, char2, weight=1)
    graphs_I.append(graph)

In [None]:
# Apply matplotlib's 'ggplot' style to the plots drawn below.
plt.style.use('ggplot')

In [None]:
# Load the parsed screenplays of a few selected movies.
movies = ['Inception', 'Gravity', 'Gladiator']
json_path = "/content/drive/My Drive/Colab Notebooks/json"
movie_jsons = [os.path.join(json_path, movie + '.json') for movie in movies]
script_dicts = []
for movie_json in movie_jsons:
    # JSON text is decoded explicitly as UTF-8 instead of the locale default.
    with open(movie_json, encoding='utf-8') as f:
        script_dicts.append(json.load(f))

In [None]:
def get_chars_for_scene(scene, all_char_list):
    """Return the characters of a scene as (from description, from dialogue).

    The first list holds the entries of ``all_char_list`` contained in the
    value of ``jhlp.get_description_for_scene(scene)``; the second holds the
    entries found at index 1 of the ``scene['char_dialogues']`` records.
    """
    scene_text = jhlp.get_description_for_scene(scene)
    in_description = set(filter(lambda name: name in scene_text, all_char_list))

    in_dialogue = set()
    for record in scene['char_dialogues']:
        speaker = record[1]
        if speaker in all_char_list:
            in_dialogue.add(speaker)

    return list(in_description), list(in_dialogue)

In [None]:
# Graph formation method I: connect characters that co-occur in a scene's
# dialogues; the edge weight counts the scenes in which the pair co-occurs.
# NOTE(review): this cell last failed with a TypeError whose cause is not
# visible here - possibly the loaded JSON does not have the per-scene layout
# that get_chars_for_scene expects; confirm against the data.
from itertools import combinations

graphs = []
for script_dict in script_dicts:
    all_char_list = jhlp.get_all_char_from_script(script_dict)
    graph = nx.Graph()
    for key in script_dict:
        scene = script_dict[key]
        a, b = get_chars_for_scene(scene, all_char_list)
        # Visit each unordered pair once. The previous nested loop over b x b
        # saw every pair in both orders, so each scene added 2 to the weight
        # of the (undirected) edge instead of 1.
        for char1, char2 in combinations(b, 2):
            if graph.has_edge(char1, char2):
                graph[char1][char2]['weight'] += 1
            else:
                graph.add_edge(char1, char2, weight=1)
    graphs.append(graph)

TypeError: ignored

In [None]:
# Draw every character graph; edge thickness is the edge weight.
for graph in graphs:
    edge_weights = [graph[u][v]['weight'] for u, v in graph.edges()]
    pos = nx.spectral_layout(graph)
    plt.figure(figsize=(10, 10))
    # Pass the computed layout to the drawing call; it used to be computed
    # and then ignored, so the default layout was drawn instead.
    nx.draw_networkx(graph, pos=pos, with_labels=True, width=edge_weights, alpha=0.5)
    plt.show()

NameError: ignored