In [21]:
import pandas as pd
import re
from nltk.parse import DependencyGraph
from pymorphy2 import MorphAnalyzer
morph = MorphAnalyzer()
from statistics import mean
from tqdm import tqdm
import json
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go

In [22]:
from model import Model

# File name of the pre-trained model handed to the project-local `Model` wrapper.
UDPIPE_MODEL_FILE = 'russian-syntagrus-ud-2.0-170801.udpipe'
model = Model(UDPIPE_MODEL_FILE)
def get_conllu(model, text):
    """Run ``text`` through the pipeline of ``model`` and return its CoNLL-U dump.

    Args:
        model: object exposing ``tokenize``, ``tag``, ``parse`` and ``write``.
        text: raw text to analyse.

    Returns:
        Whatever ``model.write(sentences, "conllu")`` returns for the
        tokenized, tagged and parsed sentences.
    """
    tokenized = model.tokenize(text)
    for sentence in tokenized:
        # Each sentence is tagged first and parsed right after, one at a time.
        model.tag(sentence)
        model.parse(sentence)
    return model.write(tokenized, "conllu")

In [23]:
def get_dep_tree(text):
    """Split CoNLL-U text into one token table per sentence.

    Sentences are separated by a blank line. Within each sentence, empty lines
    and comment lines (starting with ``#``) are dropped.

    Args:
        text: CoNLL-U formatted string.

    Returns:
        A list of strings, one per sentence, each holding the remaining token
        lines joined by ``'\\n'``. Chunks that contain no token lines at all
        (for example the empty chunk produced by a trailing blank line, or a
        chunk made only of comments) are skipped instead of being returned as
        empty strings.
    """
    trees = []
    for sent in text.split('\n\n'):
        token_lines = [line for line in sent.split('\n') if line and not line.startswith('#')]
        if token_lines:
            trees.append('\n'.join(token_lines))
    return trees

In [24]:
def parse_sent(text):
    """Parse ``text`` and return its dependency triples sentence by sentence.

    The text is converted to CoNLL-U with the module-level ``model``, split
    into per-sentence token tables and loaded into NLTK dependency graphs.

    Args:
        text: raw text (one or more sentences).

    Returns:
        A list with one entry per parsed sentence; each entry is the list of
        triples produced by ``DependencyGraph.triples()`` when the traversal
        starts from the artificial node at address 0.
    """
    mods_sent = []
    for tree_str in get_dep_tree(get_conllu(model, text)):
        # The parses label the top relation 'root', while NLTK looks for 'ROOT'
        # by default and warns when it does not find it.
        graph = DependencyGraph(tree_str, top_relation_label='root')
        # Traverse from the artificial top node (address 0) so that the
        # relation leading to the sentence head is included in the triples.
        graph.root = graph.nodes[0]
        mods_sent.append(list(graph.triples()))
    return mods_sent

In [25]:
def lemmatize(text):
    """Return ``text`` with commas removed and every whitespace-separated token
    replaced by the ``normal_form`` of its first pymorphy2 parse.

    The resulting lemmas are joined with single spaces.
    """
    tokens = text.replace(',', '').split()
    return ' '.join(morph.parse(token)[0].normal_form for token in tokens)

In [26]:
# Load the frame collection; later cells read it as data[frame_num]['frames'].
with open('collection.json', mode='r', encoding='utf-8') as collection_file:
    data = json.load(collection_file)

In [52]:
# Load the paragraph table. Later cells read its columns by position
# (0, 1, 2, 5 and 7) and by the name 'frame_intersections'.
df = pd.read_csv('names_frames_new4.csv')

In [53]:
# Keep only the rows whose 'frame_intersections' cell is not the literal string '[]'.
has_intersections = df['frame_intersections'] != '[]'
df = df.loc[has_intersections]

In [54]:
# One record per table row; the fields are picked by column position.
all_text = [
    {'id': row[0], 'text': row[1], 'names': row[2], 'frame': row[5], 'frame_num': row[7]}
    for row in df.to_numpy()
]

In [55]:
# Records whose comma-separated 'names' field holds exactly one entry.
onename_texts = [entry for entry in all_text if len(entry['names'].split(',')) == 1]

In [56]:
# Every alias under which a character may appear, keyed by the canonical name
# used in `nsubj`. The lists contain both surface forms and title-cased
# pymorphy2 lemmas. No alias is shared between two characters.
CHARACTER_ALIASES = {
    'Стива': ['Степан', 'Облонский', 'Стив', 'Стива', 'Аркадьич'],
    'Анна': ['Анна', 'Аркадьевна', 'Каренина'],
    'Вронский': ['Вронский', 'Кириллович'],
    'Каренин': ['Каренин', 'Александрович', 'Алексей'],
    'Кити': ['Кити', 'Кить', 'Щербацкая', 'Екатерина'],
    'Левин': ['Левин', 'Костя', 'Константин'],
    'Долли': ['Долли', 'Дарья', 'Александровна', 'Облонская'],
}
ALIAS_TO_CHARACTER = {
    alias: character
    for character, aliases in CHARACTER_ALIASES.items()
    for alias in aliases
}

left = []  # not used in this cell; kept in case another cell reads it
nsubj = {character: [] for character in CHARACTER_ALIASES}

for para in tqdm(onename_texts):
    # 'frame' and 'frame_num' hold list literals stored as strings; strip the
    # brackets and quotes and split on ', ' to recover the items.
    frs = para['frame'].replace("['", '').replace("'", '').replace("]", '').split(', ')
    nums = para['frame_num'].replace('[', '').replace("'", '').replace("]", '').split(', ')

    # Sentence splitting and lemmatization do not depend on the frame, so they
    # are done once per paragraph. Raw string: '\.' and '\?' are regex escapes.
    text = para['text']
    sents = re.split(r'\.|…|\?|!', text)
    sent_lemmas = [lemmatize(s) for s in sents]
    parses = {}  # sentence -> parse_sent(sentence), filled lazily

    for i, frame in enumerate(frs):
        frame_words = frame.split()
        for s, lemmas in zip(sents, sent_lemmas):
            if frame in lemmas:
                # The whole frame occurs in the lemmatized sentence: the
                # subject is matched by its surface form.
                target, lemmatize_subject = frame, False
            elif len(frame) > 1 and frame_words and frame_words[0] in lemmas:
                # Only the first word of the frame occurs: the subject is
                # matched by its lemma.
                target, lemmatize_subject = frame_words[0], True
            else:
                continue

            if s not in parses:
                parses[s] = parse_sent(s)
            parsed = parses[s]

            for d in parsed:
                for dep in d:
                    # Only subjects are collected, as everywhere else `nsubj`
                    # is filled. NOTE(review): the first-word fallback used to
                    # have no relation check but could never match (see
                    # below); confirm that restricting it to 'nsubj' is the
                    # intended behaviour.
                    if dep[1] != 'nsubj':
                        continue
                    if target != morph.parse(str(dep[0][0]))[0].normal_form:
                        continue
                    subject = dep[2][0]
                    if lemmatize_subject:
                        # normal_form is lowercase, the aliases are
                        # capitalised: title-case before the lookup, otherwise
                        # nothing can ever match.
                        subject = morph.parse(str(subject))[0].normal_form.title()
                    character = ALIAS_TO_CHARACTER.get(subject)
                    if character is not None:
                        nsubj[character].append({'id': para['id'], 'FRAME_NUM': nums[i]})


The graph doesn't contain a node that depends on the root element.

100%|██████████████████████████████████████████████████████████████████████████████| 1752/1752 [04:02<00:00,  7.21it/s]


In [33]:
# Flat list of every character alias, grouped by character.
list_names = [
    # Stiva
    'Степан', 'Облонский', 'Стив', 'Стива', 'Аркадьич',
    # Anna
    'Анна', 'Аркадьевна', 'Каренина',
    # Levin
    'Левин', 'Костя', 'Константин',
    # Kitty
    'Кити', 'Кить', 'Щербацкая', 'Екатерина',
    # Dolly
    'Долли', 'Дарья', 'Александровна', 'Облонская',
    # Vronsky
    'Вронский', 'Кириллович',
    # Karenin
    'Каренин', 'Александрович', 'Алексей',
]

In [60]:
# Exclusive upper bounds on the record id for buckets 'i'..'vii'; ids at or
# above the last bound go to 'viii'. (Presumably the eight parts of the novel;
# verify the boundaries against the corpus.)
part_bounds = [
    (1170, 'i'), (2409, 'ii'), (3323, 'iii'), (4152, 'iv'),
    (5121, 'v'), (6318, 'vi'), (7265, 'vii'),
]
part_labels = [label for _, label in part_bounds] + ['viii']

new_nsubj = {}
for name in nsubj.keys():
    per_part = {label: [] for label in part_labels}
    for record in nsubj[name]:
        record_id = record['id']
        # First bucket whose bound exceeds the id, falling back to the last one.
        label = next((lab for bound, lab in part_bounds if record_id < bound), 'viii')
        per_part[label].append(record['FRAME_NUM'])
    new_nsubj[name] = per_part

In [61]:
# For every character, average the signed frame scores per bucket and keep the
# (bucket number, mean) pairs as parallel 'x' / 'y' lists.
cs = {'Стива': {}, 'Анна': {}, 'Вронский': {}, 'Каренин': {}, 'Кити': {}, 'Левин': {}, 'Долли': {}}
parts = {'i': 1, 'ii': 2, 'iii': 3, 'iv': 4, 'v': 5, 'vi': 6, 'vii': 7, 'viii': 8}
for char, part in new_nsubj.items():
    pers_x, pers_y = [], []
    for elem, my_list in part.items():
        num_part = parts[elem]
        this_part = []
        for num in my_list:
            all_ef = data[num]['frames']
            if 'effect' in all_ef:
                # 'effect' takes precedence; its sign is '+' or a prefix that
                # is glued in front of the magnitude.
                a = all_ef['effect'][0]
                ef_st = float(a[2]) if a[1] == '+' else float(str(a[1]) + str(a[2]))
            elif 'state' in all_ef:
                # 'state' is negative unless marked 'pos'.
                a = all_ef['state'][0]
                ef_st = float(a[2]) if a[1] == 'pos' else float('-' + str(a[2]))
            else:
                ef_st = 0
            if ef_st != 0:
                this_part.append(ef_st)
        # Buckets with fewer than two non-zero scores are left out of the plot data.
        if len(this_part) > 1:
            pers_y.append(mean(this_part))
            pers_x.append(num_part)
    cs[char] = {'x': pers_x, 'y': pers_y}

In [68]:
# Flatten the per-character series into three lists: the x values, the y
# values, and the character name repeated once per x value.
whole_x, whole_y, name = [], [], []
for character, series in cs.items():
    whole_x.extend(series['x'])
    name.extend([character] * len(series['x']))
    whole_y.extend(series['y'])

In [66]:
# Plot #3: one scatter subplot per character on a 4x2 grid with shared axes.
fig = make_subplots(
    rows=4,
    cols=2,
    shared_xaxes='all',
    shared_yaxes='all',
    subplot_titles=['Стива', 'Левин', 'Долли', 'Кити', 'Анна', 'Каренин', 'Вронский'],
)

# (character, row, col, extra Scatter arguments), in the order the traces are added.
trace_specs = [
    ('Стива', 1, 1, {'text': cs['Стива']['y'], 'textposition': 'top right'}),
    ('Долли', 2, 1, {}),
    ('Анна', 3, 1, {}),
    ('Вронский', 4, 1, {}),
    ('Левин', 1, 2, {}),
    ('Кити', 2, 2, {}),
    ('Каренин', 3, 2, {}),
]
for character, row, col, extra in trace_specs:
    fig.add_trace(
        go.Scatter(x=cs[character]['x'], y=cs[character]['y'], mode='markers', **extra),
        row=row, col=col
    )

fig.update_layout(height=600, width=800, title_text="Состояния героев романа исходя из фреймов с одним именем")
fig.show()

In [69]:
# Plot #3 (alternative view): a single line chart of the flattened series,
# with one colour per character name taken from `name`.
import plotly.express as px
fig = px.line(x=whole_x, y=whole_y, color=name)
fig.show()

план
1. перебираем фреймы
2. если в ролях хотя бы одно имя, сохраняем
3. глазами просматриваем второго участника
4. если два имени, сразу сохраняем все нужные данные
5. если второе - не имя, то смотрим по ситуации
6. потом уже анализируем падежи в словаре

остановилась на 0_76

In [57]:
# The file holds a JSON array; only its first element is used. Later cells read
# it as roles[frame_num][role_name][0].
with open('roles.json', mode='r', encoding='utf-8') as roles_file:
    roles = json.load(roles_file)[0]

In [58]:
# Moving on to the "suffering" part: paragraphs that mention more than one name.
# For every frame carrying 'polarity' annotations this cell
#   * records proper-noun subjects of the frame's head word in `nsubj`, and
#   * when at least two proper-noun arguments are found, stores the attitude
#     between them in `attits[first_lemma][second_lemma]`.

# Alias tables are defined locally so that this cell does not depend on alias
# tables defined in any other cell.
CHARACTER_ALIASES = {
    'Стива': ['Степан', 'Облонский', 'Стив', 'Стива', 'Аркадьич'],
    'Анна': ['Анна', 'Аркадьевна', 'Каренина'],
    'Вронский': ['Вронский', 'Кириллович'],
    'Каренин': ['Каренин', 'Александрович', 'Алексей'],
    'Кити': ['Кити', 'Кить', 'Щербацкая', 'Екатерина'],
    'Левин': ['Левин', 'Костя', 'Константин'],
    'Долли': ['Долли', 'Дарья', 'Александровна', 'Облонская'],
}
ALIAS_TO_CHARACTER = {
    alias: character
    for character, aliases in CHARACTER_ALIASES.items()
    for alias in aliases
}


def _head_matches_frame(frame, dep):
    """True when the head word of ``dep`` lemmatizes to the frame or to its first word."""
    head_lemma = morph.parse(str(dep[0][0]))[0].normal_form
    return frame == head_lemma or frame.split()[0] == head_lemma


def _proper_noun_arguments(triples, frame):
    """Return the PROPN nsubj/obj/obl triples attached to the frame's head word(s)."""
    roots = [
        dep[0] for dep in triples
        if dep[1] in ('nsubj', 'obj') and _head_matches_frame(frame, dep)
    ]
    found = []
    # dict.fromkeys de-duplicates while keeping first-seen order; iterating a
    # set here would make the order of the result depend on string hashing.
    for root in dict.fromkeys(roots):
        for dep in triples:
            if dep[0] == root and dep[1] in ('nsubj', 'obj', 'obl') and dep[2][1] == 'PROPN':
                found.append(dep)
    return found


def _record_subjects(triples, frame, text_id, frame_num):
    """Append a record to ``nsubj`` for every known character that is a PROPN subject of the frame."""
    for dep in triples:
        if dep[1] != 'nsubj' or dep[2][1] != 'PROPN':
            continue
        if not _head_matches_frame(frame, dep):
            continue
        # normal_form is lowercase; the alias table is capitalised.
        alias = morph.parse(dep[2][0])[0].normal_form.title()
        character = ALIAS_TO_CHARACTER.get(alias)
        if character is not None:
            nsubj[character].append({'id': text_id, 'FRAME_NUM': frame_num})


attits = {}
for t in tqdm(all_text):
    if len(t['names'].split(',')) <= 1:
        continue
    text = t['text']
    # 'frame' and 'frame_num' hold list literals stored as strings.
    frs = t['frame'].replace("['", '').replace("'", '').replace("]", '').split(', ')
    nums = t['frame_num'].replace('[', '').replace("'", '').replace("]", '').split(', ')
    # Parsed lazily and at most once per paragraph. Every branch below works on
    # the parse of THIS text; the single-animate branch previously iterated
    # whatever `parsed` was left over from an earlier paragraph or cell.
    parsed = None

    for i, frame in enumerate(frs):
        framed = []
        num = nums[i]
        if 'polarity' not in data[num]['frames'].keys():
            continue

        # Number of roles of this frame whose first annotation is 'ANIM'.
        animacies = sum(1 for role in roles[num].values() if role[0] == 'ANIM')
        if animacies >= 1:
            if parsed is None:
                parsed = parse_sent(text)
            for d in parsed:
                if animacies > 1:
                    framed.extend(_proper_noun_arguments(d, frame))
                _record_subjects(d, frame, t['id'], num)

        if len(framed) > 1:
            this_atts = {}
            for attitude in data[num]['frames']['polarity']:
                if 'author' in attitude:
                    continue
                pers1 = attitude[0]
                pers2 = attitude[1]
                if not (roles[num][pers1][0] == 'ANIM' and roles[num][pers2][0] == 'ANIM'):
                    continue
                chars = [morph.parse(str(a[2][0]))[0].normal_form for a in framed]
                if len(set(chars)) <= 1:
                    # All collected arguments are the same lemma: no pair to relate.
                    continue
                # Put the subject first when the list does not start with one.
                if framed[0][1] != 'nsubj':
                    framed = list(reversed(framed))
                # The role -> word binding is fixed by the first qualifying attitude.
                if not this_atts:
                    for position, role_name in enumerate(attitude[:2]):
                        this_atts[role_name] = framed[position][2][0]
                if attitude[0] in this_atts.keys() and attitude[1] in this_atts.keys():
                    first = morph.parse(str(this_atts[attitude[0]]))[0].normal_form
                    second = morph.parse(str(this_atts[attitude[1]]))[0].normal_form
                    attits.setdefault(first, {}).setdefault(second, []).append(attitude[-2:])

# TODO (original author's note): this is where the results get added to state_effect.

100%|██████████████████████████████████████████████████████████████████████████████| 2300/2300 [03:57<00:00,  9.69it/s]


In [45]:
# Display the collected attitudes: attits[first_lemma][second_lemma] is the list
# of `attitude[-2:]` slices gathered for that ordered pair.
attits

{'левин': {'облонской': [['pos', 1.0]],
  'анна': [['neg', 1.0]],
  'кить': [['neg', 1.0], ['pos', 0.7], ['pos', 0.7], ['pos', 1.0]],
  'весловский': [['pos', 1.0]],
  'степан': [['pos', 0.7]],
  'москва': [['pos', 1.0]]},
 'долли': {'анна': [['pos', 1.0]],
  'левин': [['pos', 0.7]],
  'кить': [['neg', 1.0]]},
 'анна': {'левин': [['neg', 1.0]],
  'вронский': [['pos', 1.0], ['pos', 1.0], ['neg', 1.0]],
  'яшвино': [['neg', 1.0]],
  'тушкевич': [['pos', 1.0]]},
 'вронский': {'анна': [['pos', 1.0], ['neg', 1.0], ['pos', 0.7]],
  'кить': [['neg', 1.0]],
  'анне': [['neg', 1.0]],
  'лидия': [['neg', 0.7]]},
 'кить': {'левин': [['neg', 1.0], ['pos', 1.0], ['pos', 1.0], ['neg', 1.0]],
  'вронский': [['neg', 1.0]],
  'щербацкий': [['neg', 1.0]],
  'долли': [['neg', 1.0]],
  'аркадьич': [['pos', 0.7]]},
 'анне': {'вронский': [['neg', 1.0]], 'алексей': [['neg', 1.0]]},
 'москва': {'депутация': [['neg', 1.0]], 'левин': [['pos', 0.7]]},
 'депутация': {'москва': [['neg', 1.0]]},
 'алексей': {'алекс

In [47]:
#делаем отдельную таблицу по персонажам за всю книгу
ccs = {'Стива': {}, 'Анна': {}, 'Вронский': {}, 'Каренин': {}, 'Кити': {}, 'Левин': {}, 'Долли': {}}
for char, part in nsubj.items():
    states = []
    for state in part:
        num = state['FRAME_NUM']
        ef_st = 0
        all_ef = data[num]['frames']
   #     print(all_ef)
        if 'effect' in all_ef.keys():
            a = all_ef['effect'][0]
            if a[1] == '+':
                ef_st = float(a[2])
            else:
                ef_st = float(str(a[1]) + str(a[2]))
        elif 'state' in all_ef.keys():
            a = all_ef['state'][0]
            if a[1] == 'pos':
                ef_st = float(a[2])
            else:
                ef_st = float('-' + str(a[2]))
        if ef_st != 0:
            states.append(ef_st)
    ccs[char] = {'количество': len(states), 'state': mean(states)}