In [29]:
import pandas as pd
import xml.etree.ElementTree as ET
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import MultiLabelBinarizer

import regex as re

In [2]:
# Sanity check that the corpus file is present. The handle is released right
# away: nothing reads from it, and `data` is rebound to a list of records when
# the XML is parsed.
with open('annot.opcorpora.no_ambig_strict.xml') as data:
    pass

In [3]:
# Read the whole corpus file as raw bytes; the bytes are handed to the XML
# parser unchanged.
xml_path = 'annot.opcorpora.no_ambig_strict.xml'
with open(xml_path, mode='rb') as f:
    xml_data = f.read()

In [4]:
# Flatten the corpus XML into a table with one row per <tfr> element found
# under a token: the sentence it belongs to, the token itself, and the
# token's grammemes joined into a single tag string.
root = ET.fromstring(xml_data)

data = []

for sentence in root.findall('.//sentence'):
    sentence_id = sentence.get("id")
    source_text = sentence.find("source").text

    for token in sentence.findall('.//token'):
        token_id = token.get("id")
        token_text = token.get("text")

        for tfr in token.findall('.//tfr'):
            # The "v" attributes of the nested <g> elements, in document
            # order, form the label, e.g. "NOUN, inan, femn, sing, nomn".
            g_tags_str = ', '.join(g.get("v") for g in tfr.findall('.//g'))

            data.append({
                'Sentence ID': sentence_id,
                'Source': source_text,
                'Token ID': token_id,
                'Token Text': token_text,
                'Tags': g_tags_str
            })

df = pd.DataFrame(data)

In [5]:
# Preview the first rows of the parsed token table.
df.head()

Unnamed: 0,Sentence ID,Source,Token ID,Token Text,Tags
0,1,«Школа злословия» учит прикусить язык,1,«,PNCT
1,1,«Школа злословия» учит прикусить язык,2,Школа,"NOUN, inan, femn, sing, nomn"
2,1,«Школа злословия» учит прикусить язык,3,злословия,"NOUN, inan, neut, sing, gent"
3,1,«Школа злословия» учит прикусить язык,4,»,PNCT
4,1,«Школа злословия» учит прикусить язык,5,учит,"VERB, impf, tran, sing, 3per, pres, indc"


In [6]:
# Encode every distinct tag string as an integer class id. `le` is kept so
# that predicted ids can be decoded back into tag strings later.
# NOTE(review): this overwrites df['Tags'] in place, so re-running the cell
# would fit the encoder on the integer ids instead of the original strings.
le = LabelEncoder()
df['Tags'] = le.fit_transform(df['Tags'])

In [7]:
# Keep only the columns used downstream. The explicit copy makes `df` an
# independent frame, so adding columns to it later does not trigger
# SettingWithCopyWarning.
df = df[['Source', 'Token Text', 'Tags']].copy()

In [8]:
# Lookup table: lowercased token -> list of encoded tags seen for that token
# in the corpus, in first-seen order and without duplicates.
tags_dict = {}

# Iterating the two columns directly avoids materialising a Series for every
# row, which is what makes iterrows() slow on a large frame.
for token_text, tags in zip(df['Token Text'], df['Tags']):
    seen_tags = tags_dict.setdefault(token_text.lower(), [])
    if tags not in seen_tags:
        seen_tags.append(tags)

In [9]:
def get_context(row, window=2):
    """Return the words surrounding a row's token inside its source sentence.

    The sentence (``row['Source']``) is split on whitespace and the first word
    equal to ``row['Token Text']`` is located. The result is that word together
    with up to ``window`` words on each side, joined by single spaces. An empty
    string is returned when no whitespace-separated word equals the token.
    """
    sentence_text = row['Source']
    target = row['Token Text']
    pieces = sentence_text.split()

    # Exact, case-sensitive match against whole whitespace-separated words
    if target not in pieces:
        return ''

    pos = pieces.index(target)
    left = max(0, pos - window)
    right = min(len(pieces), pos + window + 1)
    return ' '.join(pieces[left:right])

# Compute the context window for every row; axis=1 passes each row to
# get_context, and the result is stored in a new 'context' column.
df['context'] = df.apply(get_context, axis=1)

In [10]:
# Model input is the string "<token>: <context>" for every row.
df["text_feature"] = df["Token Text"] + ": " + df["context"]

# NOTE(review): with default settings the vectorizer's token pattern keeps
# only tokens of two or more word characters, so one-letter words would not
# produce features — confirm this is intended.
# NOTE(review): the vectorizer is fitted on all rows, before any train/test
# split is made.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df["text_feature"])
y = df["Tags"]

In [11]:
# Hold out 20% of the rows. The split is seeded with the same value as the
# classifier so that a fresh run of the notebook reproduces the same model.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [12]:
# Train a 50-tree random forest on the training split; the fixed
# random_state seeds the forest's own randomness.
model = RandomForestClassifier(n_estimators=50, random_state=42)
model.fit(X_train, y_train)

In [13]:
def get_context(source, token, window=2):
    """Return ``token`` with up to ``window`` neighbouring words on each side.

    ``source`` is split on whitespace and the first word equal to ``token`` is
    used as the centre of the window. The selected words are joined with
    single spaces. If no whitespace-separated word equals ``token``, an empty
    string is returned.

    Note: this definition replaces the earlier row-based ``get_context`` in
    the notebook namespace; the two take different arguments.
    """
    pieces = source.split()

    # Exact, case-sensitive match against whole whitespace-separated words
    if token not in pieces:
        return ''

    centre = pieces.index(token)
    lo = max(0, centre - window)
    hi = min(len(pieces), centre + window + 1)
    return ' '.join(pieces[lo:hi])

In [67]:
def predict_tag(token, source, all_tags=None, le=None):
    """Pick the most probable tag for ``token`` as it occurs in ``source``.

    The feature string is built as ``"<token>: <context>"``, the same layout
    as the training feature, and scored with the notebook-level ``vectorizer``
    and ``model``.

    Args:
        token: Word form to tag.
        source: Sentence the token comes from; used to extract its context.
        all_tags: Optional iterable of encoded tags the answer is restricted
            to. Candidates the model has no class for get probability 0.
        le: Optional fitted label encoder used to decode the winning tag.

    Returns:
        The decoded tag when ``le`` is given, the encoded tag when ``le`` is
        None, or None when ``all_tags`` is an empty collection.
    """
    context = get_context(source, token)
    combined_text = token + ': ' + context
    features = vectorizer.transform([combined_text])

    probs = model.predict_proba(features)[0]
    model_classes = model.classes_

    # Probability per class, in the model's own class order
    tag_probs = dict(zip(model_classes, probs))

    if all_tags is not None:
        # Cast candidates to the model's class type so dictionary lookups
        # match the keys taken from model.classes_.
        class_type = type(model_classes[0])
        all_tags = [class_type(tag) for tag in all_tags]
        tag_probs = {tag: tag_probs.get(tag, 0) for tag in all_tags}

    if not tag_probs:
        return None

    # On ties the first candidate in iteration order wins
    best_tag = max(tag_probs, key=tag_probs.get)

    # `le` defaults to None; without an encoder there is nothing to decode
    # with, so hand back the encoded tag instead of failing.
    if le is None:
        return best_tag
    return le.inverse_transform([best_tag])[0]

        
def unamb_sent(sentence):
    """Print a predicted tag for every whitespace-separated word of ``sentence``.

    Each word is stripped of punctuation and lowercased to look up its
    candidate tags in ``tags_dict``. Words with no entry are reported as
    ``UNKNOWN_TAG``; for the rest the tag is chosen by ``predict_tag`` among
    the candidates. One ``token :  tag`` line is printed per word.
    """
    for raw_token in sentence.split():
        # Punctuation is removed, but the original case is kept for the model
        # call: the context lookup compares words case-sensitively, so a
        # lowercased form of a capitalised word would never be found in the
        # sentence and its context would always be empty.
        surface = re.sub(r"\p{P}", '', raw_token)
        token = surface.lower()  # tags_dict keys are lowercased

        # Only a missing dictionary entry means "unknown"; errors raised while
        # predicting are no longer swallowed.
        allowed_tags = tags_dict.get(token)
        if allowed_tags is None:
            print(token, ": ", "UNKNOWN_TAG")
            continue

        tag = predict_tag(surface, sentence, allowed_tags, le)
        print(token, ": ", tag)

In [70]:
# Demo: tag a sample sentence; one "token :  tag" line is printed per word.
sentence = "Идет прекрасный прямой эфир"

unamb_sent(sentence)

идет :  VERB, impf, intr, sing, 3per, pres, indc
прекрасный :  UNKNOWN_TAG
прямой :  NOUN, inan, femn, sing, ablt
эфир :  NOUN, inan, masc, sing, accs


In [69]:
# Demo: a shorter sentence sharing the words "Идет", "прямой" and "эфир"
# with another sample — presumably to compare how the context affects tags.
sentence = "Идет прямой эфир"

unamb_sent(sentence)

идет :  VERB, impf, intr, sing, 3per, pres, indc
прямой :  NOUN, inan, femn, sing, ablt
эфир :  NOUN, inan, masc, sing, nomn


In [26]:
# Demo: "эфир" after a preposition.
sentence = "В эфир он вышел с улыбкой"

unamb_sent(sentence)

в :  PREP
эфир :  NOUN, inan, masc, sing, accs
он :  NPRO, masc, 3per, Anph, sing, nomn
вышел :  VERB, perf, intr, masc, sing, past, indc
с :  PREP
улыбкой :  NOUN, inan, femn, sing, ablt


In [49]:
# Demo: "утром" followed by punctuation, which is stripped before lookup.
sentence = "С добрым утром!"

unamb_sent(sentence)

с :  PREP
добрым :  ADJF, Qual, neut, sing, ablt
утром :  NOUN, inan, neut, sing, ablt


In [52]:
# Demo: "утром" again, in a different sentence.
sentence = "Этим утром я вернулась домой"

unamb_sent(sentence)

этим :  NPRO, neut, sing, ablt
утром :  ADVB
я :  NPRO, 1per, sing, nomn
вернулась :  VERB, perf, intr, femn, sing, past, indc
домой :  ADVB


In [19]:
# Demo: "человек" after a quantity word.
sentence = "Их было очень много человек"

unamb_sent(sentence)

их :  ADJF, Fixd, Apro, Anph, plur, nomn
было :  VERB, impf, intr, neut, sing, past, indc
очень :  ADVB
много :  ADVB, Prdx
человек :  NOUN, anim, masc, plur, gent


In [20]:
# Demo: "человек" again, in a different sentence.
sentence = "Он тот человек который умер"

unamb_sent(sentence)

он :  NPRO, masc, 3per, Anph, sing, nomn
тот :  ADJF, Subx, Apro, Anph, masc, sing, nomn
человек :  NOUN, anim, masc, sing, nomn
который :  ADJF, Apro, Subx, Anph, masc, sing, nomn
умер :  VERB, perf, intr, masc, sing, past, indc
