In [173]:
import pandas as pd
from collections import Counter
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer

In [150]:
!wget https://github.com/UniversalDependencies/UD_Russian-SynTagRus/raw/master/ru_syntagrus-ud-dev.conllu

--2024-03-23 20:12:12--  https://github.com/UniversalDependencies/UD_Russian-SynTagRus/raw/master/ru_syntagrus-ud-dev.conllu
Resolving github.com (github.com)... 140.82.121.4
Connecting to github.com (github.com)|140.82.121.4|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/UniversalDependencies/UD_Russian-SynTagRus/master/ru_syntagrus-ud-dev.conllu [following]
--2024-03-23 20:12:12--  https://raw.githubusercontent.com/UniversalDependencies/UD_Russian-SynTagRus/master/ru_syntagrus-ud-dev.conllu
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 14704579 (14M) [text/plain]
Saving to: ‘ru_syntagrus-ud-dev.conllu.1’


2024-03-23 20:12:13 (234 MB/s) - ‘ru_syntagrus-ud-dev.conllu.1’ saved [14704579/1470

In [151]:
path = '/content/ru_syntagrus-ud-dev.conllu'


In [160]:
def conllu_to_dataframe(filepath):
    rows = []
    columns = ['ID', 'FORM', 'LEMMA', 'UPOS', 'XPOS', 'FEATS', 'HEAD', 'DEPREL', 'DEPS', 'MISC']

    with open(filepath, 'r', encoding='utf-8') as file:
        for line in file:
            if line.startswith('#'):
                continue
            elif line.strip() == "":
                if rows:
                    rows.append({col: None for col in columns})
            else:

                parts = line.strip().split('\t')
                if len(parts) == len(columns):
                    row = {col: part for col, part in zip(columns, parts)}
                    rows.append(row)

    df = pd.DataFrame(rows, columns=columns)

    return df

df = conllu_to_dataframe(path)
df

Unnamed: 0,ID,FORM,LEMMA,UPOS,XPOS,FEATS,HEAD,DEPREL,DEPS,MISC
0,1,Алгоритм,алгоритм,NOUN,_,Animacy=Inan|Case=Nom|Gender=Masc|Number=Sing,12,nsubj,12:nsubj,SpaceAfter=No
1,2,",",",",PUNCT,_,_,4,punct,4:punct,_
2,3,от,от,ADP,_,_,4,case,4:case,_
3,4,имени,имя,NOUN,_,Animacy=Inan|Case=Gen|Gender=Neut|Number=Sing,1,conj,1:conj|12:nsubj,_
4,5,учёного,ученый,NOUN,_,Animacy=Anim|Case=Gen|Gender=Masc|Number=Sing,4,nmod,4:nmod:gen,_
...,...,...,...,...,...,...,...,...,...,...
162491,12,-,-,PUNCT,_,_,11,punct,11:punct,_
162492,13,большая,большой,ADJ,_,Case=Nom|Degree=Pos|Gender=Fem|Number=Sing,14,amod,14:amod,_
162493,14,редкость,редкость,NOUN,_,Animacy=Inan|Case=Nom|Gender=Fem|Number=Sing,0,root,0:root,SpaceAfter=No
162494,15,.,.,PUNCT,_,_,14,punct,14:punct,_


In [161]:
df = df.dropna(axis='index') #убираем пустые строки

In [162]:
df

Unnamed: 0,ID,FORM,LEMMA,UPOS,XPOS,FEATS,HEAD,DEPREL,DEPS,MISC
0,1,Алгоритм,алгоритм,NOUN,_,Animacy=Inan|Case=Nom|Gender=Masc|Number=Sing,12,nsubj,12:nsubj,SpaceAfter=No
1,2,",",",",PUNCT,_,_,4,punct,4:punct,_
2,3,от,от,ADP,_,_,4,case,4:case,_
3,4,имени,имя,NOUN,_,Animacy=Inan|Case=Gen|Gender=Neut|Number=Sing,1,conj,1:conj|12:nsubj,_
4,5,учёного,ученый,NOUN,_,Animacy=Anim|Case=Gen|Gender=Masc|Number=Sing,4,nmod,4:nmod:gen,_
...,...,...,...,...,...,...,...,...,...,...
162490,11,внимание,внимание,NOUN,_,Animacy=Inan|Case=Nom|Gender=Neut|Number=Sing,14,nsubj,14:nsubj,_
162491,12,-,-,PUNCT,_,_,11,punct,11:punct,_
162492,13,большая,большой,ADJ,_,Case=Nom|Degree=Pos|Gender=Fem|Number=Sing,14,amod,14:amod,_
162493,14,редкость,редкость,NOUN,_,Animacy=Inan|Case=Nom|Gender=Fem|Number=Sing,0,root,0:root,SpaceAfter=No


In [163]:
targets = df['UPOS'].tolist()

In [168]:
def feats_to_dict(feats):
    if pd.isnull(feats) or feats == "_":
        return {}
    return dict(feature.split('=') for feature in feats.split('|') if '=' in feature)

def extract_features(df, window_size=2):
    features_list = []
    for sentence_id, sentence_df in df.groupby((df['ID'] == '1').cumsum()):
        sentence_df['FEATS_DICT'] = sentence_df['FEATS'].apply(feats_to_dict)

        for i, row in sentence_df.iterrows():
            features = {}
            features['morphological'] = row['FEATS_DICT']

            window_words = sentence_df['FORM'].iloc[max(i-window_size, 0):i].tolist() + \
                           sentence_df['FORM'].iloc[i+1:i+1+window_size].tolist()
            features['window_words'] = window_words

            features['syntax_head'] = row['HEAD']
            features['syntax_deprel'] = row['DEPREL']

            features_list.append(features)

    return features_list

features_list = extract_features(df)

[{'morphological': {'Animacy': 'Inan', 'Case': 'Nom', 'Gender': 'Masc', 'Number': 'Sing'}, 'window_words': [',', 'от'], 'syntax_head': '12', 'syntax_deprel': 'nsubj'}, {'morphological': {}, 'window_words': ['Алгоритм', 'от', 'имени'], 'syntax_head': '4', 'syntax_deprel': 'punct'}, {'morphological': {}, 'window_words': ['Алгоритм', ',', 'имени', 'учёного'], 'syntax_head': '4', 'syntax_deprel': 'case'}, {'morphological': {'Animacy': 'Inan', 'Case': 'Gen', 'Gender': 'Neut', 'Number': 'Sing'}, 'window_words': [',', 'от', 'учёного', 'аль'], 'syntax_head': '1', 'syntax_deprel': 'conj'}, {'morphological': {'Animacy': 'Anim', 'Case': 'Gen', 'Gender': 'Masc', 'Number': 'Sing'}, 'window_words': ['от', 'имени', 'аль', '-'], 'syntax_head': '4', 'syntax_deprel': 'nmod'}]


In [169]:
def flatten_features(features_list):
    flattened_features_list = []
    for features in features_list:
        flattened_features = {}
        # Морфологические признаки
        for key, value in features['morphological'].items():
            flattened_features[f'morph_{key}_{value}'] = 1
        # Преобразование слов в окне
        for i, word in enumerate(features['window_words']):
            flattened_features[f'window_word_{i}'] = word
        # Синтаксическая структура
        flattened_features['syntax_head'] = features['syntax_head']
        flattened_features['syntax_deprel'] = features['syntax_deprel']
        flattened_features_list.append(flattened_features)
    return flattened_features_list


flattened_features_list = flatten_features(features_list)

vectorizer = DictVectorizer(sparse=True)
features_vectorized = vectorizer.fit_transform(flattened_features_list)

In [None]:
targets[27:]

In [171]:
X_train, X_test, y_train, y_test = train_test_split(features_vectorized, targets, test_size=0.2, random_state=42)

In [174]:
model = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f'Точность модели: {accuracy}')
print(classification_report(y_test, y_pred))

Точность модели: 0.9610000651084055


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

         ADJ       1.00      1.00      1.00      3012
         ADP       0.99      0.98      0.98      2751
         ADV       1.00      0.99      0.99      1544
         AUX       1.00      1.00      1.00       290
       CCONJ       0.98      1.00      0.99      1124
         DET       1.00      0.97      0.98       864
        INTJ       0.00      0.00      0.00         4
        NOUN       0.92      0.97      0.94      7210
         NUM       0.99      0.93      0.96       349
        PART       0.90      0.99      0.94      1080
        PRON       0.97      0.92      0.94      1499
       PROPN       0.63      0.48      0.55      1073
       PUNCT       1.00      1.00      1.00      5817
       SCONJ       0.99      0.86      0.92       593
         SYM       0.33      0.08      0.13        12
        VERB       1.00      1.00      1.00      3409
           X       0.40      0.12      0.19        32
           _       1.00    

  _warn_prf(average, modifier, msg_start, len(result))
