In [2]:
import pandas as pd
import numpy as np
import re
from collections import Counter

In [3]:
with open('task2_lemmas_train') as train_file:
    Id = np.arange(1, 118641)
    X = np.array([None] * 118640)
    y = np.array([None] * 118640)
    n = np.array([None] * 118640)
    bad = 0
    for ind, line in enumerate(train_file):
        if ind == 0:
            continue
        line = line.strip()
        l = re.split('[,+]', line)
        X[ind - 1] = l[1]
        y[ind - 1] = l[2]
        n[ind - 1] = l[3]
        if len(l) >= 5:
            bad += 1

In [4]:
train = pd.DataFrame({'Id': Id, 'X': X, 'y': y, 'n': n})

In [5]:
train.head()

Unnamed: 0,Id,X,n,y
0,1,vergognerete,V,vergognare
1,2,amnistiavate,V,amnistiare
2,3,menomazione,N,menomazione
3,4,sfaldavamo,V,sfaldare
4,5,sfodererei,V,sfoderare


In [6]:
test = pd.read_csv('task2_lemmas_test')

In [7]:
submission = pd.read_csv('task2_lemmas_sample_submission')

In [8]:
submission.head()

Unnamed: 0,Id,Category
0,1,recidivare+V
1,2,recidivare+V
2,3,recidivare+V
3,4,recidivare+V
4,5,recidivare+V


In [9]:
def common_prefix(w):
    i = 0
    min_len = min(len(w[0]), len(w[1]))
    while i < min_len and w[0][i] == w[1][i]:
        i += 1
    return w[0][:i]

def ending(w):
    i = 0
    min_len = min(len(w[0]), len(w[1]))
    while i < min_len and w[0][i] == w[1][i]:
        i += 1
    return w[0][i:]

In [11]:
train['common'] = list(map(common_prefix, np.array(train[['X', 'y']]))) 

In [12]:
train['ending'] = list(map(ending, np.array(train[['X', 'y']])))
train['true_ending'] = list(map(ending, np.array(train[['y', 'X']])))

In [13]:
verb_endings = list(dict(Counter(train.loc[train['n'] == 'V']['ending']).most_common()).keys())

In [14]:
max_len_of_end = len(max(verb_endings, key=len))

# Существительные

In [15]:
sum(train['n'] == 'N') / train['n'].size

0.080984490896830755

# Прилагательные

In [16]:
sum(train['n'] == 'A') / train['n'].size

0.11097437626432906

### Видим, что глаголов больше всего. Поэтому будет определять
1. Глагол \ не глагол
2. Прилагательное \ не прилагательное

In [17]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

НА ВСЕЙ ВЫБОРКЕ

In [19]:
all_data = pd.concat([train['X'], test['X']])

In [20]:
vectorizer = CountVectorizer(ngram_range=(1,9), min_df=10, analyzer='char_wb')
sparse_matrix = vectorizer.fit_transform(all_data)
sparse_matrix

<148301x91171 sparse matrix of type '<class 'numpy.int64'>'
	with 9110064 stored elements in Compressed Sparse Row format>

In [21]:
X_train = sparse_matrix[:train.shape[0]]
y_train = train['n']
X_test = sparse_matrix[train.shape[0]:]

In [22]:
verb_model = LogisticRegression()
verb_model.fit(X_train, y_train)
verb_pred = verb_model.predict(X_test)

In [23]:
train['len_of_false_ending'] = list(map(len, train['ending']))

Построим модели для каждой части речи

In [24]:
verbs_X_test = X_test[verb_pred == 'V']
nouns_X_test = X_test[verb_pred == 'N']
adv_X_test = X_test[verb_pred == 'A']

In [25]:
verbs_X_train = X_train[np.array(train['n'] == 'V')]
verbs_y_cut_train = train.loc[train['n'] == 'V']['len_of_false_ending']
verbs_y_end_train = train.loc[train['n'] == 'V']['true_ending']
noun_X_train = X_train[np.array(train['n'] == 'N')]
noun_y_cut_train = train.loc[train['n'] == 'N']['len_of_false_ending']
noun_y_end_train = train.loc[train['n'] == 'N']['true_ending']
adv_X_train = X_train[np.array(train['n'] == 'A')]
adv_y_cut_train = train.loc[train['n'] == 'A']['len_of_false_ending']
adv_y_end_train = train.loc[train['n'] == 'A']['true_ending']

### Предскажем  глаголы

In [26]:
verb_cut_model = LogisticRegression()
verb_cut_model.fit(verbs_X_train, verbs_y_cut_train)
verb_cut_pred = verb_cut_model.predict(verbs_X_test)

In [27]:
verb_add_model = LogisticRegression()
verb_add_model.fit(verbs_X_train, verbs_y_end_train)
verb_add_pred = verb_add_model.predict(verbs_X_test)

Существительные

In [30]:
noun_cut_model = LogisticRegression()
noun_cut_model.fit(noun_X_train, noun_y_cut_train)
noun_cut_pred = noun_cut_model.predict(nouns_X_test)

In [32]:
noun_add_model = LogisticRegression()
noun_add_model.fit(noun_X_train, noun_y_end_train)
noun_add_pred = noun_add_model.predict(nouns_X_test)

Прилагательные

In [33]:
adv_cut_model = LogisticRegression()
adv_cut_model.fit(adv_X_train, adv_y_cut_train)
adv_cut_pred = adv_cut_model.predict(adv_X_test)

In [34]:
adv_add_model = LogisticRegression()
adv_add_model.fit(adv_X_train, adv_y_end_train)
adv_add_pred = adv_add_model.predict(adv_X_test)

Предскажем часть слова, которую нужно отрезать

In [54]:
def f(cut, add, words, ch):
    ans = [w[:-1*c] + a + '+' + ch for c, a, w in zip(cut, add, words)]
    return ans

In [47]:
verb_cut_pred.shape

(24282,)

In [49]:
verb_add_pred.shape

(24282,)

In [50]:
test.loc[verb_pred == 'V']['X'].shape

(24282,)

In [51]:
submission.loc[verb_pred == 'V'].shape

(24282, 2)

In [55]:
submission.loc[verb_pred == 'V', 'Category'] = f(verb_cut_pred, verb_add_pred, test.loc[verb_pred == 'V']['X'], 'V')

In [58]:
submission.loc[verb_pred == 'N', 'Category'] = f(noun_cut_pred, noun_add_pred, test.loc[verb_pred == 'N']['X'], 'N')

In [59]:
submission.loc[verb_pred == 'A', 'Category'] = f(adv_cut_pred, adv_add_pred, test.loc[verb_pred == 'A']['X'], 'A')

In [60]:
submission

Unnamed: 0,Id,Category
0,1,gettonare+V
1,2,incidentale+A
2,3,involtare+V
3,4,lievo+N
4,5,comunistizzare+V
5,6,vidimare+V
6,7,imbrodre+V
7,8,e+V
8,9,cifrare+V
9,10,compassare+V


In [61]:
submission.to_csv("contest2.tsv", sep=',', index=False)