# Гаркавый Андрей, 494 группа

Задание — пройти контест на Kaggle: для каждого слова восстановить начальную форму (лемму) и часть речи.

In [1]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.model_selection import cross_val_score
from scipy.sparse import lil_matrix
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## 1) Считывание и предобработка

In [2]:
# Load the Kaggle test words (columns Id and X) and the sample submission file.
test = pd.read_csv("grammar_data/task2_lemmas_test.csv")
sample_submission = pd.read_csv("grammar_data/task2_lemmas_sample_submission.csv")

In [3]:
# Preview the first rows of the test set.
test.head()

Unnamed: 0,Id,X
0,1,gettonan
1,2,incidentali
2,3,involtino
3,4,lievi
4,5,comunistizzasse


In [4]:
# The training CSV is parsed by hand because a row may carry several
# "lemma+POS" analyses after the word (e.g. "provata" -> provare+V, provato+A),
# so the number of comma-separated fields varies from row to row.
# The file is read inside a context manager so the handle is always closed.
with open("grammar_data/task2_lemmas_train.csv", "r") as train_file:
    raw_data = train_file.readlines()

train_data = []
for line in raw_data[1:]:  # raw_data[0] is the header row
    fields = line.strip().split(',')[1:]  # drop the leading Id field
    # Slices (not indexing) keep blank lines harmless: both parts are empty.
    word, analyses = fields[:1], fields[1:]
    # Emit one [word, lemma, POS] row per analysis.
    train_data.extend(word + analysis.split('+') for analysis in analyses)
train = pd.DataFrame(train_data, columns=['X', 'y1', 'y2'])

In [5]:
# Inspect a slice of the parsed training rows; a word with several analyses
# (e.g. "provata") appears once per analysis.
train[95:105]

Unnamed: 0,X,y1,y2
95,sottostarebbero,sottostare,V
96,completeresti,completare,V
97,criocongelavano,criocongelare,V
98,assalir,assalire,V
99,spoliticizzin,spoliticizzare,V
100,provata,provare,V
101,provata,provato,A
102,zamperemmo,zampare,V
103,incerasti,incerare,V
104,fraintendano,fraintendere,V


## 2) Подбираем модель для восстановления части речи

In [6]:
def cross_val_accuracy(classifier, data, target, cv=5):
    """Return the mean cross-validation score of ``classifier``.

    Args:
        classifier: Estimator passed straight to ``cross_val_score``.
        data: Feature matrix.
        target: Labels aligned with the rows of ``data``.
        cv: Cross-validation setting forwarded to ``cross_val_score``;
            defaults to 5, the value that used to be hard-coded.

    Returns:
        The arithmetic mean of the per-fold scores.
    """
    fold_scores = cross_val_score(classifier, data, target, cv=cv)
    return np.mean(fold_scores)

In [7]:
# Fraction of the training rows to use; 1.00 keeps every row, so sample() only shuffles them.
frac = 1.00
# A fixed random_state makes the sampling reproducible.
train_part = train.sample(frac=frac, random_state=42)

In [42]:
# Part-of-speech classifier: logistic regression with C=50.
model2 = LogisticRegression(C=50, n_jobs=2)
# Hashes character n-grams of length 1 to 6 ('char_wb' analyzer) into 2**22 feature columns.
hv2 = HashingVectorizer(n_features=2 ** 22, analyzer='char_wb', ngram_range=(1,6))

In [43]:
# Hash the training words into sparse features, stored in LIL format
# (presumably because columns of a copy of this matrix are assigned to later).
X2_train = lil_matrix(hv2.fit_transform(train_part['X']))
# The target for this model is the part-of-speech column.
y2_train = train_part['y2']

In [None]:
# Estimate the part-of-speech model's quality with cross-validation.
cross_val_accuracy(model2, X2_train, y2_train)

## 3) Восстанавливаем части речи

In [44]:
# Vectorize the test words with transform(), not fit_transform(): nothing may
# be fitted on test data. The output is the same for the stateless hashing
# vectorizer, but this stays correct if hv2 is ever swapped for a fitted one.
X2_test = lil_matrix(hv2.transform(test['X']))

In [45]:
# Train the part-of-speech model on the whole training sample.
model2.fit(X2_train, y2_train)

LogisticRegression(C=50, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=2,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [46]:
# Predict a part-of-speech tag for every test word.
y2_test = model2.predict(X2_test)

## 4) Подбираем модель для восстановления начальной формы

In [47]:
# Features for the lemma model: a copy of the hashed n-gram matrix whose first
# three columns are overwritten with 0/1 indicators of the part of speech
# (column order: V, N, A), one row per training word.
X1_train = X2_train.copy()
X1_train[:, 0:3] = lil_matrix(
    [[int(tag == pos) for pos in ['V', 'N', 'A']] for tag in y2_train]
)

In [48]:
def mcp(x, y):
    """Return the length of the longest common prefix of ``x`` and ``y``."""
    for position, (left, right) in enumerate(zip(x, y)):
        if left != right:
            # First mismatch: everything before it is the shared prefix.
            return position
    # No mismatch found: the shorter sequence is entirely a prefix of the other.
    return min(len(x), len(y))

def get_new_target(x, y):
    """Encode the edit that turns word ``x`` into lemma ``y`` as a label.

    The label has the form "<k> <tail>": ``k`` is the number of characters of
    ``x`` that follow the common prefix of ``x`` and ``y``, and ``tail`` is the
    part of ``y`` after that prefix (it may be empty).
    """
    shared = mcp(x, y)
    chars_to_drop = len(x) - shared
    lemma_tail = y[shared:]
    return '{} {}'.format(chars_to_drop, lemma_tail)

# Encode every (word, lemma) pair as an edit label "<chars to drop> <suffix to add>".
# NOTE(review): .T on this 1-D array of strings appears to be a no-op.
y1_train = np.array([get_new_target(x, y) for x, y in zip(train_part['X'], train_part['y1'])]).T

In [51]:
# Lemma model: a random forest with 10 trees.
# NOTE(review): no random_state is set, so results may differ between runs.
model1 = RandomForestClassifier(n_estimators=10, n_jobs=2)

In [27]:
# Estimate the lemma model's quality on the edit labels with cross-validation.
cross_val_accuracy(model1, X1_train, y1_train)



0.77560515429505994

## 5) Восстанавливаем начальные формы

In [50]:
# Test features for the lemma model: a copy of the hashed n-gram matrix whose
# first three columns are overwritten with 0/1 indicators of the *predicted*
# part of speech (column order: V, N, A), one row per test word.
X1_test = X2_test.copy()
X1_test[:, 0:3] = lil_matrix(
    [[int(tag == pos) for pos in ['V', 'N', 'A']] for tag in y2_test]
)

In [None]:
# Train the lemma model on the whole training sample.
model1.fit(X1_train, y1_train)

In [20]:
# Predict an edit label ("<chars to drop> <suffix to add>") for every test word.
y1_test_raw = model1.predict(X1_test)

In [24]:
def return_old_target(x, y):
    """Apply an edit label to a word and return the resulting lemma.

    Args:
        x: The word form to transform.
        y: An edit label "<k> <suffix>": drop the last ``k`` characters of
            ``x``, then append ``suffix``. The suffix may be missing
            (e.g. "3 " or "3"), which means nothing is appended.

    Returns:
        ``x`` without its last ``k`` characters, followed by ``suffix``.

    Raises:
        ValueError: If ``y`` is empty or its first token is not an integer.
    """
    # Split off only the count, so a suffix containing whitespace stays intact.
    parts = y.split(None, 1)
    if not parts:
        raise ValueError("empty edit label: {!r}".format(y))
    chars_to_drop = int(parts[0])
    suffix = parts[1].rstrip() if len(parts) == 2 else ''
    # A predicted count may exceed the word length. Clamp at zero so the slice
    # bound never goes negative, which would keep the wrong characters
    # instead of dropping the whole word.
    keep = max(len(x) - chars_to_drop, 0)
    return x[:keep] + suffix

# Apply each predicted edit label to its test word to obtain the lemma.
y1_test = np.array([return_old_target(x, y) for x, y in zip(test['X'], y1_test_raw)]).T

In [37]:
# Append '+' to every lemma so it can be joined with the tag as "lemma+POS".
y1_test_plus = np.array([y + '+' for y in y1_test])

In [39]:
# Build the submission table: Id comes from the test set, Category is "lemma+POS".
ans = pd.DataFrame()
ans['Id'] = test['Id']
# Element-wise concatenation of the "lemma+" strings with the predicted tags.
ans['Category'] = y1_test_plus + y2_test
ans.head(5)

Unnamed: 0,Id,Category
0,1,gettonare+V
1,2,incidentali+A
2,3,involtare+V
3,4,lievo+N
4,5,comunistizzre+V


In [41]:
# Write the submission as a comma-separated file without the DataFrame index column.
ans.to_csv("garkavyy_grammar_logistic_regression.csv", sep=',', index=False)