In [8]:
import numpy as np
import matplotlib.pyplot as pyplot
import scipy.stats as sps
from sklearn.cross_validation import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
import pandas as pd
import scipy.sparse as sprs
from sklearn.feature_extraction.text import CountVectorizer
%matplotlib inline

In [9]:
# Form tags in label order: a tag's position in this list is its integer id.
# (Presumably V/N/A stand for verb/noun/adjective -- confirm against the data.)
FORMS = ['V','N','A']


def fftransform(y):
    """Return the integer id (index in FORMS) of the form tag `y`.

    Raises:
        ValueError: if `y` is not one of FORMS. Previously an unknown tag
            silently produced None, which only failed later when the value
            was used as a list index or as a training label.
    """
    try:
        return FORMS.index(y)
    except ValueError:
        raise ValueError("unknown form tag %r, expected one of %r" % (y, FORMS))


def ffrevesetransform(y):
    """Inverse of fftransform: return the tag stored at index `y` of FORMS."""
    return FORMS[y]

In [10]:
# Parallel lists, one entry per (word form, answer) pair:
#   train_x  -- the word form (second comma-separated field of the line)
#   train_y1 -- the part of the answer before '+'
#   train_y2 -- the integer id of the tag after '+', via fftransform
train_x = []
train_y1 = []
train_y2 = []
# `with` closes the file even if a malformed line raises while parsing.
with open("task2_lemmas_train",'r') as train_file:
    for line in train_file:
        # Strip only the line terminator. Slicing off the last character
        # unconditionally would eat a real character when the final line
        # has no trailing newline.
        line = line.rstrip('\r\n')
        if not line:
            # Blank lines carry no data; skipping avoids an IndexError below.
            continue
        words = line.split(',')
        # words[0] is ignored (presumably a row id -- confirm against the file).
        x = words[1]
        y_arr = words[2:]
        # A form may list several answers; each becomes its own training row.
        for y in y_arr:
            answer_parts = y.split('+')
            train_x.append(x)
            train_y1.append(answer_parts[0])
            train_y2.append(fftransform(answer_parts[1]))
test_data = list((pd.read_csv("task2_lemmas_test"))['X'])

## TRAIN

In [11]:
# Count-based character n-gram features (analyzer 'char_wb', n from 2 to 10).
# N-grams seen in fewer than 2 training words or in more than 90% of them
# are dropped (min_df / max_df).
vectorizer = CountVectorizer(
    analyzer='char_wb',
    ngram_range=(2,10),
    min_df=2,
    max_df=.9,
    max_features=None,
    lowercase=True,
    binary=False,
)
# Learn the vocabulary and build the training matrix in a single pass.
features = vectorizer.fit_transform(train_x)

In [12]:
%%time
# Classifier for the form type of a word (labels are the integer ids stored
# in train_y2), trained on the character n-gram features.
regressor_type = LogisticRegression(
    C=20,
    penalty='l2',
    multi_class='ovr',
    class_weight='balanced',
    max_iter=800,
    n_jobs=4,
)
regressor_type.fit(features,train_y2)

CPU times: user 4min 12s, sys: 2.57 s, total: 4min 14s
Wall time: 2min 9s


In [17]:
# %%time 
# print(cross_val_score(regressor_type,features,train_y2,scoring='accuracy',n_jobs=4))

[ 0.95784119  0.95736867  0.95930319]
CPU times: user 692 ms, sys: 202 ms, total: 895 ms
Wall time: 1min 37s


In [13]:
def build_train_classes(X,Y):
    """Encode each (form, target) pair as an edit rule "<n>,<ending>".

    For every index i, the longest common prefix of X[i] and Y[i] is found;
    <n> is the number of characters of X[i] that follow that prefix and
    <ending> is the part of Y[i] that follows it.

    Args:
        X: indexable sequence of word forms (strings).
        Y: indexable sequence of target strings, at least as long as X.

    Returns:
        A list of rule strings, one per element of X.
    """
    rules = []
    for idx, form in enumerate(X):
        target = Y[idx]
        # Length of the shared prefix of the two strings.
        shared = 0
        for form_char, target_char in zip(form, target):
            if form_char != target_char:
                break
            shared += 1
        tail_length = len(form) - shared
        ending = target[shared:]
        rules.append(','.join((str(tail_length), ending)))
    return rules

In [14]:
def split_into_form_clusters(X,Y,is_need_ids=False,n_clusters=3):
    """Group the items of X into buckets selected by the labels in Y.

    Args:
        X: indexable sequence of items.
        Y: indexable sequence of integer labels; Y[i] is the bucket of X[i]
            and must be a valid index into a list of `n_clusters` buckets.
        is_need_ids: when true, also return the original positions of the
            items placed in each bucket.
        n_clusters: number of buckets to create. Defaults to 3, the value
            that used to be hard-coded.

    Returns:
        A list of `n_clusters` lists of items, or, when `is_need_ids` is
        true, a tuple (items_per_bucket, positions_per_bucket).
    """
    xclasses = [[] for _ in range(n_clusters)]
    ids = [[] for _ in range(n_clusters)]
    for i in range(len(X)):
        label = Y[i]
        xclasses[label].append(X[i])
        if is_need_ids:
            ids[label].append(i)
    if is_need_ids:
        return xclasses,ids
    return xclasses

In [15]:
# Bucket the training forms (with their positions) and the matching targets
# by form type, so that each type gets its own training set.
xclasses_train, ids_train = split_into_form_clusters(train_x, train_y2, is_need_ids=True)
yclasses_train = split_into_form_clusters(train_y1, train_y2)
# Per-type feature matrices, built with the already fitted vectorizer.
featuresclasses_train = [vectorizer.transform(forms) for forms in xclasses_train]
# Per-type edit-rule labels produced by build_train_classes.
classes_train = [
    build_train_classes(forms, targets)
    for forms, targets in zip(xclasses_train, yclasses_train)
]

In [16]:
%%time
# One edit-rule classifier per form type (C=3); list index == form type id.
regressor_form = [
    LogisticRegression(
        C=3,
        penalty='l2',
        multi_class='ovr',
        class_weight='balanced',
        max_iter=800,
        n_jobs=4,
    )
    for _ in range(3)
]
# Fit each classifier on the features and rule labels of its own form type.
for form_model, form_features, form_rules in zip(
        regressor_form, featuresclasses_train, classes_train):
    form_model.fit(form_features, form_rules)

CPU times: user 46min 25s, sys: 36.1 s, total: 47min 1s
Wall time: 23min 49s


In [17]:
%%time
# Same per-form-type classifiers as regressor_form, but with C=20.
# NOTE(review): regressor_form2 is not referenced by any other cell visible
# in this notebook -- confirm whether this variant is still needed.
regressor_form2 = [
    LogisticRegression(
        C=20,
        penalty='l2',
        multi_class='ovr',
        class_weight='balanced',
        max_iter=800,
        n_jobs=4,
    )
    for _ in range(3)
]
for form_model, form_features, form_rules in zip(
        regressor_form2, featuresclasses_train, classes_train):
    form_model.fit(form_features, form_rules)

CPU times: user 51min 41s, sys: 36.2 s, total: 52min 17s
Wall time: 26min 23s


## TEST

In [19]:
# Vectorize the test words with the vocabulary learned on the training set.
features_test = vectorizer.transform(test_data)
# Most likely form type id for every test word.
types = regressor_type.predict(features_test)
# Per-type probabilities for every test word. The probability-based selection
# cells below read `type_probs`, so it has to be computed here; leaving this
# line commented out made those cells fail with NameError on a fresh kernel.
type_probs = regressor_type.predict_proba(features_test)

In [113]:
# Rule probabilities for every test word under each form type's classifier;
# form_prediction_probs[j] comes from regressor_form[j].
form_prediction_probs = [
    form_model.predict_proba(features_test) for form_model in regressor_form
]

In [114]:
# For every test word i and form type j:
#   classes_probs[i][j] -- highest rule probability under type j multiplied
#                          by type_probs[i][j]
#   classes_ids[i][j]   -- column index of that highest rule probability
classes_probs = []
classes_ids = []
for i in range(len(test_data)):
    word_scores = []
    word_best_columns = []
    for j in range(3):
        rule_probs = form_prediction_probs[j][i]
        word_scores.append(np.max(rule_probs) * type_probs[i][j])
        word_best_columns.append(np.argmax(rule_probs))
    classes_probs.append(word_scores)
    classes_ids.append(word_best_columns)


In [115]:
# A (form type, rule) pair is kept for a word when its joint score
# (type probability * best rule probability) exceeds this threshold.
MIN_JOINT_PROB = 0.3

# chosen_classes[i] is a list of [form_type_id, rule] pairs for test word i.
chosen_classes = []
for i in range(len(test_data)):
    probs = classes_probs[i]
    picks = []
    for j in range(3):
        if probs[j] > MIN_JOINT_PROB:
            # classes_ids holds column indices of predict_proba, and those
            # columns are ordered like the estimator's `classes_` attribute.
            # Indexing classes_train[j] (the per-sample training labels) with
            # that column index returned an unrelated rule.
            rule = regressor_form[j].classes_[classes_ids[i][j]]
            picks.append([j, rule])
    if not picks:
        # Nothing passed the threshold: fall back to the most probable form
        # type and that type's best rule. (np.argmax over the scalar joint
        # score always returned 0, which is not a rule index.)
        type_id = np.argmax(type_probs[i])
        rule = regressor_form[type_id].classes_[classes_ids[i][type_id]]
        picks.append([type_id, rule])
    chosen_classes.append(picks)

In [88]:
# Number of test words that received more than one candidate answer.
print(sum(1 for picks in chosen_classes if len(picks) > 1))

672


In [89]:
# One comma-separated answer string per test word, in test_data order.
# NOTE: build_ans_word is defined in a cell further down this notebook; that
# cell has to be executed before this one.
answer = [
    ','.join(
        build_ans_word(test_data[i], pick[1], pick[0])
        for pick in chosen_classes[i]
    )
    for i in range(len(test_data))
]

### clusters classification

In [20]:
# Bucket the test words by predicted form type, remembering each word's
# original position so the answers can be matched back to ids later.
test_xclusters, test_ids = split_into_form_clusters(test_data, types, is_need_ids=True)
test_featuresclasses = [vectorizer.transform(words) for words in test_xclusters]
# Predict an edit rule for every word with the classifier of its form type.
form_prediction_classes = [
    form_model.predict(cluster_features)
    for form_model, cluster_features in zip(regressor_form, test_featuresclasses)
]

In [21]:
def build_ans_word(w,res,ftype):
    """Apply an edit rule to a word and append its form tag.

    Args:
        w: the word form (string).
        res: rule string "<n>,<ending>": drop the last <n> characters of `w`
            and append <ending>.
        ftype: integer form type id, converted to a tag by ffrevesetransform.

    Returns:
        "<stem><ending>+<tag>".

    Raises:
        IndexError: if `res` contains no comma.
        ValueError: if the part before the comma is not an integer.
    """
    # Split on the first comma only, so an ending that itself contains a
    # comma is kept whole.
    a = res.split(',', 1)
    toslice = int(a[0])
    toappend = a[1]
    # Clamp at 0: when the rule removes more characters than the word has,
    # a negative slice bound would wrap around and keep part of the word
    # instead of producing an empty stem.
    keep = max(0, len(w) - toslice)
    return w[0:keep] + toappend + '+' + ffrevesetransform(ftype)

In [22]:
def build_answer(X,res,ftype):
    """Apply the edit rules in `res` to the words in `X`, position by position.

    Delegates to build_ans_word so the rule-application logic exists in one
    place only.

    Args:
        X: indexable sequence of words, at least as long as `res`.
        res: indexable sequence of rule strings "<n>,<ending>".
        ftype: integer form type id shared by all words in `X`.

    Returns:
        A list with one "<word>+<tag>" string per element of `res`.
    """
    return [build_ans_word(X[i], res[i], ftype) for i in range(len(res))]

In [23]:
# results[k] holds the answers for the test words predicted as form type k.
results = []
for form_id in range(3):
    results.append(
        build_answer(test_xclusters[form_id], form_prediction_classes[form_id], form_id)
    )

In [24]:
# Flatten the per-type answers into one list, cluster after cluster, and
# record the 1-based test position of every answer alongside it.
answer = []
ans_ids = []
for cluster_answers, cluster_positions in zip(results, test_ids):
    for j, cluster_answer in enumerate(cluster_answers):
        answer.append(cluster_answer)
        ans_ids.append(cluster_positions[j] + 1)

## OUT

In [91]:
# Write one "<id>,<answer>" line per entry of `answer`, with ids 1..len(answer).
# NOTE(review): the ids written here are list positions, so `answer` must be
# in test_data order. Another cell of this notebook rebuilds `answer` in
# cluster order -- confirm which version is in scope before running this.
# `with` guarantees the file is flushed and closed even if a write fails.
with open("submission_hand.txt","w") as f:
    for i in range(len(answer)):
        f.write(str(i+1) + ',' + answer[i] + '\n')

In [25]:
# Use the sample submission file as a template for the output frame.
# NOTE(review): assumes the template has as many rows as `answer` -- confirm.
submission = pd.read_csv("task2_lemmas_sample_submission")
# ans_ids holds the 1-based test positions collected together with `answer`,
# so each row's Id matches its answer even though rows are in cluster order.
submission['Id'] = ans_ids
# Alternative for an `answer` list that is already in test order:
# submission['Id'] = np.arange(1,len(answer)+1)
submission['Category'] = answer
# index=False keeps the DataFrame index out of the written file.
submission.to_csv("submission.txt", sep=',', index=False)
# Preview the first rows as the cell output.
submission.head(10)

Unnamed: 0,Id,Category
0,1,gettonare+V
1,3,involtare+V
2,5,comunistizzare+V
3,6,vidimare+V
4,7,imbrodare+V
5,8,strillare+V
6,9,cifrare+V
7,10,compassare+V
8,11,cuciare+V
9,12,snobbare+V


## RESEARCH

In [152]:
# How many test words never occur among the training forms?
print(len(train_x))
print(len(test_data))
source = set(train_x)
cnt = sum(1 for word in test_data if word not in source)
cnt

120897
29661


29661

In [3]:
# 'Category' columns of a reference submission and of the file written by
# this notebook, as plain lists for comparison.
reference_frame = pd.read_csv("submission_test.txt")
current_frame = pd.read_csv("submission.txt")
test_subm = list(reference_frame['Category'])
new_subm = list(current_frame['Category'])

In [7]:
# Share of positions where the two submissions contain the same answer.
# The comparison is positional, so both files must list rows in the same order.
accuracy_score(y_true=test_subm, y_pred=new_subm)

0.99345942483395699