In [35]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from sklearn.linear_model import SGDClassifier, RidgeClassifier, LogisticRegression
from sklearn.model_selection import ShuffleSplit, cross_val_score, KFold, train_test_split
from scipy import sparse
import re
import time
%matplotlib inline

In [262]:
import warnings
# Silences every warning category for the rest of the session. Note that this
# also hides pandas' SettingWithCopyWarning, which the per-class subset cells
# further down emit when they add columns to a filtered frame.
warnings.filterwarnings("ignore")

In [20]:
%%time
row = 0
first = True
Id = []
X = []
lemmas = []
y = []
train = pd.DataFrame(columns=['Id', 'X', 'y'])
with open('task2_lemmas_train', encoding='utf-8') as f:
    for line in f:
        if (row > 0):
            ws = line.split(',')
            Id.append(ws[0])
            X.append(ws[1])
            lemmas.append(ws[2].split('+')[0])
            y.append(ws[2].split('+')[1][0])
            if (len(ws) > 3):
                Id.append(ws[0])
                X.append(ws[1])
                lemmas.append(ws[3].split('+')[0])
                y.append(ws[3].split('+')[1][0])
        row += 1

Wall time: 269 ms


In [21]:
# Assemble the parsed lists into one frame, one row per (word, analysis) pair.
# Building from a dict of columns avoids creating a 4 x N frame and transposing it.
train = pd.DataFrame({'Id': Id, 'X': X, 'lemmas': lemmas, 'y': y},
                     columns=['Id', 'X', 'lemmas', 'y'])
# Show only the first rows rather than the whole frame.
train.head(10)

Unnamed: 0,Id,X,lemmas,y
0,1,vergognerete,vergognare,V
1,2,amnistiavate,amnistiare,V
2,3,menomazione,menomazione,N
3,4,sfaldavamo,sfaldare,V
4,5,sfodererei,sfoderare,V
5,6,ascondesti,ascondere,V
6,7,edifichereste,edificare,V
7,8,maschieran,maschiare,V
8,9,transennasser,transennare,V
9,10,computando,computare,V


In [17]:
# Load the test words. Later cells read its 'Id' and 'X' columns and add the
# predicted 'y' (class) and 'lemma' columns to this frame.
test = pd.read_csv('task2_lemmas_test')

## predict class

In [225]:
from sklearn.feature_extraction.text import CountVectorizer
# Character n-gram features for the class predictor: n-grams of 3 to 5
# characters taken inside word boundaries ('char_wb'), recorded as binary
# presence flags. n-grams seen in fewer than 5 words or in more than 70% of
# the words are dropped; case is preserved (lowercase=False).
vectorizer = CountVectorizer(min_df=5, max_df=.7,
                             max_features=None,
                             ngram_range=(3, 5),
                             lowercase=False,
                             analyzer='char_wb', 
                             binary=True)
vectorizer.fit(train['X'])
# NOTE: `X` is rebound here, from the list of raw words filled by the loading
# cell to the feature matrix of the training words.
X = vectorizer.transform(train['X'])
X_test = vectorizer.transform(test['X'])
X.shape

(120803, 37412)

In [226]:
# One-vs-rest logistic regression with balanced class weights.
clf = LogisticRegression(multi_class='ovr', class_weight='balanced')
clf.fit(X, train['y'])
# Accuracy on the same rows the model was fitted on, so this is an optimistic
# estimate rather than a generalisation score.
accuracy_score(train['y'], clf.predict(X))

0.97552213107290386

In [37]:
# Hold out 40% of the training rows so the next cell can score on unseen data.
# Without this split X_train / y_train / y_test do not exist on a fresh kernel.
# random_state makes the reported accuracy reproducible.
# NOTE: this rebinds X_test (until now the test-set features); the
# "final solution" section recomputes X_test before it predicts.
X_train, X_test, y_train, y_test = train_test_split(X, train['y'], test_size=.4, random_state=0)

In [38]:
# Hold-out check of the class predictor: fit on the training part and score on
# the held-out part. Expects X_train, y_train, X_test and y_test as produced by
# train_test_split. No class weighting is used here.
clf = LogisticRegression(multi_class='ovr')
clf.fit(X_train, y_train)
accuracy_score(y_test, clf.predict(X_test))

0.94120690368776128

## predict lemma

### Build per-class training subsets (train_V, train_N, train_A) with suffix columns

In [127]:
def shorten_row(df, ending_len):
    """Cut a (word, lemma) row down to its endings.

    Parameters
    ----------
    df : pandas.Series
        A row whose first element is the word form and whose second element
        is its lemma (read by position, not by label).
    ending_len : int
        Number of trailing characters of the word to keep.

    Returns
    -------
    pandas.Series
        A copy of the row, same index, with the first two elements replaced by
        the lower-cased endings.

    Both strings are sliced from the same start offset, namely the offset that
    leaves `ending_len` characters of the word. The lemma ending is therefore
    whatever is left of the lemma after dropping a prefix of that length. A
    word shorter than `ending_len` is kept whole, and so is its lemma.
    """
    word = df.iloc[0]
    lemma = df.iloc[1]
    i_first = max(0, len(word) - ending_len)
    # Work on a copy: pandas does not support apply() callbacks that mutate the
    # object they are given. .iloc is used because a plain integer key on a
    # label-indexed Series is a deprecated positional fallback.
    shortened = df.copy()
    shortened.iloc[0] = word[i_first:].lower()
    shortened.iloc[1] = lemma[i_first:].lower()
    return shortened

In [196]:
%%time
train_V = train[train['y'] == 'V']
for ending_len in range(3, 10):
    train_V[['X_suf_' + str(ending_len), 'lemma_suf_' + str(ending_len)]] \
        = train_V[['X', 'lemmas']].apply(lambda row: shorten_row(row, ending_len), axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[k1] = value[k2]


Wall time: 1min 45s


In [213]:
%%time
train_N = train[train['y'] == 'N']
for ending_len in range(3, 10):
    train_N[['X_suf_' + str(ending_len), 'lemma_suf_' + str(ending_len)]] \
        = train_N[['X', 'lemmas']].apply(lambda row: shorten_row(row, ending_len), axis=1)
print(train_N.shape)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[k1] = value[k2]


(10240, 18)
Wall time: 11.3 s


In [214]:
%%time
train_A = train[train['y'] == 'A']
for ending_len in range(3, 10):
    train_A[['X_suf_' + str(ending_len), 'lemma_suf_' + str(ending_len)]] \
        = train_A[['X', 'lemmas']].apply(lambda row: shorten_row(row, ending_len), axis=1)
print(train_A.shape)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[k1] = value[k2]


(13942, 18)
Wall time: 14.9 s


In [215]:
# Spot-check the X_suf_* / lemma_suf_* columns on the adjective subset.
train_A.head()

Unnamed: 0,Id,X,lemmas,y,X_suf_3,lemma_suf_3,X_suf_4,lemma_suf_4,X_suf_5,lemma_suf_5,X_suf_6,lemma_suf_6,X_suf_7,lemma_suf_7,X_suf_8,lemma_suf_8,X_suf_9,lemma_suf_9
37,38,balenieri,baleniero,A,eri,ero,ieri,iero,nieri,niero,enieri,eniero,lenieri,leniero,alenieri,aleniero,balenieri,baleniero
48,49,osanti,osare,A,nti,re,anti,are,santi,sare,osanti,osare,osanti,osare,osanti,osare,osanti,osare
49,50,rialzata,rialzato,A,ata,ato,zata,zato,lzata,lzato,alzata,alzato,ialzata,ialzato,rialzata,rialzato,rialzata,rialzato
55,56,rivaleggiante,rivaleggiare,A,nte,re,ante,are,iante,iare,giante,giare,ggiante,ggiare,eggiante,eggiare,leggiante,leggiare
61,62,cospiranti,cospirare,A,nti,re,anti,are,ranti,rare,iranti,irare,piranti,pirare,spiranti,spirare,ospiranti,ospirare


# predict lemmas separately for each class

In [263]:
from collections import Counter

def get_most_frequent_lemma(series):
    """Return the value that occurs most often in `series`.

    `series` can be any iterable of hashable values. Counting is delegated to
    collections.Counter. An empty input raises IndexError.
    """
    ranked = Counter(series).most_common(1)
    winner, _count = ranked[0]
    return winner

def get_lemmarization_dict(df):
    """Build a table that maps word endings to lemma endings.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns X_suf_{i} and lemma_suf_{i} for i = 3..9
        (word ending / lemma ending of length i for every training row).

    Returns
    -------
    (dict, set)
        x_to_lemms_suffs: str(word_ending) -> str(lemma_ending).
        bad_x_suf_set: the ambiguous endings of length 3..8, i.e. endings seen
        with more than one distinct lemma ending.

    How the table is filled
    -----------------------
    * Every length-3 ending gets an entry.
    * A longer ending gets an entry only if the ending one character shorter
      is ambiguous, so longer endings act as refinements of ambiguous ones.
    * In the X_suf_{i} column (i >= 4), a value shorter than i is always
      stored with the lemma ending of its first row. Such a value is
      presumably a whole word shorter than i characters.
    * When an ending has several lemma endings, the most frequent one is
      stored. Ties go to the one seen first, per Counter.most_common.

    Each column is grouped once with an order-preserving groupby instead of
    re-filtering the whole frame for every distinct ending. Groups are visited
    in order of first appearance and keep the original row order, so the
    resulting table is the same as with per-ending filtering.
    """
    x_to_lemms_suffs = dict()
    bad_x_suf_set = set()

    start = 3
    end = 10
    # ending lengths 3..9
    for i in range(start, end):
        x_suf_i = 'X_suf_' + str(i)
        lemma_suf_i = 'lemma_suf_' + str(i)
        for x_suf, lemma_sufs in df.groupby(x_suf_i, sort=False)[lemma_suf_i]:
            if i > start:
                # value shorter than the current ending length: add as an exception
                if len(x_suf) < i:
                    x_to_lemms_suffs[x_suf] = lemma_sufs.iloc[0]
                    continue
                # refine only endings whose shorter version is ambiguous
                if x_suf[1:] not in bad_x_suf_set:
                    continue

            if lemma_sufs.unique().shape[0] == 1:
                # unambiguous ending
                x_to_lemms_suffs[x_suf] = lemma_sufs.iloc[0]
            else:
                # the longest endings cannot be refined further, so they are
                # not recorded as ambiguous
                if i < end - 1:
                    bad_x_suf_set.add(x_suf)
                x_to_lemms_suffs[x_suf] = Counter(lemma_sufs).most_common(1)[0][0]

    return x_to_lemms_suffs, bad_x_suf_set

In [264]:
# Ending table for nouns; the set of ambiguous endings is discarded.
%time dict_N, _ = get_lemmarization_dict(train_N)

Wall time: 18 s


In [265]:
# Ending table for adjectives; the set of ambiguous endings is discarded.
%time dict_A, _ = get_lemmarization_dict(train_A)

Wall time: 21 s


In [266]:
# Ending table for verbs; the set of ambiguous endings is discarded.
%time dict_V, _ = get_lemmarization_dict(train_V)

Wall time: 12min 27s


In [274]:
def get_lemma(string, dt):
    """Rewrite the ending of `string` using the longest ending found in `dt`.

    Parameters
    ----------
    string : str
        The word form to lemmatize.
    dt : dict
        Mapping word ending -> lemma ending.

    Returns
    -------
    str
        `string` with its longest known ending (9 down to 3 characters)
        replaced by the mapped lemma ending. If no ending is known, the word
        is returned unchanged, so callers never receive None.

    For a word shorter than the probed length, `string[-i:]` is the whole word
    and `string[:-i]` is empty, so whole-word entries in `dt` replace the word
    entirely.
    """
    for i in range(9, 2, -1):
        ending = string[-i:]
        if ending in dt:
            return string[:-i] + dt[ending]
    return string

# Apply each class's ending table to the words of that class and store the
# result in a new 'lemma_pred' column of the corresponding subset.
for frame, suffix_map in ((train_V, dict_V), (train_N, dict_N), (train_A, dict_A)):
    frame['lemma_pred'] = frame['X'].apply(lambda x: get_lemma(x, suffix_map))

In [275]:
# Share of rows whose predicted lemma differs from the reference lemma,
# printed for verbs, nouns and adjectives in that order.
for frame in (train_V, train_N, train_A):
    mismatches = frame[frame['lemma_pred'] != frame['lemmas']]
    print(mismatches.shape[0] / float(frame.shape[0]))

0.009614887032839652
0.00361328125
0.011619566776646105


# final solution

In [276]:
from sklearn.feature_extraction.text import CountVectorizer
# Same vectorizer configuration as in the class-prediction section: binary
# character n-grams of 3 to 5 characters inside word boundaries, keeping
# n-grams found in at least 5 words and at most 70% of the words.
vectorizer = CountVectorizer(min_df=5, max_df=.7,
                             max_features=None,
                             ngram_range=(3, 5),
                             lowercase=False,
                             analyzer='char_wb', 
                             binary=True)
vectorizer.fit(train['X'])
# Recompute both matrices here so this section uses features of the full
# training set (X) and of the real test words (X_test).
X = vectorizer.transform(train['X'])
X_test = vectorizer.transform(test['X'])
X.shape

(120803, 37412)

In [277]:
# Final class predictor: fitted on all training rows with balanced class weights.
clf = LogisticRegression(multi_class='ovr', class_weight='balanced')
clf.fit(X, train['y'])
# Training-set accuracy (same rows as used for fitting), shown as a sanity check.
accuracy_score(train['y'], clf.predict(X))

0.97552213107290386

In [278]:
# Predicted class for every test word; the value selects which ending table
# is used for lemmatization below.
test['y'] = clf.predict(X_test)

In [283]:
def get_lemma(string, dt):
    """Replace the longest known ending of `string` with its lemma ending.

    Endings of 9 down to 3 characters are looked up in `dt` (word ending ->
    lemma ending), and the first hit wins. A word with no known ending is
    returned as is. An earlier run of this notebook recorded about 0.005 of
    the test words as unmatched.
    """
    suffix_len = 9
    while suffix_len > 2:
        tail = string[-suffix_len:]
        if tail in dt:
            return string[:-suffix_len] + dt[tail]
        suffix_len -= 1
    # no recorded ending matched: keep the word unchanged
    return string

def get_lemmatization(df, dt_V, dt_N, dt_A):
    """Lemmatize one row with the ending table of its predicted class.

    Parameters
    ----------
    df : pandas.Series or sequence
        A row read by position: element 1 is the word form, element 2 the
        class label ('V', 'N' or 'A').
    dt_V, dt_N, dt_A : dict
        Ending tables (word ending -> lemma ending) for each class.

    Returns
    -------
    str
        The lemma produced by get_lemma for the word.

    Raises
    ------
    ValueError
        If the class label is not 'V', 'N' or 'A'.
    """
    # A plain integer key on a label-indexed Series is a deprecated positional
    # fallback, so pandas rows are read through .iloc. Plain sequences
    # (list/tuple) are still accepted.
    fields = df.iloc if hasattr(df, 'iloc') else df
    word = fields[1]
    tag = str(fields[2])

    tables = {'V': dt_V, 'N': dt_N, 'A': dt_A}
    if tag not in tables:
        # ValueError rather than a bare BaseException, so `except Exception` sees it
        raise ValueError('something went wrong' + tag)
    return get_lemma(word, tables[tag])

In [284]:
# Row-wise lemmatization of the test set: get_lemmatization reads the word and
# the predicted class by position from the ['Id', 'X', 'y'] row.
%time test['lemma'] = test[['Id', 'X', 'y']].apply(lambda x: get_lemmatization(x, dict_V, dict_N, dict_A), axis=1)

Wall time: 1.08 s


In [285]:
# Share of test rows whose lemma is missing (null).
test[test['lemma'].isnull()].shape[0] / float(test.shape[0])

0.0

In [281]:
# Share of test rows whose lemma is missing (null).
# NOTE(review): this repeats the check in the previous cell. Its execution
# counter is lower than that of the cell defining the final get_lemma, so the
# recorded output presumably comes from a run where unmatched words produced
# no lemma. Confirm, then drop one of the two cells.
test[test['lemma'].isnull()].shape[0] / float(test.shape[0])

0.00509086005192003

In [282]:
# List the test rows that have no lemma.
# NOTE(review): with a get_lemma that falls back to the word itself, this is
# expected to be empty on a fresh run. Verify before relying on the output.
test[test['lemma'].isnull()]

Unnamed: 0,Id,X,y,lemma
90,91,viados,N,
539,540,lp,N,
1976,1977,è,N,
2117,2118,reduce,A,
2203,2204,cifri,N,
2218,2219,apostrofo,V,
2720,2721,frattempo,A,
2740,2741,incongrue,A,
3281,3282,alt,N,
3423,3424,break,N,


In [289]:
# Fill the sample-submission frame with `lemma+class` strings and write it out.
answer = pd.read_csv('task2_lemmas_sample_submission')
# The assignment aligns on the row index. This assumes `answer` and `test`
# list the same words in the same order. TODO confirm.
# '+'.join raises TypeError if a lemma is missing (None/NaN).
answer['Category'] = test[['lemma', 'y']].apply(lambda x: '+'.join(x), axis=1)
answer.to_csv('task2_logregr_l2_balanced_ngram_3_5_lemma_suf_3_9', index=False)
answer.head()

Unnamed: 0,Id,Category
0,1,gettonare+V
1,2,incidentale+A
2,3,involtare+V
3,4,lievo+N
4,5,comunistizzare+V


In [287]:
# Spot-check the predicted class and lemma next to the original words.
test.head()

Unnamed: 0,Id,X,y,lemma
0,1,gettonan,V,gettonare
1,2,incidentali,A,incidentale
2,3,involtino,V,involtare
3,4,lievi,N,lievo
4,5,comunistizzasse,V,comunistizzare
