In [2]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn import model_selection, preprocessing
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.metrics import auc
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
import collections
import seaborn as sns
import matplotlib.pyplot as plt
import math
import scipy.stats as sps
%matplotlib inline

In [8]:
data = pd.read_csv('task2_lemmas_train').fillna('')
test = pd.read_csv('task2_lemmas_test', index_col=0).fillna('')

In [9]:
newdata = data
for col in ['y1','y2','y3','y4']:
    newdata = newdata.select(lambda x: len(newdata[col][x]) > 0)
    print(newdata.head())
    newdata['y'] = newdata[col]
    data = data.append(newdata)
olddata = data.copy()

      Id            X                y              y1 y2 y3 y4
100  101      provata        provare+V       provato+A         
227  228  complimento  complimentare+V   complimento+N         
295  296      rampino        rampino+N       rampare+V         
371  372   sussidiari    sussidiario+A   sussidiario+N         
397  398   squalifica     squalifica+N  squalificare+V         
        Id           X             y            y1            y2 y3 y4
1209  1210     passati     passato+A     passato+A     passare+V      
2397  2398       calmo       calmo+A       calmo+A     calmare+V      
3921  3922  crocifissi  crocifisso+N  crocifisso+N  crocifisso+A      
4972  4973     massimo     massimo+N     massimo+N      grande+A      
6673  6674     vestite     vestire+V     vestire+V     vestito+A      
          Id        X           y        y1          y2          y3  \
20393  20394  siedevo  risedere+A  sedere+A  risedere+A  risedere+V   

                   y4  
20393  soprassedere+V  

In [5]:
data = olddata

In [10]:
data = data[['X','y']]
data['PoS'] = data['y'].map(lambda x: x[-1:])
data['y'] = data['y'].map(lambda x: x[:-2])
data.head()

Unnamed: 0,X,y,PoS
0,vergognerete,vergognare,V
1,amnistiavate,amnistiare,V
2,menomazione,menomazione,N
3,sfaldavamo,sfaldare,V
4,sfodererei,sfoderare,V


In [11]:
exp_data = data.copy()
exp_data['X_beg'] = exp_data['X_end'] = exp_data['X']

for x in exp_data.iterrows():
    i = 0
    while(x[1]['X'][i] == x[1]['y'][i]):
        i+=1
        if(i == len(x[1]['y']) or i == len(x[1]['X'])):
            x[1]['X_end'] = x[1]['y'] = ''
            x[1]['X_beg'] = x[1]['X']
            break
    
    if(i < len(x[1]['y'])):
        x[1]['X_beg'] = x[1]['X'][:i-1]
        x[1]['X_end'] = x[1]['X'][i-1:]
        x[1]['y'] = x[1]['y'][i-1:]

In [12]:
exp_data.head(n=200)   

Unnamed: 0,X,y,PoS,X_beg,X_end
0,vergognerete,nare,V,vergog,nerete
1,amnistiavate,are,V,amnisti,avate
2,menomazione,,N,menomazione,
3,sfaldavamo,are,V,sfald,avamo
4,sfodererei,rare,V,sfode,rerei
5,ascondesti,ere,V,ascond,esti
6,edifichereste,care,V,edifi,chereste
7,maschieran,iare,V,masch,ieran
8,transennasser,are,V,transenn,asser
9,computando,are,V,comput,ando


In [13]:
def sortByLength(inputStr):
        return len(inputStr)
endings = np.unique(exp_data[['X_end', 'y']].as_matrix().reshape(-1)).tolist()
endings.sort(key=sortByLength)
print(endings)

['', 'a', 'e', 'i', 'n', 'o', 'r', 'u', 'à', 'ò', 'ae', 'ai', 'an', 'ax', 'ba', 'be', 'bi', 'bo', 'by', 'bè', 'bì', 'bò', 'ca', 'ce', 'ci', 'co', 'cè', 'cì', 'cò', 'da', 'de', 'di', 'do', 'dè', 'dì', 'dò', 'ea', 'ee', 'ei', 'eo', 'eò', 'fa', 'fe', 'fi', 'fo', 'fì', 'fò', 'ga', 'ge', 'gi', 'go', 'gè', 'gì', 'gò', 'ha', 'he', 'hi', 'ho', 'hì', 'ia', 'ie', 'ii', 'in', 'io', 'iè', 'iò', 'ka', 'ke', 'ki', 'ko', 'kò', 'la', 'le', 'li', 'lo', 'lè', 'lì', 'lò', 'ma', 'me', 'mi', 'mo', 'mè', 'mì', 'mò', 'na', 'ne', 'ni', 'no', 'nè', 'nì', 'nò', 'oa', 'oe', 'oi', 'oì', 'pa', 'pe', 'pi', 'po', 'pè', 'pì', 'pò', 'ra', 're', 'ri', 'ro', 'rà', 'rì', 'rò', 'sa', 'se', 'si', 'so', 'sè', 'sì', 'sò', 'ta', 'te', 'ti', 'to', 'ty', 'tè', 'tì', 'tò', 'ua', 'ue', 'ui', 'uo', 'uè', 'uì', 'uò', 'va', 've', 'vi', 'vo', 'vè', 'vì', 'vò', 'za', 'ze', 'zi', 'zo', 'zì', 'zò', 'ìa', 'ìe', 'ìi', 'ìo', 'aia', 'aio', 'amo', 'ano', 'are', 'asa', 'ase', 'asi', 'aso', 'asè', 'ata', 'ate', 'ati', 'ato', 'ava', 'avi', 'avo

In [14]:
var = exp_data.copy()

In [58]:
exp_data['X_end_new'] = exp_data['X'].map(lambda x: [x])
for ending in endings:
    exp_data['X_end_new'] = exp_data['X_end_new'].map(lambda x: x + [ending] if len(x[0]) >= len(ending) and x[0][-len(ending):] == ending else x)
exp_data.head()

Unnamed: 0,X,y,PoS,X_beg,X_end,X_end_new
0,vergognerete,nare+V,V,vergog,nerete,"[vergognerete, e, te, ete, nerete]"
1,amnistiavate,are+V,V,amnisti,avate,"[amnistiavate, e, te, ate, avate]"
2,menomazione,one+N,N,menomazi,one,"[menomazione, e, ne, one]"
3,sfaldavamo,are+V,V,sfald,avamo,"[sfaldavamo, o, mo, amo, avamo]"
4,sfodererei,rare+V,V,sfode,rerei,"[sfodererei, i, ei, rerei]"


In [61]:
exp_data.loc[train['X_end'] == 's']

Unnamed: 0,X,y,PoS,X_beg,X_end,X_end_new
2957,toilettes,+N,N,toilette,s,s
5383,brokers,+N,N,broker,s,s
5526,brioches,+N,N,brioche,s,s
12115,anti-gates,s+A,A,anti-gate,s,s
17308,tennis,s+N,N,tenni,s,s
17315,partners,+N,N,partner,s,s
18500,plazas,+N,N,plaza,s,s
20670,vulnus,s+N,N,vulnu,s,s
22550,rais,s+N,N,rai,s,s
24066,rendez-vous,s+N,N,rendez-vou,s,s


In [60]:
exp_data['X_end_new'] = exp_data['X_end_new'].map(lambda x: x if len(x)>1 else x + [x[0][-1]])
exp_data['X_beg'] = exp_data['X_end_new'].map(lambda x: x[0][:-len(x[-1])])
exp_data['y'] = olddata['y']
exp_data['X_end_new'] = exp_data['X_end_new'].map(lambda x: x[-1] if len(x)>1 else x[0][-1])
for x in exp_data.iterrows():
    x[1]['y'] = x[1]['y'][len(x[1]['X_beg']):]
exp_data.head()

Unnamed: 0,X,y,PoS,X_beg,X_end,X_end_new
0,vergognerete,nare+V,V,vergog,nerete,nerete
1,amnistiavate,are+V,V,amnisti,avate,avate
2,menomazione,one+N,N,menomazi,one,one
3,sfaldavamo,are+V,V,sfald,avamo,avamo
4,sfodererei,rare+V,V,sfode,rerei,rerei


In [62]:
exp_data['X_end'] = exp_data['X_end_new']
train = exp_data[['X_beg', 'X_end', 'y']]
train.head()

Unnamed: 0,X_beg,X_end,y
0,vergog,nerete,nare+V
1,amnisti,avate,are+V
2,menomazi,one,one+N
3,sfald,avamo,are+V
4,sfode,rerei,rare+V


In [83]:
endings = np.unique(exp_data[['X_end', 'y']].as_matrix().reshape(-1)).tolist()
exp_test = test.copy()
exp_test.head()

Unnamed: 0,X,X_beg,X_end,target
1,gettonan,getto,,getto
2,incidentali,incidenta,li,incidenta
3,involtino,invol,tino,invol
4,lievi,li,evi,li
5,comunistizzasse,comunistizz,asse,comunistizz


In [84]:
exp_test['X_end'] = exp_test['X'].map(lambda x: [x])
for ending in endings:
    exp_test['X_end'] = exp_test['X_end'].map(lambda x: x + [ending] if len(x[0]) >= len(ending) and x[0][-len(ending):] == ending else x)
exp_test.head()

Unnamed: 0,X,X_beg,X_end,target
1,gettonan,getto,"[gettonan, an, n, nan]",getto
2,incidentali,incidenta,"[incidentali, i, li]",incidenta
3,involtino,invol,"[involtino, ino, no, o, tino]",invol
4,lievi,li,"[lievi, evi, i, vi]",li
5,comunistizzasse,comunistizz,"[comunistizzasse, asse, e, se]",comunistizz


In [85]:
exp_test['X_beg'] = exp_test['X_end'].map(lambda x: x[0][:-len(x[-1])])
exp_test['X_end'] = exp_test['X_end'].map(lambda x: x[-1] if len(x)>1 else x[0][-1])
exp_test = exp_test[['X', 'X_beg', 'X_end']]
exp_test.head()

Unnamed: 0,X,X_beg,X_end
1,gettonan,getto,
2,incidentali,incidenta,li
3,involtino,invol,tino
4,lievi,lie,vi
5,comunistizzasse,comunistizzas,se


In [86]:
training_set = train[['X_end', 'y']]
training_set.loc[train['X_end'] == 'an']

Unnamed: 0,X_end,y
931,an,are+V
1103,an,are+V
1473,an,are+V
2052,an,are+V
2566,an,are+V
2787,an,are+V
3727,an,are+V
4015,an,are+V
4690,an,are+V
5747,an,are+V


In [87]:
active_endings = set(exp_test['X_end'])
unknown = active_endings.difference(set(endings))
active_endings.intersection_update(set(endings))
active_endings.difference(set(endings))

set()

In [88]:
test = exp_test.copy()
test['target'] = test['X_beg']

for x in test.iterrows():
    if(x[1]['X_end'] in unknown):
        x[1]['target'] = x[1]['X'] + '+N'

test.loc[test['X_end'] == 'j']

Unnamed: 0,X,X_beg,X_end,target
26266,dj,,j,dj+N


In [69]:
cntr = collections.Counter(training_set.loc[training_set['X_end'] == 'ti']['y'].as_matrix())

In [70]:
train['len'] = train['X_beg'].map(lambda x: len(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [71]:
sth = train.copy().loc[train['X_end'] == 'ti']
sth

Unnamed: 0,X_beg,X_end,y,len
1749,comple,ti,to+A,6
3397,vie,ti,tare+V,3
5233,ripe,ti,tere+V,4
11600,liu,ti,to+N,3
12987,inquie,ti,tare+V,6
14229,incompiu,ti,to+N,8
14290,inesaus,ti,to+A,7
15600,poe,ti,ta+N,3
16134,argu,ti,to+A,4
18821,ins,ti,tare+V,3


In [72]:
train.loc[training_set['X_end'] == 'ti']

Unnamed: 0,X_beg,X_end,y,len
1749,comple,ti,to+A,6
3397,vie,ti,tare+V,3
5233,ripe,ti,tere+V,4
11600,liu,ti,to+N,3
12987,inquie,ti,tare+V,6
14229,incompiu,ti,to+N,8
14290,inesaus,ti,to+A,7
15600,poe,ti,ta+N,3
16134,argu,ti,to+A,4
18821,ins,ti,tare+V,3


In [104]:
dd = []
for ending in set(test['X_end']).intersection(set(train['X_end'])):
    trg = test.loc[test['X_end'] == ending]
    trn = train.loc[train['X_end'] == ending]
    val = trn['y'].unique()
    if(len(val) == 1):
        for i in trg.index:
            test['target'][i] = val[0]
    else:
        if(len(trn['X_beg'].unique()) == 1 and trn['X_beg'].unique()[0] == ''):
            dd.append(ending)
            continue
        count_vectorizer = CountVectorizer(min_df=1, max_features=None,ngram_range=(2, 4), analyzer='char_wb', lowercase=False, binary=True) 
        X = count_vectorizer.fit_transform(trn['X_beg'])
        y = trn['y']
        algo = LogisticRegression(penalty='l1', C=0.145, class_weight='balanced')
        algo = algo.fit(X, y)
        X_train = count_vectorizer.transform(trg['X_beg'])
        target = algo.predict(X_train)
        for i, j in enumerate(trg.index):
            test['target'][j] = target[i]

In [110]:
exp_data.loc[train['X_end'] == 'siederebbero']

Unnamed: 0,X,y,PoS,X_beg,X_end,X_end_new
75115,siederebbero,risedere+V,V,,siederebbero,siederebbero
75115,siederebbero,soprassedere+V,V,,siederebbero,siederebbero


In [112]:
test.loc[test['X_end'] == 'siediamo']

Unnamed: 0,X,X_beg,X_end,target,y
21690,co-presiediamo,co-pre,siediamo,co-pre,co-preco-pre


In [105]:
dd

['siedano', 'siederebbero', 'siederai', 'siediamo']

In [107]:
test['y'] = test['X_beg'] + test['target']
test

Unnamed: 0,X,X_beg,X_end,target,y
1,gettonan,getto,,nare+V,gettonare+V
2,incidentali,incidenta,li,le+A,incidentale+A
3,involtino,invol,tino,tare+V,involtare+V
4,lievi,lie,vi,vare+V,lievare+V
5,comunistizzasse,comunistizzas,se,ere+V,comunistizzasere+V
6,vidimerebbe,vidi,merebbe,mare+V,vidimare+V
7,imbrodan,imbroda,n,n+A,imbrodan+A
8,strillar,strilla,r,re+V,strillare+V
9,cifrasti,cifras,ti,to+N,cifrasto+N
10,compassavano,compassa,vano,vano+A,compassavano+A


In [113]:
test['y'][25771] = 'co-presiedere+V'
test['y'][26116] = 'co-presiedere+V'
test['y'][5153] = 'co-presiedere+V'
test['y'][26116] = 'co-presiedere+V'

In [126]:
answer = pd.read_csv('task2_lemmas_sample_submission', index_col=0)
answer['Category'] = test['y']
answer

Unnamed: 0,Category
1,gettonare+V
2,incidentale+A
3,involtare+V
4,lievare+V
5,comunistizzasere+V
6,vidimare+V
7,imbrodan+A
8,strillare+V
9,cifrasto+N
10,compassavano+A


In [128]:
answer.to_csv('task_2_submission', index_label='Id')