In [13]:
import pandas as pd
import numpy as np
import re
pd.set_option('display.max_colwidth', None)

df_rtr = pd.read_csv('produced_tables/rtr.csv', sep='\t', lineterminator='\n', encoding = 'utf8', dtype='unicode')
df_rtl = pd.read_csv('produced_tables/rtl.csv', sep='\t', lineterminator='\n', encoding = 'utf8', dtype='unicode')

count = []
exact_phon = []
exact_aff = []
exact_stem = []
exact_rid = []
partial_phon = []
partial_aff = []
partial_stem = []
partial_rid = []
dif_phon = []
dif_aff = []
dif_stem = []
dif_rid = []
unsucc_phon = []
unsucc_aff = []
unsucc_rid = []

def exclude_exact_from_end(string1, string2, rid):
    
    if len(string2) > len(string1):
        return False
    
    result = string1[:-len(string2)] + string1[len(string1) - len(string2):].replace(string2, '')
    if len(string1) == len(result):
        return False
    if string2 != '':
        exact_phon.append(string1)
        exact_aff.append(string2)
        exact_stem.append(result)
        exact_rid.append(rid)
        count.append('o')
    return result


def exclude_exact_from_start(string1, string2, rid):
    
    if len(string2) > len(string1):
        return False
    
    result = string1[:-(len(string1) - len(string2))].replace(string2, '') + string1[len(string2):]
    if len(string1) == len(result):
        return False
    if string2 != '':
        exact_phon.append(string1)
        exact_aff.append(string2)
        exact_stem.append(result)
        exact_rid.append(rid)
        count.append('o')
    return result


def exclude_part_from_end(string1, string2, rid):
    
    if len(string2) > len(string1):
        return False
    
    initial_string2 = string2
    i = 1
    while i <= len(string2):
        if string2[-i] == string1[-1]:
            string2 = string2[: -i]
            result = string1[:-len(string2)]
            if initial_string2 != '':
                partial_phon.append(string1)
                partial_aff.append(initial_string2)
                partial_stem.append(result)
                partial_rid.append(rid)
            return result
        i += 1
    return False


def exclude_part_from_start(string1, string2, rid):
    
    if len(string2) > len(string1):
        return False
    
    initial_string2 = string2
    i = 0 
    while i < len(string2):
        if string2[i] == string1[0]:
            string2 = string2[i:]
            result = string1[len(string2):]
            if initial_string2 != '':
                partial_phon.append(string1)
                partial_aff.append(initial_string2)
                partial_stem.append(result)
                partial_rid.append(rid)
            count.append('o')
            return result
        i += 1
    return False


def exclude_dif_phoneme_from_end(string1, string2, rid):
    
    if len(string2) > len(string1):
        return False
    
    if(string1[len(string1) - len(string2)] == string2[0]):
        result = string1[:len(string1) - len(string2)]
        if(string2 != ''):
            dif_phon.append(string1)
            dif_aff.append(string2)
            dif_stem.append(result)
            dif_rid.append(rid)
            count.append('o')
        return result
    
    return False


def exclude_dif_phoneme_from_start(string1, string2, rid):
    
    if len(string2) > len(string1):
        return False
    
    if(string1[0] == string2[0]):
        result = string1[len(string2):]
        if(string2 != ''):
            dif_phon.append(string1)
            dif_aff.append(string2)
            dif_stem.append(result)
            dif_rid.append(rid)
            count.append('o')
        return result
    
    return False


def complete_suffix_exclude(ph, aff, rid):   
    
    state = exclude_exact_from_end(ph, aff, rid)
    if state:
        return state
    else:
        state = exclude_dif_phoneme_from_end(ph, aff, rid)
        if state:
            return state
        else:
            state = exclude_part_from_end(ph, aff, rid)
            if state:
                return state
            else:
                if(aff != ''):
                    count.append('o')
                    unsucc_phon.append(ph)
                    unsucc_aff.append(aff)
                    unsucc_rid.append(rid)
                return ph


def complete_prefix_exclude(ph, aff, rid):

    state = exclude_exact_from_start(ph, aff,rid)
    if state:
        return state
    else:
        state = exclude_dif_phoneme_from_start(ph, aff,rid)
        if state:
            return state
        else:
            state = exclude_part_from_start(ph, aff,rid)
            if state:
                return state
            else:
                if(aff != ''):
                    count.append('o')
                    unsucc_phon.append(ph)
                    unsucc_aff.append(aff)
                    unsucc_rid.append(rid)
                return ph  


def exclude_affix(str, aff, rid):
    aff = re.findall(r'\w+|[^\w\s]', aff)
    lst = [element.split(':')[-1] for element in str.split(';')]

    for i, entry in enumerate(lst):
        entry = entry.split('|')
        if len(entry) > 1:
            for j, ph in enumerate(entry):
                if aff[0] == '.':
                    entry[j] = complete_suffix_exclude(ph, aff[1], rid)    
                elif aff[-1] == '.':
                    entry[j] = complete_prefix_exclude(ph, aff[0], rid)                   
                elif len(aff) > 2:
                    entry[j] = complete_prefix_exclude(ph, aff[0], rid) 
                    entry[j] = complete_suffix_exclude(ph, aff[-1], rid) 
            lst[i] = '|'.join(entry)
        else:
            if aff[0] == '.':
                entry[0] = complete_suffix_exclude(entry[0], aff[1], rid)
            elif aff[-1] == '.':
                entry[0] = complete_prefix_exclude(entry[0], aff[0], rid)
            elif len(aff) > 2:
                entry[0] = complete_prefix_exclude(entry[0], aff[0], rid)
                entry[0] = complete_suffix_exclude(entry[0], aff[-1], rid)
            lst[i] = entry[0]

    return ';'.join(lst)
    


def make_phon_stem1(row):
    if row['cat_1'] == 'V':
        if row['phon_aff_1'] == np.nan:
            return row['stem_space_1']
        else:
            return exclude_affix(str(row['stem_space_1']), str(row['phon_aff_1']), str(row['rid']))
    else:
        if row['phon_aff_1'] == np.nan:
            return exclude_affix(str(row['para_desyll_1']), '', str(row['rid']))
        else:
            return exclude_affix(str(row['para_desyll_1']), str(row['phon_aff_1']), str(row['rid']))
        
def make_phon_stem2(row):
    if row['cat_2'] == 'V':
        if row['phon_aff_2'] == np.nan:
            return row['stem_space_2']
        else:
            return exclude_affix(str(row['stem_space_2']), str(row['phon_aff_2']), str(row['rid']))
    else:
        if row['phon_aff_2'] == np.nan:
            return exclude_affix(str(row['para_desyll_2']), '', str(row['rid']))
        else:
            return exclude_affix(str(row['para_desyll_2']), str(row['phon_aff_2']), str(row['rid']))


print(len(df_rtr['rid'].unique()))
df_rtl = df_rtl[['lid','stem_space','para_desyll', 'para_phon']]
df_rtr = df_rtr.merge(df_rtl, left_on='lid_1', right_on='lid', how='inner')
df_rtr = df_rtr.merge(df_rtl, left_on='lid_2', right_on='lid', how='inner')

df_rtr = df_rtr.drop(columns=['lid_x', 'lid_y'])
                     
df_rtr.columns = df_rtr.columns.str.replace('stem_space_x', 'stem_space_1').str.replace('stem_space_y', 'stem_space_2').str.replace('para_desyll_x', 'para_desyll_1').str.replace('para_desyll_y', 'para_desyll_2').str.replace('para_phon_x', 'para_phon_1').str.replace('para_phon_y', 'para_phon_2')


#df_rtr = df_rtr[['lid_1', 'graph_1', 'lid_2', 'graph_2', 'cat_1', 'cat_2', 'cstr_1', 'cstr_2', 'phon_aff_1', 'phon_aff_2' ,'stem_space_1', 'para_desyll_1', 'para_phon_1', 'stem_space_2', 'para_desyll_2', 'para_phon_2']]

#display(df_rtr.head(33))

df_rtr['phon_stem_1'] = df_rtr.apply(make_phon_stem1, axis=1)
df_rtr['phon_stem_2'] = df_rtr.apply(make_phon_stem2, axis=1)


df_rtr.to_csv('produced_tables/test.csv', sep='\t', lineterminator='\n', encoding='utf8', index=False)

df_exact = pd.DataFrame({'Phoneme': exact_phon, 'Affix': exact_aff, 'Stem': exact_stem, 'rid': exact_rid})
df_partial = pd.DataFrame({'Phoneme': partial_phon, 'Affix': partial_aff, 'Stem': partial_stem, 'rid': partial_rid})
df_dif = pd.DataFrame({'Phoneme': dif_phon, 'Affix': dif_aff, 'Stem': dif_stem, 'rid': dif_rid})
df_unsuccessful = pd.DataFrame({'Phoneme': unsucc_phon, 'Affix': unsucc_aff, 'rid': unsucc_rid})

#display(df_rtr[['graph_1', 'graph_2', 'cat_1', 'cat_2', 'cstr_1', 'cstr_2', 'phon_aff_1', 'phon_aff_2', 'phon_stem_1', 'phon_stem_2']])

display(df_exact)
display(df_partial)
display(df_dif)
display(df_unsuccessful)
df_unsuccessful.to_csv('produced_tables/stems/not_cases.csv', sep='\t', lineterminator='\n', encoding='utf8', index=False)
df_exact.to_csv('produced_tables/stems/exact_cases.csv', sep='\t', lineterminator='\n', encoding='utf8', index=False)
df_partial.to_csv('produced_tables/stems/partial_cases.csv', sep='\t', lineterminator='\n', encoding='utf8', index=False)
df_dif.to_csv('produced_tables/stems/dif_cases.csv', sep='\t', lineterminator='\n', encoding='utf8', index=False)
print(len(count))

df_rtr = df_rtr.drop(columns=['stem_space_1', 'stem_space_2', 'para_desyll_1', 'para_desyll_2', 'para_phon_1', 'para_phon_2'])
df_rtr.to_csv('produced_tables/rtr.csv', sep='\t', lineterminator='\n', encoding='utf8', index=False)
#display(df_rtr)


print(len(df_exact['rid'].unique()))
print(len(df_partial['rid'].unique()))
print(len(df_dif['rid'].unique()))
print(len(df_unsuccessful['rid'].unique()))

lst_of_un = list(df_unsuccessful['rid'].unique()) + list(df_dif['rid'].unique()) + list(df_partial['rid'].unique()) + list(df_exact['rid'].unique())
len(lst_of_un)

lst_of_un = list(set(lst_of_un))

print(len(lst_of_un))

df_exact_list = list(df_exact['rid'].unique())
df_partial_list = list(df_partial['rid'].unique())
df_dif_list = list(df_dif['rid'].unique())
df_unsuccessful_list = list(df_unsuccessful['rid'].unique())

result = list(set(df_exact_list ) - set(df_partial_list))
result = list(set(result) - set(df_dif_list))
result = list(set(result) - set(df_unsuccessful_list))

df_only_successful = pd.DataFrame({'rid': result})

display(df_only_successful)
df_only_successful.to_csv('produced_tables/stems/only_exact_extracted.csv', sep='\t', lineterminator='\n', encoding='utf8', index=False)
len(result)

90844


Unnamed: 0,Phoneme,Affix,Stem,rid
0,abɛsmã,mã,abɛs,r6088
1,abɛsmã,mã,abɛs,r6088
2,abɛsmã,mã,abɛs,r6090
3,abɛsmã,mã,abɛs,r6090
4,abɛsmã,mã,abɛs,r6092
...,...,...,...,...
166239,dekana,a,dekan,r222073
166240,zezɛt,ɛt,zez,r222095
166241,zezɛt,ɛt,zez,r222095
166242,tupɛ,ɛ,tup,r222105


Unnamed: 0,Phoneme,Affix,Stem,rid
0,bɛsje,ije,bɛs,r181928
1,bɛsje,ije,bɛs,r181928
2,bɛsje,ije,bɛs,r90262
3,bɛsje,ije,bɛs,r90262
4,bɛsje,ije,bɛs,r181926
...,...,...,...,...
21423,ɛ̃tɛʁnaliz,alizɛ,ɛ̃tɛʁna,r222029
21424,ɛ̃tɛʁnaliz,alizɛ,ɛ̃tɛʁna,r222029
21425,ɛ̃tɛʁnaliz,alizɛ,ɛ̃tɛʁna,r222029
21426,ɛ̃tɛʁnaliz,alizɛ,ɛ̃tɛʁna,r222029


Unnamed: 0,Phoneme,Affix,Stem,rid
0,ʁabes,ʁə,bes,r116242
1,ʁabɛs,ʁə,bɛs,r116242
2,ʁabɛs,ʁə,bɛs,r116242
3,ʁabes,ʁə,bes,r116242
4,ʁabɛs,ʁə,bɛs,r116242
...,...,...,...,...
18743,tɔpikalize,izɛ,tɔpikal,r220587
18744,ɛ̃tɛʁnalize,alizɛ,ɛ̃tɛʁn,r222029
18745,ɛ̃tɛʁnalizə,alizɛ,ɛ̃tɛʁn,r222029
18746,ɛ̃tɛʁnaliza,alizɛ,ɛ̃tɛʁn,r222029


Unnamed: 0,Phoneme,Affix,rid
0,bɛsjɛʁ,ije,r90262
1,bɛsjɛʁ,ije,r90262
2,bɛsjɛʁ,ije,r181926
3,bɛsjɛʁ,ije,r181926
4,bɛsjɛʁ,ije,r184838
...,...,...,...
38339,ɛkstʁavɛʁsjɔ̃,jɔ,r221862
38340,ɛkstʁavɛʁsjɔ̃,jɔ,r221862
38341,otodɛstʁyktʁis,œʁ,r221903
38342,otodɛstʁyktʁis,œʁ,r221903


223378
60742
6080
5312
18274
78292


Unnamed: 0,rid
0,r218243
1,r117195
2,r14610
3,r71553
4,r34844
...,...
53193,r14106
53194,r34464
53195,r8786
53196,r110817


53198