In [1]:
from itertools import product
import pandas as pd
import re
from collections import Counter

# Lift the row limit so displayed DataFrames are never truncated.
pd.set_option('display.max_rows', None)

In [2]:
import os

# Absolute path of the folder holding the verb list files. The relative
# path is resolved against the current working directory.
files = os.path.abspath('../files/')
files

'D:\\central-kurdish-verbs\\files'

In [3]:
def get_verb_text_file_as_df(path):
    """Load a whitespace-separated verb list file into a DataFrame.

    Every data line must hold at least four whitespace-separated fields:
    infinitive, past stem, present stem and the transitivity flag. Any
    further fields on the line are ignored. Blank or whitespace-only lines
    and lines whose first non-blank character is ``#`` are skipped.

    Parameters
    ----------
    path : str
        File name, relative to the module-level ``files`` directory.

    Returns
    -------
    pandas.DataFrame
        One row per data line with the string columns ``Infinitive``,
        ``Past``, ``Present`` and ``Intran|Tran``. The columns are present
        even when the file contains no data lines.

    Raises
    ------
    IndexError
        If a data line has fewer than four fields. This is the same
        exception type the bare list lookup raised before; the message now
        names the file and line.
    """
    columns = ["Infinitive", "Past", "Present", "Intran|Tran"]
    data = []

    # "utf-8-sig" reads plain UTF-8 unchanged but drops a leading BOM, which
    # would otherwise hide a "#" at the very start of the file.
    with open(os.path.join(files, path), "r", encoding="utf-8-sig") as f:
        for line_number, raw_line in enumerate(f, start=1):
            line = raw_line.strip()

            # Skip blanks (including whitespace-only lines) and comments.
            if not line or line.startswith("#"):
                continue

            fields = line.split()
            if len(fields) < len(columns):
                raise IndexError(
                    f"{path}, line {line_number}: expected at least "
                    f"{len(columns)} fields, found {len(fields)}: {line!r}"
                )

            data.append(dict(zip(columns, fields)))

    return pd.DataFrame(data, columns=columns)
    
def df_verb_to_text_file(df, path):
    """Append the verbs in ``df`` to a text file as a new section.

    A section consists of five blank lines, a ``###<row count>###`` header,
    one blank line, and then one line per row in the form
    ``infinitive past present flag`` separated by single spaces.

    The file lives under the module-level ``files`` directory and is opened
    in append mode, so every call adds another section to it.
    """
    with open(f"{files}/{path}", "a", encoding="utf-8") as f:
        # Section separator and header carrying the number of verbs.
        f.write("\n" * 5 + f"###{len(df)}###\n" + "\n")

        f.writelines(
            f'{rec["Infinitive"]} {rec["Past"]} {rec["Present"]} {rec["Intran|Tran"]}\n'
            for rec in df.to_dict("records")
        )


In [4]:
def clean_text(word):
    """Apply a fixed, ordered series of substring replacements to ``word``.

    The table rewrites certain adjacent-letter sequences, for example
    collapsing a doubled ا or inserting ی between two of the listed letters.
    Replacements run in table order, so a later entry also sees the text
    produced by an earlier one.

    Returns the rewritten string; a word containing none of the sequences is
    returned unchanged.
    """
    replacements = (
        ("اا", "ا"),
        ("ۆا", "وا"),
        ("ووا", "وا"),
        ("ەا", "ەیا"),
        ("ێا", "ێیا"),
        ("ەێ", "ەیێ"),
        ("ێێ", "ێیێ"),
    )

    for sequence, rewritten in replacements:
        # str.replace is a no-op when the sequence is absent.
        word = word.replace(sequence, rewritten)

    return word

In [5]:
def _join_affixes(prefix, stem, suffix, separator):
    """Join the non-empty parts among prefix, stem and suffix with ``separator``."""
    return separator.join(part for part in (prefix, stem, suffix) if part != "")


def generated_by_affixes(data_df, prefixes, sufixes):
    """Build new verb rows by attaching every prefix/suffix pair to each verb.

    For each row of ``data_df`` and each ``(prefix, suffix)`` pair, taken in
    ``itertools.product`` order (prefixes outer, suffixes inner), one row is
    produced:

    * ``Infinitive``: the parts concatenated directly;
    * ``Past`` / ``Present``: the parts joined with ``"_"``;
    * ``Intran|Tran``: copied from the source row.

    An empty string in ``prefixes`` or ``sufixes`` means "no affix on that
    side". The pair with both sides empty is skipped because it would only
    reproduce the source verb.

    Parameters
    ----------
    data_df : pandas.DataFrame
        Needs the columns ``Infinitive``, ``Past``, ``Present`` and
        ``Intran|Tran``.
    prefixes, sufixes : iterable of str
        Affixes to combine; include ``""`` to allow a missing affix.

    Returns
    -------
    pandas.DataFrame
        The generated rows, with the same four columns (present even when
        nothing was generated).
    """
    columns = ["Infinitive", "Past", "Present", "Intran|Tran"]

    # Materialise once so a one-shot iterator can be reused for every row.
    affix_pairs = [
        (prefix, suffix)
        for prefix, suffix in product(prefixes, sufixes)
        if not (prefix == "" and suffix == "")
    ]

    generated = []

    for row in data_df.to_dict("records"):
        for prefix, suffix in affix_pairs:
            generated.append({
                # Infinitives are written as one word; to separate their
                # parts as well, pass "_" as the separator here too.
                "Infinitive": _join_affixes(prefix, row["Infinitive"], suffix, ""),
                "Past": _join_affixes(prefix, row["Past"], suffix, "_"),
                "Present": _join_affixes(prefix, row["Present"], suffix, "_"),
                "Intran|Tran": row["Intran|Tran"],
            })

    return pd.DataFrame(generated, columns=columns)

## Loading Simple Verbs

In [6]:
# Load the simple-verb list (read from the `files` folder) as the base table.
data_df = get_verb_text_file_as_df("CKB (simple verbs).txt")

In [7]:
# Preview the first ten simple verbs.
data_df.head(10)

Unnamed: 0,Infinitive,Past,Present,Intran|Tran
0,کردن,کرد,کە,1
1,کڕین,کڕی,کڕ,1
2,فڕین,فڕی,فڕ,0
3,گرتن,گرت,گر,1
4,کەوتن,کەوت,کەو,0
5,دان,دا,دە,1
6,بارین,باری,بار,0
7,بەستن,بەست,بەست,1
8,پێوان,پێوا,پێو,1
9,بردن,برد,بە,1


## Generating Complex Verbs

#### Converting Intransitive to Transitive verbs by adding (اندن)

In [8]:
andin = "اندن"
din = "دن"

new_andin_data = []

for row in data_df.to_dict("records"):
    # Only intransitive verbs (flag 0) get a derived transitive form.
    if int(row["Intran|Tran"]) != 0:
        continue

    # Drop a final ێ from the present stem, unless the stem is a single
    # character.
    stem = row["Present"]
    if len(stem) > 1:
        stem = re.sub("ێ$", "", stem)

    derived_infinitive = clean_text(stem + andin)

    new_andin_data.append({
        "Infinitive": derived_infinitive,
        # The past stem is the infinitive without its last character.
        "Past": derived_infinitive[:-1],
        "Present": clean_text(stem + "ێن"),
        "Intran|Tran": "1",
    })

In [9]:
# Collect the derived transitive verbs into a DataFrame and preview five rows.
new_andin_data_df = pd.DataFrame(new_andin_data)
new_andin_data_df.head(5)

Unnamed: 0,Infinitive,Past,Present,Intran|Tran
0,فڕاندن,فڕاند,فڕێن,1
1,کەواندن,کەواند,کەوێن,1
2,باراندن,باراند,بارێن,1
3,باندن,باند,بێن,1
4,چاندن,چاند,چێن,1


In [10]:
# Write the derived verbs as a new section of the complex-verb file.
# NOTE: the file is opened in append mode, so re-running this cell adds the
# same section again.
df_verb_to_text_file(new_andin_data_df, "CKB (complex verbs).txt")

#### Generating Complex Verbs by adding (ەوە)

In [11]:
# Simple verbs followed by the derived transitive verbs, renumbered from 0.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# equivalent and gives the same result on older versions.
new_data_for_awa = pd.concat([data_df, new_andin_data_df], ignore_index=True)

In [12]:
# Preview the combined table.
new_data_for_awa.head(5)

Unnamed: 0,Infinitive,Past,Present,Intran|Tran
0,کردن,کرد,کە,1
1,کڕین,کڕی,کڕ,1
2,فڕین,فڕی,فڕ,0
3,گرتن,گرت,گر,1
4,کەوتن,کەوت,کەو,0


In [13]:
awa = "ەوە"

new_awa_data = []

for row in new_data_for_awa.to_dict("records"):
    new_awa_data.append({
        # The infinitive takes the suffix directly, as one word.
        "Infinitive": row["Infinitive"] + awa,
        # Stems keep the suffix as a separate, "_"-delimited part.
        "Past": "_".join((row["Past"], awa)),
        "Present": "_".join((row["Present"], awa)),
        # Transitivity is carried over unchanged.
        "Intran|Tran": row["Intran|Tran"],
    })

In [14]:
# Collect the suffixed verbs into a DataFrame and preview five rows.
new_awa_data_df = pd.DataFrame(new_awa_data)
new_awa_data_df.head(5)

Unnamed: 0,Infinitive,Past,Present,Intran|Tran
0,کردنەوە,کرد_ەوە,کە_ەوە,1
1,کڕینەوە,کڕی_ەوە,کڕ_ەوە,1
2,فڕینەوە,فڕی_ەوە,فڕ_ەوە,0
3,گرتنەوە,گرت_ەوە,گر_ەوە,1
4,کەوتنەوە,کەوت_ەوە,کەو_ەوە,0


In [15]:
# Write the suffixed verbs as a new section of the complex-verb file.
# NOTE: the file is opened in append mode, so re-running this cell adds the
# same section again.
df_verb_to_text_file(new_awa_data_df, "CKB (complex verbs).txt")

#### Generating Complex Verbs by adding (ڕا هەڵ دا ڕۆ وەر ڕێ پێ تێ لێ وێ )

In [16]:
# Extend the base table with the derived transitive verbs so they receive
# prefixes too. DataFrame.append was removed in pandas 2.0; pd.concat is the
# supported equivalent.
# NOTE: this rebinds data_df, so running the cell twice adds the derived
# verbs twice. Re-run from the loading cell to reset.
data_df = pd.concat([data_df, new_andin_data_df])

In [17]:
# The trailing "" entry in each list stands for "no affix on this side".
prefixes = [*"ڕا هەڵ دا ڕۆ وەر ڕێ پێ تێ لێ وێ".split(), ""]

sufixes = [*"ەوە".split(), ""]

In [18]:
# Attach every prefix/suffix combination to every verb in data_df.
generated_df = generated_by_affixes(data_df, prefixes, sufixes)

In [19]:
# Preview the generated prefixed verbs.
generated_df.head(5)

Unnamed: 0,Infinitive,Past,Present,Intran|Tran
0,ڕاکردنەوە,ڕا_کرد_ەوە,ڕا_کە_ەوە,1
1,ڕاکردن,ڕا_کرد,ڕا_کە,1
2,هەڵکردنەوە,هەڵ_کرد_ەوە,هەڵ_کە_ەوە,1
3,هەڵکردن,هەڵ_کرد,هەڵ_کە,1
4,داکردنەوە,دا_کرد_ەوە,دا_کە_ەوە,1


In [20]:
# Write the prefixed verbs as a new section of the complex-verb file.
# NOTE: the file is opened in append mode, so re-running this cell adds the
# same section again.
df_verb_to_text_file(generated_df, "CKB (complex verbs).txt")

#### Generating Complex Verbs by adding (پێدا پێوە تێدا تێوە پێڕا تێڕا لێڕا پێک تێک لێک وێک ڕێک )

In [21]:
# Second prefix set; this rebinds `prefixes` and `sufixes`. The trailing ""
# entry in each list stands for "no affix on this side".
prefixes = [*"پێدا پێوە تێدا تێوە پێڕا تێڕا لێڕا پێک تێک لێک وێک ڕێک".split(), ""]

sufixes = [*"ەوە".split(), ""]

In [22]:
# Same generation step with the second prefix set; this overwrites the
# earlier generated_df.
generated_df = generated_by_affixes(data_df, prefixes, sufixes)

In [23]:
# Preview the verbs generated from the second prefix set.
generated_df.head(5)

Unnamed: 0,Infinitive,Past,Present,Intran|Tran
0,پێداکردنەوە,پێدا_کرد_ەوە,پێدا_کە_ەوە,1
1,پێداکردن,پێدا_کرد,پێدا_کە,1
2,پێوەکردنەوە,پێوە_کرد_ەوە,پێوە_کە_ەوە,1
3,پێوەکردن,پێوە_کرد,پێوە_کە,1
4,تێداکردنەوە,تێدا_کرد_ەوە,تێدا_کە_ەوە,1


In [24]:
# Write the verbs from the second prefix set as a new section of the
# complex-verb file.
# NOTE: the file is opened in append mode, so re-running this cell adds the
# same section again.
df_verb_to_text_file(generated_df, "CKB (complex verbs).txt")