In [1]:
import csv
import pprint
import pandas as pd
import pywikibot
import re
from string import punctuation

from caphi2ipa import caphipp2ipa

# To do

* Singular nouns:
    * Add plurals
    * Handle "complex" nouns:
        * Several glosses (';')
        * Complex glosses ('_' or ' ')
    * Add IPA
* Plural nouns
* Adjectives
* Active participles (NOUN_ACT:MS)
* Passive participles (NOUN_PASS)
* INTERJ
* ADV
* NOUN:PHRASE
* Verbs
* VERB:PHRASE
* Issues:
    * شرعية: WARNING: API warning (edit): The value passed for "text" contains invalid or non-normalized data. Textual data should be valid, NFC-normalized Unicode without C0 control characters other than HT (\t), LF (\n), and CR (\r).

# Read TSV

In [4]:
# ANALYSIS tags that mark a row as a singular noun (masculine / feminine).
singular_nouns = ["NOUN:MS", "NOUN:FS"]

# Tab-separated Maknuune export, loaded in full into one DataFrame.
maknuune_file = "maknuune-v1.0.1.tsv"
df = pd.read_csv(maknuune_file, sep="\t")

# Stats by PoS

In [7]:
# Row count per ANALYSIS tag; display only the tags with more than 50 rows,
# most frequent first.
df_count = df.groupby('ANALYSIS')['ANALYSIS'].count()
df_count[df_count > 50].sort_values(ascending=False)

ANALYSIS
VERB:I             6345
VERB:C             6329
VERB:P             6191
NOUN:MS            4183
NOUN:FS            3050
ADJ:MS             2388
NOUN:PHRASE        2343
NOUN:P             1894
NOUN_ACT:MS         759
VERB:PHRASE         639
NOUN_PASS           305
ADJ:P               288
ADJ:PHRASE          249
NOUN_ACT:PHRASE     117
ADJ/NOUN            108
INTERJ              106
ADJ:FS               78
ADJ_COMP             77
ADV                  71
NOUN_PROP            65
Name: ANALYSIS, dtype: int64

# First: singular nouns deriving from a root and without notes 

In [8]:
# Rows left out on purpose, to be dealt with later:
#   * 1355 entries with notes      -> df.loc[df['NOTES'].notnull()].count()
#   * 897 entries with NTWS roots  -> df.loc[df['ROOT'] == 'NTWS'].count()
# Requiring LEMMA_BW == FORM_BW excludes the weird cases for nouns.
df_singular_nouns = df[
    df['ANALYSIS'].isin(singular_nouns)
    & df['ROOT'].ne('NTWS')
    & df['LEMMA_BW'].eq(df['FORM_BW'])
    & df['NOTES'].isna()
]

df_singular_nouns

Unnamed: 0,ID,ROOT,ROOT_NTWS,ROOT_1,LEMMA,LEMMA_SEARCH,FORM,LEMMA_BW,FORM_BW,CAPHI++,ANALYSIS,GLOSS,GLOSS_MSA,EXAMPLE_USAGE,NOTES,SOURCE,ANNOTATOR,Unnamed: 17,Unnamed: 18,Unnamed: 19
0,1,ء.ب.د,,ء,أَبَد,أبد,أَبَد,>abad,>abad,2 a b a d,NOUN:MS,eternity;forever,,,,,شهد دعباس,,,
1,2,ء.ب.ر,,ء,إِبْرِة,إبرة,إِبْرِة,<iborip,<iborip,2 i b r e,NOUN:FS,needle;injection,إِبْرَة#حُقْنَة,في إِبْرِة وقعت تحت الكنب دير بالك.#أخذت ابرة ...,,,شهد دعباس,,,
7,8,ء.ب.ط,,ء,أَبَاط,أباط,أَبَاط,>abaAT,>abaAT,2 a b aa t.,NOUN:MS,armpit,إِبْط,ولك ليش أباطك أسود هيك؟,,,شهد دعباس,,,
8,9,ء.ب.ط,,ء,بَاط,باط,بَاط,baAT,baAT,b aa t.,NOUN:MS,armpit,إِبْط,,,,شهد دعباس,,,
10,11,ء.ب.و,,ء,أَب,أب,أَب,>ab,>ab,2 a b,NOUN:MS,father,أب,الله يرحمه أبوه بقى زلمة مليح وبينشد فيه الظهر,,,شهد دعباس,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36039,36263,ي.م.ن,,ي,يَمِين,يمين,يَمِين,yamiyn,yamiyn,y a m ii n,NOUN:MS,right,يمين,روتش لك شوي لليمين,,,شهد دعباس,,,
36040,36264,ي.م.ن,,ي,يَمِين,يمين,يَمِين,yamiyn,yamiyn,y a m ii n,NOUN:MS,vow;oath,,احلف يَمِين إِنه مالكاش دخَّل!,,,شهد دعباس,,,
36044,36268,ي.م.ن,,ي,يُمُن,يمن,يُمُن,yumun,yumun,y u m u n,NOUN:MS,blessing,نِعْمَة,ألف مبارك الشهر وكل عام وانتو بخير وان شاء الل...,,,شهد دعباس,,,
36056,36280,ي.و.م,,ي,يَومِيِّة,يومية,يَومِيِّة,yawmiy~ip,yawmiy~ip,y oo m i y y e,NOUN:FS,daily_payment,دفع بنظام المياومة,,,,شهد دعباس,,,


# Wiktionary format

In [9]:
# Header emitted before an entry; `{entry}` is a str.format placeholder.
# Braces are written four at a time because one str.format pass halves them:
# `{{{{-start-}}}}` becomes `{{-start-}}`.
# NOTE(review): not referenced by any visible cell - confirm where it is formatted.
beginning_file = """'''User:A455bcd9/Maknuune/{entry}'''
{{{{-start-}}}}"""

# Wikitext skeleton of one noun entry, filled with `template.format(**entry_values)`.
# Placeholders: id, root, ipa, head, gender, transliteration, plural, glosses, example.
# Every `{{{{` / `}}}}` collapses to the `{{` / `}}` of a wikitext template call
# after formatting; single-brace names are the substituted fields.
template = """==South Levantine Arabic==
{{{{bulk import|{{{{R:ajp:Maknuune|id={id}}}}}}}}}
{{{{ajp-root|{root}}}}}

===Pronunciation===

* {{{{ajp-IPA|{ipa}}}}}

===Noun===
{{{{ajp-noun|head={head}|g={gender}|tr={transliteration}{plural}}}}}

# {glosses}{example}

===References===
* {{{{R:ajp:Maknuune}}}}"""

# Footer counterpart of `beginning_file`; contains no placeholder.
# NOTE(review): yields `{{-stop-}}` only if it is passed through str.format;
# used as-is it stays `{{{{-stop-}}}}` - confirm at the point of use.
end_file="{{{{-stop-}}}}"

# Draft of the future plural-noun layout, kept as a bare string: it is never
# assigned, so its only effect is being echoed as this cell's output.
"""
Plural:

===Noun===
{{ajp-noun|head={head}|g=p|tr={transliteration}}}
# {{plural of|ajp|{singular}|tr={singulartransliteration}|t={singulargloss}}}
# [[definition]]

"""

'\nPlural:\n\n===Noun===\n{{ajp-noun|head={head}|g=p|tr={transliteration}}}\n# {{plural of|ajp|{singular}|tr={singulartransliteration}|t={singulargloss}}}\n# [[definition]]\n\n'

# Select a random sample of entries (optional, currently disabled)

In [99]:
# df_singular_nouns = df_singular_nouns.sample(n=1000)

# CAPHI++ to Wikt

In [10]:
# Run the CAPHI++ converter over every selected row. Nothing is collected or
# printed: `ipa` is overwritten on each iteration, so the loop only shows that
# the conversion completes for all rows (uncomment the print to inspect them).
for index, row in df_singular_nouns.iterrows():
    caphipp = row['CAPHI++'].strip()
    ipa = caphipp2ipa(caphipp, mode='wikt')
    # print(row['LEMMA'], '=>', caphipp, '=>', ipa)
    
# Two hand-picked spot checks of mode='wikt'; the comment above each call gives
# the expected transliteration, the comment below it an alternative notation.
# test m a w J uu d => mawjūd (موجود)
# maw(dʒ)uːd // مَوْ ُجود
print(caphipp2ipa('m a w J uu d', mode='wikt'))
# 2 i sh t aa Q => ištāʔ (اشتاق)
# ʔiʃtaː(q)
print(caphipp2ipa('2 i sh t aa Q', mode='wikt'))

mawjūd
ʔištāʔ


# Test entries

In [11]:
# Hand-picked entry names for trial runs.
# NOTE(review): presumably LEMMA_SEARCH values (undiacritised spellings); the
# list is not referenced by any visible cell - confirm it is still needed.
test_entries = [
    'أب',
    'أباط',
    'أجار',
    'أجرة',
    'أخذ',
    'ألفة',
    'إيجار',
    'باط',
    'بلع',
    'تأثير',
    'تأجير',
    'تأجيل',
    'تجارة',
    'خرقة',
    'زهاب',
    'عنصرية',
    'غواصة',
    'قذلة',
    'كماجة',
    'لقمة',
    'مهاجر',
    'مواخذة',
    'شرعية',
    'بيئة',
    'كيلو',
    'مفاد',
    'خضرة',
    'نظرة',
    'فناء',
    'إفشاء',
    'عجلة',
    'صحصحة',
    'واعظ',
]

# Save entries in Wiktionary

## Clean glosses

In [40]:
def clean_gloss(gloss):
    """Turn a raw gloss into a short, dictionary-style English definition.

    The rules below are applied in order with ``re.sub``:

    * underscores become spaces and every run of whitespace (spaces, tabs,
      newlines) collapses to one space;
    * the abbreviations ``sb`` / ``sth`` / ``sb/sth`` are spelled out;
    * leading filler is removed (``a``, ``an``, ``the``, ``a type of``,
      ``see phrase(s)``, ``It is a/an/the``, ``It is like a/an/the``);
    * ``that is made of/from`` is shortened to ``made of/from``.

    Finally surrounding spaces and full stops are stripped.

    Args:
        gloss: raw gloss text (a single gloss, already split on ``;``).

    Returns:
        The cleaned gloss; an empty string when nothing meaningful is left
        (e.g. for ``see_phrase``).
    """
    # Order matters: abbreviations are expanded before leading articles are
    # removed, and "a type of" must be tried before the bare article "a".
    # Anchored rules use ``^\s*`` so that a gloss starting with whitespace is
    # cleaned exactly like one that does not.
    rules = [
        ("_", " "),
        # \s+ rather than ' +': some glosses contain line breaks, which would
        # otherwise survive and stop the multi-word rules below from matching.
        (r"\s+", " "),
        ('sb/sth', 'somebody or something'),
        ("(?i) sth", " something"),
        ("(?i) sb", " somebody"),
        (r"(?i)^\s*sb ", "somebody "),
        (r"(?i)^\s*sth ", "something "),
        ("(?i) some one ", " someone "),
        (r"(?i)^\s*a type of ", ""),
        (r"(?i)^\s*see phrase(s)?", ""),
        (r"(?i)^\s*a ", ""),
        (r"(?i)^\s*an ", ""),
        (r"(?i)^\s*The ", ""),
        (r"(?i)^\s*It is what ", "what "),
        (r"(?i)^\s*It is (a|an|the) ", ""),
        (r"(?i)^\s*It is like (a|an|the) ", ""),
        (' that is made of ', ' made of '),
        (' that is made from ', ' made from '),
        # repairs the split article in e.g. "in a n incomprehensible way"
        (' a n ', ' an '),
    ]

    cleaned = gloss
    for pattern, replacement in rules:
        cleaned = re.sub(pattern, replacement, cleaned)

    return cleaned.strip(' .')

def link_gloss(gloss):
    """Clean a gloss and wrap it in a wiki link when it is a single word.

    Args:
        gloss: raw gloss text, passed through ``clean_gloss`` first.

    Returns:
        ``[[word]]`` for a one-word gloss, the cleaned text unchanged for a
        multi-word gloss, and ``''`` when cleaning leaves nothing.
    """
    linked_gloss = clean_gloss(gloss)

    # A gloss without any space is a single word. The emptiness check is
    # needed because ''.split(' ') == [''], which would otherwise count as
    # one word and produce the empty link '[[]]'.
    if linked_gloss and ' ' not in linked_gloss:
        linked_gloss = "[[{g}]]".format(g=linked_gloss)

    return linked_gloss

In [41]:
def test_clean_gloss():
    """Check clean_gloss() against a table of (raw gloss, expected result) pairs.

    Raises ValueError on the first mismatch, carrying the actual and the
    expected value; returns a confirmation string when every pair matches.
    """
    cases = [
        ('a_type_of_jar_for_keeping_water_cold', 'jar for keeping water cold'),
        ('see_phrase', ''),
        ('ceramic work.', 'ceramic work'),
        ('sb_who_eats_food_off_the_ground', 'somebody who eats food off the ground'),
        ('a small ball', 'small ball'),
        ('a single person', 'single person'),
        ('a narrow path', 'narrow path'),
        ('a fat person', 'fat person'),
        ('sb/sth', 'somebody or something'),
        ('a difficult  situation', 'difficult situation'),
        ('the grated food', 'grated food'),
        ('the beautiful woman', 'beautiful woman'),
        ('a_stone_inside_a_hole', 'stone inside a hole'),
        ('an individual.', 'individual'),
        ('see phrases', ''),
        ("the_state_of_holding_something_over_sb's_head", "state of holding something over somebody's head"),
        ('a_vessel_that_is_usually_of_brass_for_kohl.', 'vessel that is usually of brass for kohl'),
        ('It_is_what_is_placed_on_the_back_of_the_walking_animal_for_the_purpose_of_riding_on_it', 'what is placed on the back of the walking animal for the purpose of riding on it'),
        ('It_is_the_ritual_animal_sacrifice_of_a_livestock_animal_in_the_first_three_days_of_the_funeral_that_is_believed_to_give_solace_to_the_deceased_person', 'ritual animal sacrifice of a livestock animal in the first three days of the funeral that is believed to give solace to the deceased person'),
        ('A_thin_layer_of_freezing_water_in_the_morning,_due_to_lower_temperatures.', 'thin layer of freezing water in the morning, due to lower temperatures'),
        ('It_is_a_traditional_type_of_dessert_that_is_made_of_dough,_water_and_molasses._It_is_usually_eaten_with_dried_figs.', 'traditional type of dessert made of dough, water and molasses. It is usually eaten with dried figs'),
        ('sth_similar_to_a_container_that_is_made_by_sewing_several_"7_u_s._u_r"_together._The_farmers_usually_keep_wheat_in_it.', 'something similar to a container that is made by sewing several "7 u s. u r" together. The farmers usually keep wheat in it'),
        ('the_person_whose_job_is_to_tell_stories_to_the_people', 'person whose job is to tell stories to the people'),
        ('It_is_like_a_basket_made_of_rubber,_or_of_wicker_and_fiber,_and has two loops that are used as handles for carrying it. People use it to carry multiple things.', 'basket made of rubber, or of wicker and fiber, and has two loops that are used as handles for carrying it. People use it to carry multiple things'),
        ('traditional Levantine dish that is made of chickpeas', 'traditional Levantine dish made of chickpeas'),
        ('small container that is made from skinned sheepskin', 'small container made from skinned sheepskin'),
        ('speak_in_a_n_incomprehensible_way', 'speak in an incomprehensible way'),
    ]
    for raw, expected in cases:
        actual = clean_gloss(raw)
        if actual != expected:
            raise ValueError('Unit test failed:', actual, expected)

    return 'clean_gloss(): OK'

# Run the table right away; the returned string is the cell's output.
test_clean_gloss()

'clean_gloss(): OK'

In [28]:
# Dry run over the selected singular nouns: build the wikitext of each entry
# and print the id and cleaned glosses of every row whose gloss is not a plain
# list of single words. Saving through pywikibot is commented out, and
# `to_save` is False in both branches, so no entry is printed or counted yet.

# site = pywikibot.Site()
i = 0  # entries emitted so far; the loop stops once this exceeds 20

for index, row in df_singular_nouns.iterrows():
    # remove _[auto] from entries generated automatically from other entries
    clean_gloss_row = row['GLOSS'].replace('_[auto]', '').strip()

    entry_values = {}
    entry_values['id'] = row['ID']
    entry_values['entry'] = row['LEMMA_SEARCH'].strip()
    entry_values['root'] = row['ROOT'].strip().replace('.', ' ')
    entry_values['head'] = row['FORM'].strip()
    entry_values['gender'] = 'f' if row['ANALYSIS'] == 'NOUN:FS' else 'm'

    # The same converted string currently feeds both the transliteration and
    # the pronunciation field of the template.
    caphipp = row['CAPHI++'].strip()
    entry_values['transliteration'] = caphipp2ipa(caphipp, mode='wikt')
    entry_values['ipa'] = entry_values['transliteration']

    entry_values['plural'] = ''

    entry_values['example'] = ''
    if not pd.isna(row['EXAMPLE_USAGE']):
        example = row['EXAMPLE_USAGE'].strip()
        # Fill example if not null
        if example != '':
            entry_values['example'] = '\n#: {{{{ux|ajp|{example}}}}}'.format(example=example)

    # Separate glosses on ';'. dict.fromkeys() removes duplicates while
    # keeping the source order; a set would make the order of senses vary
    # from run to run. Empty pieces (e.g. from 'a;;b') are dropped.
    glosses = list(dict.fromkeys(
        part.strip() for part in clean_gloss_row.split(';') if part.strip()
    ))
    if not glosses:
        # Nothing to define: skip rather than emit an empty '[[]]' link.
        continue

    to_save = False
    if '_' not in clean_gloss_row and ' ' not in clean_gloss_row:
        # Every gloss is a single word: link each one.
        entry_values['glosses'] = '[[' + ']], [['.join(glosses) + ']]'
    else:
        # Could check whether English n-grams (glosses) are already in the Wiktionary and link them
        # https://en.wiktionary.org/w/api.php?action=query&titles=test&format=json

        # Clean each gloss; drop those that clean to nothing (e.g. 'see_phrase')
        # and duplicates that only appear after cleaning.
        cleaned_glosses = dict.fromkeys(clean_gloss(x) for x in glosses)
        entry_values['glosses'] = ', '.join(g for g in cleaned_glosses if g)
        print(entry_values['id'], entry_values['glosses'])
        to_save = False

    new_entry = template.format(**entry_values)

    # page_name = u"User:A455bcd9/Maknuune/{entry}".format(entry=entry_values['entry'])
    # page = pywikibot.Page(site, page_name)

    # page.text = new_entry
    if to_save:
        print(new_entry)
        # page.save(u"Entries with multiple glosses and examples")
        i += 1

    if i > 20:
        break

print('end')

28 father Pope, daddy
30 father of, having a particular quality, possessor of
36 furnishing (a place)
66 Hasanat (Credit for good deeds, which Allah weighs up against one's bad deeds at the final judgement after death)
107 taking something
119 Day of the Judgment
122 stable (horses)
123 obligation, in the form of money or possessions paid by the groom, to the bride at the time of Islamic divorce
161 have the spirit of brotherhood (usually between a man and a woman)
177 brother of, brother
206 Adhan, call for prayer
207 Adhan, call for prayer
208 caller of prayer
255 land, territory, land plot
256 land, territory, land plot
283 It is an alternative form of the cloak made of white linen fabric or pure cotton
301 state of being a teacher, professorship
382 0.5 pound
386 getting used to something, acclimatizing oneself to something
456 making something up, composing, lying, authoring
496 prince, noble person
534 Imam is an Islamic leadership position. Imam is usually the person who leads t

10113 craving something
10114 trickling down, dribbling down
10125 dessert dish consists of two layers: the first consists of: milk, sugar, starch, semolina, and cream, Madlouka (dessert), second layer consists of water, sugar, lemon, and kunafa dough. Adorned with almonds, pine nuts and pistachios
10137 soapstone that is used to flatten the ground
10146 being guided, being directed, using something as a piece of evidence
10172 state of being very cloudy, leaden, overcast
10183 key hook
10184 teabag, key hook
10185 grape tree
10186 grape leaves
10187 varicose disease
10201 rummaging through something
10207 short coat worn over the qunbaaz, which has long sleeves
10221 big stone
10262 bracelet made of gold, bracelet
10264 cloak with golden embroidery
10289 cloak with golden embroidery
10290 cloak with golden embroidery
10417 running over something or somebody
10436 pedlar, person who walks from place to place (sometimes he rides a donkey) selling small things
10458 livestock feed contai

19117 large shallow basin made of metal or plastic and that is used for kneeding, laundry or bathing
19118 large shallow basin made of metal or plastic and that is used for kneeding, laundry or bathing
19131 very small pot in an open-top earthen oven known as "Tabun"
19154 horn
19200 state of being full
19212 water barrel
19220 embarrassing situation
19232 bothering somebody and coercing him into leaving something or somebody
19240 women's hadband that is collocated by coins in two rows. In the back, four pieces of coins are collocated and are usually larger than coins placed in the front
19241 upper part of the boundary wall/fence that seperates between territories
19268 ritual, special habit
19282 door knocker
19283 door knocker
19298 
19306 cracking sound
19343 habitual visit
19352 going out for a picnic or shopping, uphill
19356 rope used to climb the palm tree
19393 small window on top of a room
19395 number of times being divorced
19398 labor pain
19411 (not) at all, absolutely, 

26926 arguing with somebody in a disrespectful way
26935 Islamic Sacrifices that the people of the neighboring villages bring with them for funerals or weddings
26944 container gardening
26945 plants' pot
26946 hollowing something out, scooping something out
26947 vegetable corer
26953 gains, stolen money
26961 vessel made of straw and that has a cover. People use it to take food or fruit to other people's house
26971 
26973 
27097 dry leaves that ignite fire
27104 barter deal
27131 status, social class
27151 depression, deep sadness
27180 Kibbe, kubba (kibbeh is usually made by pounding bulgur wheat together with meat into a fine paste and forming it into balls with toasted pine nuts and spices)
27181 pouring rain, throwing something, throwing something in the trash
27182 forming something into a ball (similar to Kibbe)
27183 Kabab (it consists cooked meat dish, with its origins in Middle Eastern cuisines)
27184 throwing something into the trash
27199 sexual frustration
27209 sheep's 

35508 temptation, instilling suspicion
35526 about to
35571 taking somebody from one place to another, ride, picking up somebody
35572 one of several songs or other items in a performance, each followed by an interval
35573 parvenu, state of being opportunist
35602 order, making an order
35616 Wudu (ablution)
35637 humble, state of being down to earth
35665 residence, native country
35675 domiciliation, settling somebody somewhere
35742 preaching people, giving religious sermons and advice
35754 mild illness
35758 enlightening, raising awareness, making aware
35837 passing away, death
35838 meeting (conditions or requirements)
35884 mistake, misstep, bad situation where other people are involved, obstacle, falling down, trouble
35915 charity to be named after somebody
35929 hat made of the cloth of the dress worn and decoratively embroidered. Some golden or silver coins are attached to it. It is tied with a string from under the chin, and is usually worn on social occasions
35931 ounce

In [42]:
"""
issues:

see_phrases
see_phrase:
33175
33177 
33179 
735
968
1212
2202
13973

New line:
2955 state of being filled to the max or stuffed
6194 traditional dish that is made of unripe grapes and lentils
35498 being handsome
"""

'\nissues:\n\nsee_phrases\nsee_phrase:\n33175\n33177 \n33179 \n735\n968\n1212\n2202\n13973\n\nNew line:\n2955 state of being filled to the max or stuffed\n6194 traditional dish that is made of unripe grapes and lentils\n35498 being handsome\n'