# Stemming


## Approach 1: using Supervised learning

Let's demonstrate how a `RandomForest` classifier can predict the stem of a word without having been trained on that specific word.

In [1]:
# Amharic surface forms (inflected / derived words) used as the model inputs.
# The list is index-aligned with `stemmed`: un_stemmed[i] is paired with
# stemmed[i] when the training DataFrame is built.
un_stemmed = [
  "ወንበር",
  "ወንበሬ",
  "ወንበርህ",
  "ወንበርሽ",
  "ወንበሩ",
  "ወንበሯ",
  "ወንበራችን",
  "ወንበራችሁ",
  "ወንበራቸው",
  "ወንበሮች",
  "ወንበሮቼ",
  "ወንበሮችህ",
  "ወንበሮችሽ",
  "ወንበሮቹ",
  "ወንበሮቿ",
  "ወንበሮቻችን",
  "ወንበሮቻችሁ",
  "ወንበሮቻቸው",
  "ልጅ",
  "የልጅ",
  "ልጆች",
  "ልጆቻችን",
  "ልጆቻቸው",
  "ልጆቻቸውን",
  "ቤቶች",
  "ውል",
  "ጎረቤቶቻችን",
  "ፈለገ",
  "ፈለገችው",
  "ፈላለገ",
  "ሰዳደበ",
  "ቀዳደደ",
  "ሸፋፈነ",
  "ቅጠል",
  "ቅጠሎች",
  "ቅጠላቅጠል",
  "ወጣወጥ",
  "ስለማይለወጥ",
  "ተበላ",
  "ተፈላለገ",
  "ተመካከረ",
  "ትቢተኛነት",
  "ተፎካከረ",
  "ተመሳሰለ",
  "ተስተካከለ", 
  "ተንከባከበ", 
  "መስተካከል", 
  "መንከባከብ",
  "ጠቢብ",
  "ጠባብ",
  "ትምህርት",
  "በኢትዮጵያ",
  "የኢትዮጵያ",
  "ዳቦዎች",
  "ዳቦ",
]
# Target stems, index-aligned with `un_stemmed`: stemmed[i] is the label for
# un_stemmed[i].
# NOTE(review): some labels are spelled with different characters than their
# inputs (e.g. "ሠደበ" for "ሰዳደበ", "ኢትዮጵይአ" for "በኢትዮጵያ") — confirm these
# spellings are intentional.
stemmed = [
  "ወንበር",
  "ወንበር",
  "ወንበር",
  "ወንበር",
  "ወንበር",
  "ወንበር",
  "ወንበር",
  "ወንበር",
  "ወንበር",
  "ወንበር",
  "ወንበር",
  "ወንበር",
  "ወንበር",
  "ወንበር",
  "ወንበር",
  "ወንበር",
  "ወንበር",
  "ወንበር",
  "ልጅ",
  "ልጅ",
  "ልጅ",
  "ልጅ",
  "ልጅ",
  "ልጅ",
  "ቤት",
  "ውል",
  "ጎረቤት",
  "ፈለገ",
  "ፈለገ",
  "ፈለገ",
  "ሠደበ",
  "ቀደደ",
  "ሸፈነ",
  "ቅጠል",
  "ቅጠል",
  "ቅጠል",
  "ወጥ",
  "ለወጥ",
  "በላ",
  "ፈለገ",
  "መከረ",
  "ትብኢት",
  "ፎከረ",
  "መሠለ",
  "ሥሥትከለ",
  "ንከባከበ",
  "ተከል",
  "መንከባከብ",
  "ጠቢብ",
  "ጠባብ",
  "ትምህርት",
  "ኢትዮጵይአ",
  "ኢትዮጵይአ",
  "ዳቦ",
  "ዳቦ",
]

In [2]:
import pandas as pd
# Two-column training table: inputs in "before", gold stems in "After".
for_stem = pd.DataFrame(data=dict(before=un_stemmed, After=stemmed))
for_stem.head(n=20)

Unnamed: 0,before,After
0,ወንበር,ወንበር
1,ወንበሬ,ወንበር
2,ወንበርህ,ወንበር
3,ወንበርሽ,ወንበር
4,ወንበሩ,ወንበር
5,ወንበሯ,ወንበር
6,ወንበራችን,ወንበር
7,ወንበራችሁ,ወንበር
8,ወንበራቸው,ወንበር
9,ወንበሮች,ወንበር


In [3]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from xgboost import XGBClassifier   


# Every sample is a single word, so the vectorizer has to look *inside* the
# word.  With the default word-level analyzer each distinct word becomes its
# own feature, and a word that was not in the training data is transformed to
# an all-zero vector that carries no information about it.  Character n-grams
# taken within word boundaries let different inflections of one stem share
# features, which is what allows generalisation to unseen forms.
model_0 = Pipeline([
    ('tifdf', TfidfVectorizer(analyzer='char_wb', ngram_range=(1, 4))),
    ('clf', RandomForestClassifier())
])

# Train on the surface forms ("before") with the stems ("After") as labels.
model_0.fit(for_stem.before, for_stem.After)

As we can see, `ወንበሮቻችን` is in the training data, so the model predicts its stem correctly — it may simply have memorized the example. But what happens if we remove `ወንበሮቻችን` from the training data and then try to predict its stem?


In [4]:
# Query the fitted pipeline with the bare word (no quote characters embedded
# in the string), i.e. in the same form as the training entries.
model_0.predict(['ወንበሮቻችን'])

array(['ወንበር'], dtype=object)

In [1]:
# Second copy of the surface forms with "ወንበሮቻችን" left out, so that word can
# be used as an unseen query after retraining.  Index-aligned with `stemmed2`.
un_stemmed2 = [
  "ወንበር",
  "ወንበሬ",
  "ወንበርህ",
  "ወንበርሽ",
  "ወንበሩ",
  "ወንበሯ",
  "ወንበራችን",
  "ወንበራችሁ",
  "ወንበራቸው",
  "ወንበሮች",
  "ወንበሮቼ",
  "ወንበሮችህ",
  "ወንበሮችሽ",
  "ወንበሮቹ",
  "ወንበሮቿ",
  "ወንበሮቻችሁ",
  "ወንበሮቻቸው",
  "ልጅ",
  "የልጅ",
  "ልጆች",
  "ልጆቻችን",
  "ልጆቻቸው",
  "ልጆቻቸውን",
  "ቤቶች",
  "ውል",
  "ጎረቤቶቻችን",
  "ፈለገ",
  "ፈለገችው",
  "ፈላለገ",
  "ሰዳደበ",
  "ቀዳደደ",
  "ሸፋፈነ",
  "ቅጠል",
  "ቅጠሎች",
  "ቅጠላቅጠል",
  "ወጣወጥ",
  "ስለማይለወጥ",
  "ተበላ",
  "ተፈላለገ",
  "ተመካከረ",
  "ትቢተኛነት",
  "ተፎካከረ",
  "ተመሳሰለ",
  "ተስተካከለ", 
  "ተንከባከበ", 
  "መስተካከል", 
  "መንከባከብ",
  "ጠቢብ",
  "ጠባብ",
  "ትምህርት",
  "በኢትዮጵያ",
  "የኢትዮጵያ",
  "ዳቦዎች",
  "ዳቦ",
]
# Target stems for `un_stemmed2` (stemmed2[i] is the label for un_stemmed2[i]).
# It has one "ወንበር" entry fewer than `stemmed`, matching the held-out word.
stemmed2 = [
  "ወንበር",
  "ወንበር",
  "ወንበር",
  "ወንበር",
  "ወንበር",
  "ወንበር",
  "ወንበር",
  "ወንበር",
  "ወንበር",
  "ወንበር",
  "ወንበር",
  "ወንበር",
  "ወንበር",
  "ወንበር",
  "ወንበር",
  "ወንበር",
  "ወንበር",
  "ልጅ",
  "ልጅ",
  "ልጅ",
  "ልጅ",
  "ልጅ",
  "ልጅ",
  "ቤት",
  "ውል",
  "ጎረቤት",
  "ፈለገ",
  "ፈለገ",
  "ፈለገ",
  "ሠደበ",
  "ቀደደ",
  "ሸፈነ",
  "ቅጠል",
  "ቅጠል",
  "ቅጠል",
  "ወጥ",
  "ለወጥ",
  "በላ",
  "ፈለገ",
  "መከረ",
  "ትብኢት",
  "ፎከረ",
  "መሠለ",
  "ሥሥትከለ",
  "ንከባከበ",
  "ተከል",
  "መንከባከብ",
  "ጠቢብ",
  "ጠባብ",
  "ትምህርት",
  "ኢትዮጵይአ",
  "ኢትዮጵይአ",
  "ዳቦ",
  "ዳቦ",
]

#### After removing `ወንበሮቻችን`

In [7]:
# Training table built from the lists that leave the held-out word out.
for_stem2 = pd.DataFrame(data=dict(before=un_stemmed2, After=stemmed2))
for_stem2.head(n=20)

Unnamed: 0,before,After
0,ወንበር,ወንበር
1,ወንበሬ,ወንበር
2,ወንበርህ,ወንበር
3,ወንበርሽ,ወንበር
4,ወንበሩ,ወንበር
5,ወንበሯ,ወንበር
6,ወንበራችን,ወንበር
7,ወንበራችሁ,ወንበር
8,ወንበራቸው,ወንበር
9,ወንበሮች,ወንበር


#### The model makes the right prediction, but this approach is not feasible in practice because we cannot obtain a complete list of words paired with their stems. This is one of the challenges of working with the Amharic language.

In [8]:
# Training table that does NOT contain the word queried at the end of the cell.
for_stem_proof = pd.DataFrame({"before" : un_stemmed2, 'After' : stemmed2})

# Character n-grams (within word boundaries) are used instead of the default
# word-level analyzer.  With whole-word features, a word absent from the
# training data is transformed to an all-zero vector, so the prediction could
# not depend on the word at all; sub-word features let the unseen inflection
# share evidence with the other forms of its stem.
model_0 = Pipeline([
    ('tifdf', TfidfVectorizer(analyzer='char_wb', ngram_range=(1, 4))),
    ('clf', RandomForestClassifier())
])

model_0.fit(for_stem_proof.before, for_stem_proof.After)
# Predict the stem of the held-out word.
model_0.predict(['ወንበሮቻችን'])

array(['ወንበር'], dtype=object)

## Approach 2: Rule based

#### Let's demonstrate how stemming could work using a small dataset

In [16]:
trial = []

# Endings removed from the end of a word.  They are tried one after another in
# list order, and every ending that matches at that point is removed, so more
# than one ending can be stripped from the same word.
Amharic_suffix = [
    'ቻችን', 'ቻችሁ', 'ቻቸው', 'ቻቸውን',
    'ህ', 'ሽ', 'ችን', 'ችሁ', 'ቸው',
    'ች', 'ቼ', 'ቿ', 'ችው', 'ቹ',
]
# Leading elements removed from the start of a word, handled the same way.
Amharic_prefix = ['የ', 'በ']

# Try the rules on the first 20 sample words and print what is left of each.
for word in un_stemmed[:20]:
    for suffix in Amharic_suffix:
        word = word.removesuffix(suffix)

    for prefix in Amharic_prefix:
        word = word.removeprefix(prefix)

    print(word)

ወንበር
ወንበሬ
ወንበር
ወንበር
ወንበሩ
ወንበሯ
ወንበራ
ወንበራ
ወንበራ
ወንበሮ
ወንበሮ
ወንበሮ
ወንበሮ
ወንበሮ
ወንበሮ
ወንበሮ
ወንበሮ
ወንበሮ
ልጅ
ልጅ


In [21]:
def stemmer(word, suffixes=None, prefixes=None):
    """Strip known Amharic affixes from *word* and return what is left.

    Every suffix is tried once, in list order, and removed when the word ends
    with it; afterwards every prefix is tried the same way.  An affix is only
    removed when at least one character would remain, so a word that consists
    solely of an affix (for example ``'በ'``) is returned unchanged instead of
    being reduced to an empty string.

    Args:
        word: The word to stem.
        suffixes: Suffixes to strip.  Defaults to the module-level
            ``Amharic_suffix`` list.
        prefixes: Prefixes to strip.  Defaults to the module-level
            ``Amharic_prefix`` list.

    Returns:
        The word with all matching affixes removed.
    """
    if suffixes is None:
        suffixes = Amharic_suffix
    if prefixes is None:
        prefixes = Amharic_prefix

    for suffix in suffixes:
        # Guard: never strip the whole word away.
        if len(word) > len(suffix):
            word = word.removesuffix(suffix)
    for prefix in prefixes:
        if len(word) > len(prefix):
            word = word.removeprefix(prefix)
    return word

In [18]:
# Load the word table that the rule-based stemmer is applied to, and preview it.
to_stem = pd.read_csv(filepath_or_buffer='Normalized.csv')
to_stem.head(n=5)

Unnamed: 0.1,Unnamed: 0,words,frequency,Rank,Constant c
0,86822,ነው,38647,1,0.013328
1,22581,ላይ,20484,2,0.014128
2,137129,ወደ,19466,3,0.020139
3,86470,ነበር,15752,4,0.021729
4,179928,ጊዜ,14461,5,0.024935


In [20]:
# Pull the `words` column out as a plain Python list and peek at the first entries.
words = to_stem["words"].tolist()
words[:5]

['ነው', 'ላይ', 'ወደ', 'ነበር', 'ጊዜ']

In [22]:
# Stem every word, keeping the results in the same order as `words` so they
# can be attached to the table row by row.
stemmed_words = [stemmer(word) for word in words]

In [23]:
# Attach the stems as a new 'Stemmed words' column and preview the table.
to_stem['Stemmed words'] = stemmed_words
to_stem.head()

Unnamed: 0.1,Unnamed: 0,words,frequency,Rank,Constant c,Stemmed words
0,86822,ነው,38647,1,0.013328,ነው
1,22581,ላይ,20484,2,0.014128,ላይ
2,137129,ወደ,19466,3,0.020139,ወደ
3,86470,ነበር,15752,4,0.021729,ነበር
4,179928,ጊዜ,14461,5,0.024935,ጊዜ


In [24]:
# Write the table to disk.
# NOTE(review): to_csv is called without index=False, so the row index is
# written as an extra unnamed leading column — presumably the origin of the
# 'Unnamed: 0' columns seen when these CSV files are read back; confirm with
# downstream readers before changing.
to_stem.to_csv('Stemmed words.csv')

## Approach 3: using the library `HornMorpho`

* We will be using `HornMorpho` from `https://github.com/hltdi/HornMorpho?tab=readme-ov-file` to stem our words


In [3]:
import hm
import threading
import pandas as pd



@@@@ This is HornMorpho+, version 5.0 @@@@



In [7]:
def stemmer(word):
    """Return the lemma HornMorpho reports for *word*, or *word* itself.

    The word is analysed with ``hm.anal('a', word)`` and the ``'lemma'`` entry
    of the first analysis is returned.  If that fails for any reason (no
    analysis returned, no ``'lemma'`` entry, an error raised inside the
    analyser), the input is returned unchanged so bulk processing keeps going.

    Args:
        word: The surface form to analyse.

    Returns:
        The lemma of the first analysis, or *word* when none is available.
    """
    try:
        # 'a' is the language code passed to HornMorpho
        # (presumably Amharic — TODO confirm against the HornMorpho docs).
        return hm.anal('a', word)[0]['lemma']
    except Exception:
        # Best-effort fallback.  Catching ``Exception`` rather than using a
        # bare ``except`` lets KeyboardInterrupt / SystemExit propagate, so a
        # long batch run can still be interrupted.
        return word

### To demonstrate how the function works

In [8]:
# Example: run the HornMorpho-backed `stemmer` on one inflected word and print the result.
print(stemmer("ጎረቤቶቻችን"))

ጎረቤት


In [9]:
# Second example: a word carrying several affixes.
print(stemmer("ስለማይለወጥ"))

ተለወጠ


In [3]:
# Load the word table and keep everything except the 'Rank' and 'Constant c' columns.
toBe_stemmed = pd.read_csv('for_stat.csv').drop(columns=['Rank', 'Constant c'])


In [5]:
import threading
from concurrent.futures import ThreadPoolExecutor


def process_chunk(df_chunk):
    """Add a ``'stemmed'`` column to *df_chunk* and return the chunk.

    Args:
        df_chunk: A DataFrame with a ``"words"`` column.

    Returns:
        The same DataFrame object, with ``'stemmed'`` holding
        ``stemmer(word)`` for every row.
    """
    df_chunk['stemmed'] = df_chunk["words"].apply(stemmer)
    return df_chunk


# Size of the worker pool.  The table is split into at most this many chunks,
# so the number of threads stays small and fixed instead of growing with the
# number of rows.
num_threads = 8

# Ceiling division, clamped to at least 1: a chunk size of 0 would make the
# range() step below invalid when the table has fewer rows than workers.
chunk_size = max(1, -(-len(toBe_stemmed) // num_threads))

# Each chunk is an independent copy, so adding a column to it does not write
# into a slice of the original table.
df_chunks = [
    toBe_stemmed.iloc[i:i+chunk_size].copy()
    for i in range(0, len(toBe_stemmed), chunk_size)
]

# Stem every chunk exactly once.  executor.map() returns the results in chunk
# order and re-raises any exception from a worker instead of losing it.
with ThreadPoolExecutor(max_workers=num_threads) as executor:
    stemmed_chunks = list(executor.map(process_chunk, df_chunks))

if stemmed_chunks:
    # Assignment aligns on the row index, which the chunks inherit from the table.
    toBe_stemmed['stemmed'] = pd.concat(stemmed_chunks)['stemmed']
else:
    # Empty table: pd.concat() rejects an empty list, so add an empty column.
    toBe_stemmed['stemmed'] = pd.Series(dtype=object)

Loading FSTs for አማርኛ
Loading FSTs for አማርኛ
Loading FSTs for አማርኛ
Loading FSTs for አማርኛ
Loading FSTs for አማርኛ
Loading FSTs for አማርኛ
Loading FSTs for አማርኛ
Loading FSTs for አማርኛ
Loading FSTs for አማርኛ
Loading FSTs for አማርኛ
Loading FSTs for አማርኛ
Loading FSTs for አማርኛ
Loading FSTs for አማርኛ
Loading FSTs for አማርኛ
Loading FSTs for አማርኛ
Loading FSTs for አማርኛ
Loading FSTs for አማርኛ
Loading FSTs for አማርኛ
Loading FSTs for አማርኛ
Loading FSTs for አማርኛ
Loading FSTs for አማርኛ
Loading FSTs for አማርኛ
Loading FSTs for አማርኛ
Loading FSTs for አማርኛLoading FSTs for አማርኛ

Loading FSTs for አማርኛ
Loading FSTs for አማርኛ
Loading FSTs for አማርኛ
Loading FSTs for አማርኛ
Loading FSTs for አማርኛ
Loading FSTs for አማርኛLoading FSTs for አማርኛ

Loading FSTs for አማርኛ
Loading FSTs for አማርኛ
Loading FSTs for አማርኛ
Loading FSTs for አማርኛLoading FSTs for አማርኛ

Loading FSTs for አማርኛ
Loading FSTs for አማርኛ
Loading FSTs for አማርኛ
Loading FSTs for አማርኛ
Loading FSTs for አማርኛ
Loading FSTs for አማርኛ
Loading FSTs for አማርኛ
Loading FSTs for አማርኛ
Loading FS