## Importing the extracted data

In [2]:
import pandas as pd 
# Read the extracted word table from disk and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='for_stat.csv')
df.head(n=5)

Unnamed: 0,words,frequency,Rank,Constant c
0,ነው,38534,1,0.01088
1,ላይ,20484,2,0.011568
2,ወደ,19466,3,0.016489
3,ነበር,15752,4,0.017791
4,ጊዜ,14461,5,0.020416


### Further analysis shows that words with a frequency of 1 are not useful, so we drop them

In [3]:
# Display every row whose frequency is exactly 1.
df.loc[df['frequency'].eq(1)]

Unnamed: 0,words,frequency,Rank,Constant c
203030,ቴኔቀ,1,203031,0.057327
203031,›።ወ+ቲቕቹ,1,203032,0.057327
203032,ቴቕፊዛ,1,203033,0.057327
203033,ቴህሁመሠህ,1,203034,0.057328
203034,ልሃ”“ሄዛ,1,203035,0.057328
...,...,...,...,...
851337,ከብዛሻውኀ,1,851338,0.240380
851338,ይጉደል,1,851339,0.240380
851339,ፃናጧለውኀ,1,851340,0.240380
851340,ናጧያ2ስተጋቡና,1,851341,0.240381


In [4]:
# Drop every row whose frequency is exactly 1. The explicit copy makes `df`
# an independent frame rather than a filtered slice of the loaded table, so
# later column assignments on `df` do not raise SettingWithCopyWarning.
df = df[df['frequency'] != 1].copy()

## Why we need normalization

In [5]:
# Preview the first five rows whose word contains the character 'ው'.
df[df['words'].str.contains('ው')].head(n=5)

Unnamed: 0,words,frequency,Rank,Constant c
0,ነው,38534,1,0.01088
7,ውስጥ,12289,8,0.027759
11,ሰው,9261,12,0.031379
30,ናቸው,4765,31,0.041708
43,ያለው,3263,44,0.040538


In [6]:
# Preview the first five rows whose word contains the character 'ዉ'.
df[df['words'].str.contains('ዉ')].head(n=5)

Unnamed: 0,words,frequency,Rank,Constant c
1971,ዉ,172,1972,0.09577
3113,ነዉ,113,3114,0.099356
8145,ዉሃ,44,8146,0.101203
9826,ተዉ,36,9827,0.099889
10729,ሰዉ,33,10730,0.099979


### As we can see from the above, `ነዉ` and `ነው` are two different spellings of the same word, so we need to normalize the text.

## Creating a dictionary to Normalize the text

In [7]:
# Each table maps a variant character to the single form kept after
# normalization. Keys (first string) and values (second string) are paired
# position by position.

# ሐ-series and ኀ-series -> ሀ-series
dict1 = dict(zip("ሐሑሒሓሔሕሖኀኁኂኃኄኅኆ",
                 "ሀሁሂሃሄህሆሀሁሂሃሄህሆ"))

# ሠ-series -> ሰ-series
dict2 = dict(zip("ሠሡሢሣሤሥሦሧ",
                 "ሰሱሲሳሴስሶሷ"))

# ዐ-series -> አ-series
dict3 = dict(zip("ዐዑዒዓዔዕዖ",
                 "አኡኢኣኤእኦ"))

# ጸ-series -> ፀ-series
dict4 = dict(zip("ጸጹጺጻጼጽጾ",
                 "ፀፁፂፃፄፅፆ"))

# ዉ -> ው
dict5 = dict([('ዉ', 'ው')])

# Combine the five tables into one lookup; no key appears in more than one
# table, so no entry is overwritten.
merged_dict = {**dict1, **dict2, **dict3, **dict4, **dict5}


## Normalization

In [8]:
# Row and column counts of `df` before normalization is applied.
df.shape

(203030, 4)

#### Notice how the two words will be the same after normalization

In [9]:
# Show the row whose word is exactly 'ነው'.
df[df['words'].eq('ነው')]

Unnamed: 0,words,frequency,Rank,Constant c
0,ነው,38534,1,0.01088


In [10]:
# Show the row whose word is exactly 'ነዉ' (the variant spelling).
df[df['words'].eq('ነዉ')]

Unnamed: 0,words,frequency,Rank,Constant c
3113,ነዉ,113,3114,0.099356


#### We keep only the `words` and `frequency` columns. `Rank` and `Constant c` are dropped because they change after normalization and are recalculated later

In [11]:
# Columns carried into normalization.
col = ['words', 'frequency']
# Independent two-column copy of `df`, so changes to it leave `df` untouched.
df_to_Normalize = df.loc[:, col].copy()

#### As you can see, the number of rows changed from 203030 to 197121 after normalization

In [12]:
def replace_chars(word):
    """Return `word` with every key of `merged_dict` replaced by its value.

    Each key found anywhere in the string is substituted, one key at a
    time, in the dictionary's iteration order.
    """
    normalized = word
    for variant in merged_dict:
        normalized = normalized.replace(variant, merged_dict[variant])
    return normalized

# Normalize the two-column working copy instead of `df`. `df` is a filtered
# slice that earlier cells display, so assigning into it would change what
# those cells show on re-run and raise SettingWithCopyWarning.
df_to_Normalize['words'] = df_to_Normalize['words'].apply(replace_chars)

# Rows that now share the same normalized word are merged into one row whose
# frequency is the sum of the merged rows' frequencies.
df_Normalize = df_to_Normalize.groupby('words')['frequency'].sum().reset_index()
df_Normalize.shape

(197121, 2)

#### Notice how the frequency is the sum of the frequencies of `ነዉ` and `ነው` (113 + 38534 = 38647)

In [13]:
# Show the merged row for 'ነው' in the normalized table.
df_Normalize[df_Normalize['words'].eq('ነው')]

Unnamed: 0,words,frequency
86821,ነው,38647


#### Now we re-calculate the rank and constant c

In [14]:
# Order the normalized table from most to least frequent word.
df_Normalize = df_Normalize.sort_values('frequency', ascending=False)

In [15]:
# Ranks 1..N assigned by row position. This assumes `df_Normalize` is
# already sorted by descending frequency.
rank = [position for position in range(1, df_Normalize.shape[0] + 1)]
df_Normalize = df_Normalize.assign(Rank=rank)

In [16]:
# Constant c = (frequency / total frequency) * Rank, computed row by row.
df_Normalize['Constant c'] = (
    df_Normalize['frequency']
    .div(df_Normalize['frequency'].sum())
    .mul(df_Normalize['Rank'])
)
df_Normalize.head(n=5)

Unnamed: 0,words,frequency,Rank,Constant c
86821,ነው,38647,1,0.013357
22580,ላይ,20484,2,0.014159
137128,ወደ,19466,3,0.020184
86469,ነበር,15752,4,0.021777
179927,ጊዜ,14461,5,0.02499


### We will save the normalized data in csv format

In [17]:
# Write the normalized table to disk; the row index is written as the first
# column.
df_Normalize.to_csv(path_or_buf='Normalized.csv')

In [18]:
# Rows at positions 15 through 8999 of the ranked table.
# NOTE(review): the bounds 15 and 9000 are not explained in this notebook;
# confirm the intended cut-offs.
index = df_Normalize.iloc[15:9000]

### We will save the normalized index in csv format

In [20]:
# Write the selected slice to disk; the row index is written as the first
# column.
index.to_csv(path_or_buf='Normalized_index.csv')
