## Performing text operations

In [1]:
import os
import spacy 
from spacy.lang.am import Amharic
from collections import Counter
import pandas as pd

# Amharic language object used for tokenisation; max_length is raised so that
# spaCy does not reject very long input texts.
nlp = Amharic()
nlp.max_length = 20000000

# NOTE(review): machine-specific absolute path - consider a configurable data directory.
data_path = r'C:\Users\user\Documents\Data_science\IR real\Demo_data\Updated_Demo'

books = os.listdir(data_path)

# Every non-punctuation token from every book, in reading order.
tokens = []

for book in books:
    book_path = os.path.join(data_path, book)
    # os.listdir also returns sub-directories; open() would raise on them.
    if not os.path.isfile(book_path):
        continue
    with open(book_path, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.strip()
            # An empty line yields no tokens, so skip the tokenizer call.
            if not line:
                continue
            doc = nlp(line)
            tokens.extend(token.text for token in doc if not token.is_punct)

# Word -> number of occurrences across the whole corpus.
freq_counter = Counter(tokens)

# One row per distinct token, indexed by the token text.
df = pd.DataFrame(
    {"words": list(freq_counter.keys()), "frequency": list(freq_counter.values())}
).set_index('words')




## Removing numerical values


In [2]:
# Move 'words' from the index into a regular column so the .str accessor can
# be used on it. The guard keeps the cell safe to re-run: resetting again
# would otherwise add a spurious 'index' column.
if 'words' not in df.columns:
    df = df.reset_index()

# Number of tokens made up solely of numeric characters.
df['words'].str.isnumeric().sum()

17720

### Drop the tokens that consist solely of numeric characters

In [3]:
# Boolean mask: True where the token is purely numeric.
numeric_mask = df['words'].str.isnumeric()

# Keep only the non-numeric tokens.
df = df.loc[~numeric_mask]

# Sanity check: no numeric tokens should remain.
df['words'].str.isnumeric().sum()

0

In [4]:
# Restore 'words' as the index. The guard makes the cell re-runnable: once
# 'words' is the index it is no longer a column and set_index would raise
# KeyError.
if 'words' in df.columns:
    df = df.set_index('words')

## Sorting the word–frequency DataFrame by frequency (and dropping stray tokens) for easier visualization

In [15]:
# Order the words from most to least frequent and remove the two stray
# non-word tokens (a blank and '='). errors='ignore' keeps the cell
# re-runnable: on a second run the labels are already gone and a plain
# drop would raise KeyError.
df = (
    df.sort_values(by='frequency', ascending=False)
      .drop(index=[' ', '='], errors='ignore')
)
df.head()

Unnamed: 0_level_0,frequency,Rank,Constant c
words,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ነው,38534,1,0.010861
ላይ,20484,2,0.011547
ወደ,19466,3,0.016459
ነበር,15752,4,0.017758
ጊዜ,14461,5,0.020379


## Creating the Rank

In [16]:
# Positions 1..N in the current row order (rows are expected to be sorted by
# descending frequency by the preceding cell).
rank = [position + 1 for position in range(df.shape[0])]
df['Rank'] = rank

## Calculating the normalized constant (relative frequency × rank)

In [17]:
# Constant c = (word frequency / total token count) * rank.
total_frequency = df['frequency'].sum()
relative_frequency = df['frequency'] / total_frequency
df['Constant c'] = relative_frequency * df['Rank']
df.head()

Unnamed: 0_level_0,frequency,Rank,Constant c
words,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ነው,38534,1,0.01088
ላይ,20484,2,0.011568
ወደ,19466,3,0.016489
ነበር,15752,4,0.017791
ጊዜ,14461,5,0.020416


## Saving to csv format  

In [18]:
# Persist the full table (index 'words' plus all columns) in the working directory.
df.to_csv(path_or_buf='for_stat.csv')

## Total number of unique words


In [19]:
# Row count = number of distinct tokens left in the table.
len(df)

851342

## These are the total number of words


In [20]:
# Sum of all per-word counts = total number of tokens in the table.
df['frequency'].sum()

3541639

## The top 10 words

In [21]:
# First ten rows of the table in its current order.
df.iloc[:10]

Unnamed: 0_level_0,frequency,Rank,Constant c
words,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ነው,38534,1,0.01088
ላይ,20484,2,0.011568
ወደ,19466,3,0.016489
ነበር,15752,4,0.017791
ጊዜ,14461,5,0.020416
ግን,13221,6,0.022398
ነገር,12456,7,0.024619
ውስጥ,12289,8,0.027759
ሁሉ,11870,9,0.030164
እንደ,11036,10,0.031161


## Saving the index files to csv
* The reasoning behind the chosen cut-offs (rows 15 to 9000 of the ranked table, i.e. ranks 16–9000) is illustrated in `Statistical analysis.ipynb`


In [32]:
# Positional slice of the ranked table: skips the first 15 rows and keeps
# everything up to (not including) position 9000.
index = df.iloc[15:9000]

## The top 5 index terms

In [33]:
# First five rows of the selected index terms.
index.iloc[:5]

Unnamed: 0_level_0,frequency,Rank,Constant c
words,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ስለ,7076,16,0.031967
በኋላ,7066,17,0.033917
ወይም,7000,18,0.035577
ይህ,6711,19,0.036003
እግዚአብሔር,6274,20,0.03543


In [25]:
# Save the selected index terms (the `index` slice), not the full frequency
# table `df`, which is already written to 'for_stat.csv'.
index.to_csv('index.csv')