# Indexing 

In [47]:
# Affixes stripped by `stemmer`. Order matters: every entry is tried once, in
# list order, so a later entry can strip more from what an earlier one left.
Amharic_suffix = ['ቻችን','ቻችሁ','ቻቸው','ቻቸውን','ህ','ሽ','ችን','ችሁ','ቸው','ч'.replace('ч', 'ች'),'ቼ','ቿ', 'ችው', 'ቹ', 'ን']
Amharic_prefix = ['የ', 'በ', 'ለ', 'ከ']


def stemmer(word):
    """Strip known Amharic suffixes, then prefixes, from ``word``.

    Each suffix in ``Amharic_suffix`` is tried once in list order, followed by
    each prefix in ``Amharic_prefix``. An affix is only removed when something
    is left afterwards, so a non-empty word is never reduced to the empty
    string (e.g. a token that consists solely of an affix is returned as is).

    Args:
        word: The token to stem.

    Returns:
        The stemmed token; identical to ``word`` when no affix applies.
    """
    for suffix in Amharic_suffix:
        stripped = word.removesuffix(suffix)
        # Skip the strip if it would consume the whole word.
        if stripped:
            word = stripped
    for prefix in Amharic_prefix:
        stripped = word.removeprefix(prefix)
        if stripped:
            word = stripped
    return word

## The code below reads the extracted text, tokenizes it line by line, and records which book each token occurs in

In [48]:
import os
import spacy 
from spacy.lang.am import Amharic
from collections import Counter
import pandas as pd

# Amharic tokenizer; max_length is raised so long inputs are accepted.
nlp = Amharic()
nlp.max_length = 20000000

data_path = r'C:\Users\user\Documents\Data_science\IR real\Demo_data\Updated_Demo'

# os.listdir returns entries in arbitrary order; sort them so the postings
# are built in the same order on every run.
books = sorted(os.listdir(data_path))

# Maps each token to the list of book file names it occurs in, with one entry
# per occurrence (duplicates are kept on purpose so occurrences can be counted).
token_file = {}

for book in books:
    with open(os.path.join(data_path, book), 'r', encoding='utf-8') as file:
        for line in file:
            line = line.strip()
            if not line:
                # A blank line yields no tokens; skip the tokenizer call.
                continue
            for token in nlp(line):
                if token.is_punct:
                    continue
                word = token.text
                # Numbers and whitespace-only tokens are not indexed.
                if word.isnumeric() or word.isspace():
                    continue
                token_file.setdefault(word, []).append(book)

## This assigns each book (a file name in the data directory) a numeric document ID.

In [49]:
data_path = r'C:\Users\user\Documents\Data_science\IR real\Demo_data\Updated_Demo'

# os.listdir returns names in arbitrary order, so sort before numbering;
# otherwise the document IDs could differ between runs or machines.
books = sorted(os.listdir(data_path))

# Maps each book file name to its numeric document ID (0, 1, 2, ...).
Doc_dict = {book: index for index, book in enumerate(books)}

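## This replaces the book names in each word's postings list with their document IDs and builds the index table (the document frequency itself is counted further below)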

In [50]:
# Views over the distinct terms and, in matching order, their postings lists.
words = token_file.keys()
documents = token_file.values()

# Translate every postings list from book file names to numeric document IDs.
Doc_index = [
    [Doc_dict[book_name] for book_name in postings]
    for postings in documents
]

# One row per term: the term itself and the document IDs of its occurrences.
Ifile = pd.DataFrame({"Term" : words, "Doc" : Doc_index})
Ifile[170:175]


Unnamed: 0,Term,Doc
170,ጠቢብ,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
171,እነዚህን,"[0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, ..."
172,ከመስማት,"[0, 0, 3, 3, 4, 8, 12, 15, 15, 22, 24, 31, 50,..."
173,ጥበብን,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
174,ይጨምራል,"[0, 0, 0, 12, 13, 22, 22, 27, 30, 39, 43, 48, ..."


### In the code below we normalize (stem) the words. Later we will show that the stemmed/normalized words yield the same result as the unstemmed/un-normalized ones.

In [51]:
# Add the stemmed form of every term as its own column.
terms = Ifile['Term']
Ifile['stemmed'] = terms.apply(stemmer)
Ifile[170:175]

Unnamed: 0,Term,Doc,stemmed
170,ጠቢብ,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",ጠቢብ
171,እነዚህን,"[0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, ...",እነዚህ
172,ከመስማት,"[0, 0, 3, 3, 4, 8, 12, 15, 15, 22, 24, 31, 50,...",መስማት
173,ጥበብን,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",ጥበብ
174,ይጨምራል,"[0, 0, 0, 12, 13, 22, 22, 27, 30, 39, 43, 48, ...",ይጨምራል


### With these simple lines of code we can clearly count the document frequency (DF) and the term frequency across the whole collection (CF) of each term.

In [52]:
def _distinct_count(doc_ids):
    """Return how many different document IDs appear in ``doc_ids``."""
    return len(set(doc_ids))


postings = Ifile['Doc']
# CF: total number of entries in the postings list (one per occurrence).
Ifile['CF'] = postings.apply(len)
# DF: number of distinct documents in the postings list.
Ifile['DF'] = postings.apply(_distinct_count)
Ifile[170:175]

Unnamed: 0,Term,Doc,stemmed,CF,DF
170,ጠቢብ,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",ጠቢብ,45,12
171,እነዚህን,"[0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, ...",እነዚህ,602,82
172,ከመስማት,"[0, 0, 3, 3, 4, 8, 12, 15, 15, 22, 24, 31, 50,...",መስማት,27,19
173,ጥበብን,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",ጥበብ,72,34
174,ይጨምራል,"[0, 0, 0, 12, 13, 22, 22, 27, 30, 39, 43, 48, ...",ይጨምራል,64,25


In [53]:
# Collapse each postings list to the set of distinct document IDs.
# NOTE: this overwrites the 'Doc' column, so the per-occurrence lists are gone
# afterwards; re-running the CF cell after this one would make CF equal DF.
Ifile['Doc'] = Ifile['Doc'].apply(set)
Ifile[170:175]

Unnamed: 0,Term,Doc,stemmed,CF,DF
170,ጠቢብ,"{0, 4, 8, 73, 42, 48, 124, 50, 83, 85, 92, 31}",ጠቢብ,45,12
171,እነዚህን,"{0, 1, 2, 3, 4, 5, 8, 10, 11, 12, 13, 14, 17, ...",እነዚህ,602,82
172,ከመስማት,"{0, 3, 4, 8, 12, 15, 22, 24, 31, 50, 69, 88, 1...",መስማት,27,19
173,ጥበብን,"{0, 3, 4, 8, 13, 15, 16, 17, 19, 22, 30, 31, 4...",ጥበብ,72,34
174,ይጨምራል,"{0, 12, 13, 22, 27, 30, 39, 43, 48, 49, 55, 58...",ይጨምራል,64,25
