# 1 Scrape/load data

In [None]:
# Load Data
import create_record
import pandas as pd

# Fetch the record through the project's scraper module.
# NOTE(review): the variable name suggests sura 114 while the call passes 108 — confirm the arguments.
t1s114a1 = create_record.collect_data(1,108,1)

# Concatenate the text of every entry in "SoupContent", separated by single spaces.
t1s114a1["Text"] = " ".join(
    soup_part.get_text() for soup_part in t1s114a1["SoupContent"]
)

# Tab-separated lookup tables; the first column of each file becomes the index.
madhahib_df = pd.read_csv(
    "./data/madhahib.csv",
    sep="\t",
    index_col=0,
)
tafasir_df = pd.read_csv(
    "./data/tafasir.csv",
    sep="\t",
    index_col=0,
)


In [None]:
##########################################################
# offline workaround
# Replaces the record produced by the scraping cell with text read from a
# local CSV export. Note that t1s114a1 is rebuilt as a plain dict that
# holds only "Text".
import pandas as pd

# NOTE(review): machine-specific absolute path — adjust to the local
# checkout before running this cell.
OFFLINE_CORPUS_CSV = 'C:/Users/anaconda/Desktop/altafsir_scraper/corpus/1_1_7.csv'

# The file is transposed after loading so that "Text" can be addressed as a column.
t1s114a2 = pd.read_csv(OFFLINE_CORPUS_CSV, sep=",", index_col=0).T

t1s114a1 = {}
# Take the first row by position. `.iloc[0]` states that explicitly; a bare
# `[0]` on a label-indexed Series relies on a deprecated positional fallback.
t1s114a1["Text"] = t1s114a2["Text"].iloc[0]
###########################################################

____
# 2 Preprocessing
## 2.1 Normalizing data

In [None]:
from camel_tools.utils.normalize import normalize_unicode
from camel_tools.utils.normalize import normalize_alef_maksura_ar
from camel_tools.utils.normalize import normalize_alef_ar
from camel_tools.utils.normalize import normalize_teh_marbuta_ar
from camel_tools.utils.dediac import dediac_ar


# def remove_citations(text):
#     import re
#     text_without_citations = re.sub("([\[\{]).*?([\]\}])", "", text)
    
#     return text_without_citations

def reduce_to_archarset(text):
    """Reduce a string to characters in U+0621..U+064A plus plain spaces.

    Every whitespace character (newline, tab, ...) is first replaced by a
    plain space, one for one, so that words separated only by a line break
    or tab stay separated. All remaining characters outside the kept range
    are then deleted.

    Args:
        text: the string to filter.

    Returns:
        The filtered string.
    """
    import re

    # Keep word boundaries: turn any whitespace into a plain space before
    # the filter below deletes everything that is not Arabic or a space.
    text = re.sub(r'\s', ' ', text)

    # Remove non-arabic characters
    nonarab_chars = '[^\u0621-\u064A ]'
    return re.sub(nonarab_chars, '', text)

## normalize
def normalizer(string):
    """Run the full normalization pipeline over a string and return the result.

    The steps are applied in this fixed order: normalize_unicode,
    normalize_alef_ar, normalize_alef_maksura_ar, normalize_teh_marbuta_ar,
    dediac_ar (diacritic removal) and finally reduce_to_archarset
    (reduction to the Arabic character set).
    """
    pipeline = (
        normalize_unicode,
        normalize_alef_ar,
        normalize_alef_maksura_ar,
        normalize_teh_marbuta_ar,
        dediac_ar,            # remove diacritics
        reduce_to_archarset,  # reduce to arabic charset
    )
    for step in pipeline:
        string = step(string)
    return string

# Store the normalized form of the raw text under its own key.
t1s114a1["TextNormalized"] = normalizer(t1s114a1["Text"])

## 2.2 Tokenization

### 2.2.1 Simple Tokenization
Will tokenize words by splitting the string on whitespace and punctuation.

In [None]:
from camel_tools.tokenizers.word import simple_word_tokenize

# Tokenize the normalized text; the result feeds the morphological tokenizer, the analyzer and NER below.
t1s114a1["Tokenized"] = simple_word_tokenize(t1s114a1["TextNormalized"])

### 2.2.2 Morphological tokenization
The morphological tokenizer expects pre-tokenized text in a list. Therefore run simple_word_tokenize(string) first

In [None]:
from camel_tools.disambig.mle import MLEDisambiguator
from camel_tools.tokenizers.morphological import MorphologicalTokenizer


# Load a pretrained disambiguator to use with a tokenizer
mle = MLEDisambiguator.pretrained('calima-msa-r13')

# `split=True`: morphological tokens are output as separate strings.
tokenizer = MorphologicalTokenizer(mle, scheme='d3tok', split=True)

# Tokenize, then filter out bound (non-independent) morphemes, i.e. every
# piece that contains a '+'.
t1s114a1["TokenizedMorph"] = [
    piece
    for piece in tokenizer.tokenize(t1s114a1["Tokenized"])
    if '+' not in piece
]

In [None]:
# Inspect the morphological tokens that remain after filtering.
t1s114a1["TokenizedMorph"]

## [2.3 Remove stopwords from token list]
Note: the stop list must be normalized in the same way as the tokens before this step is usable.

In [None]:
def stopwords(token_list, stopwords_list, morph=True):
    """Split a token list into kept tokens and removed stop words.

    Args:
        token_list: iterable of string tokens to filter.
        stopwords_list: path to a UTF-8 text file containing stop words
            separated by whitespace (for example one word per line).
        morph: if true, the stop words are used exactly as written in the
            file; otherwise each stop word is passed through normalizer()
            before matching.

    Returns:
        A tuple (token_list_filtered, words_removed): the tokens that are
        not stop words and the tokens that are, each in input order.
    """
    with open(stopwords_list, encoding="UTF-8") as stoplist_file:
        entries = stoplist_file.read().split()

    if not morph:
        # Normalize word by word so that the separators between stop words
        # are never touched by the normalization itself.
        entries = [normalizer(entry) for entry in entries]

    # Match whole words only. Testing `token in <file text>` would be a
    # substring check and would also remove any token that merely occurs
    # inside some stop word.
    stopword_set = {entry for entry in entries if entry}

    token_list_filtered = []
    words_removed = []
    for token in token_list:
        if token in stopword_set:
            words_removed.append(token)
        else:
            token_list_filtered.append(token)

    return token_list_filtered, words_removed

In [None]:
# Both token lists are filtered against the same stop-word file; only the
# kept tokens (first element of the returned pair) are stored.
# NOTE(review): machine-specific absolute path — adjust before running.
stoplist_path = 'C:/Users/anaconda/Desktop/arabic-stop-words-master/list2.txt'

morph_kept, morph_removed = stopwords(t1s114a1["TokenizedMorph"], stoplist_path)
t1s114a1["TokenizedMorphStopword"] = morph_kept

simple_kept, simple_removed = stopwords(t1s114a1["Tokenized"], stoplist_path, morph=False)
t1s114a1["TokenizedStopword"] = simple_kept


----
# 3 Analyzing data
## 3.1 Morphological analysis of tokens

In [None]:
import pandas as pd
from camel_tools.morphology.database import MorphologyDB
from camel_tools.morphology.analyzer import Analyzer


# set up morphological analyzer
db = MorphologyDB.builtin_db("calima-msa-r13")
analyzer = Analyzer(db)

# Map each token's position in t1s114a1["Tokenized"] to a DataFrame built
# from the analyses returned for that token.
t1s114a1_token_analysis = {
    position: pd.DataFrame(analyzer.analyze(token))
    for position, token in enumerate(t1s114a1["Tokenized"])
}


In [None]:
# Inspect the analyses stored for the token at position 2.
t1s114a1_token_analysis[2]


## 3.2 Frequency analysis

In [None]:
def frequency_analyzer(token_list):
    """Count how often each item occurs in token_list.

    Returns a dict mapping item -> count, ordered by descending count;
    items with equal counts keep the order of their first occurrence.
    """
    counts = {}
    for item in token_list:
        counts[item] = counts.get(item, 0) + 1

    # sorted() is stable, so ties stay in first-seen order
    ranked = sorted(counts.items(), key=lambda pair: pair[1], reverse=True)
    return dict(ranked)

### 3.2.1 ... of morphologically tokenized tokens

In [None]:
# Frequencies of the morphological tokens after stop-word removal.
frequency_analyzer(t1s114a1["TokenizedMorphStopword"])
# Alternative: frequencies of the simple tokens after stop-word removal.
# frequency_analyzer(t1s114a1["TokenizedStopword"])

### 3.2.2 ... of roots

In [None]:
# Create the list of roots: for every analysed token, take the root of the
# analysis with the highest lex_logprob.
t1s114a1["RootsList"] = []
# Walk over every stored analysis, including the one for the first token
# (key 0).
for analysis in t1s114a1_token_analysis.values():
    # Frames without a "lex_logprob" column carry no usable analysis; skip them.
    if "lex_logprob" in analysis:
        best_rows = analysis[analysis.lex_logprob == analysis.lex_logprob.max()]
        # best_rows is empty when no row compares equal to the maximum
        # (e.g. all values are NaN); skip instead of raising on .iloc[0].
        if not best_rows.empty:
            t1s114a1["RootsList"].append(best_rows.iloc[0]["root"])


In [None]:
# Frequencies of the collected roots.
frequency_analyzer(t1s114a1["RootsList"])

## 3.3 Named-entity recognition
Hint: it seems that simple tokenization is more apt for NER, as it doesn't remove enclitics like "ك" from parts of the name

In [None]:
from camel_tools.tokenizers.word import simple_word_tokenize
from camel_tools.ner import NERecognizer

# Load the pretrained named-entity recognizer.
ner = NERecognizer.pretrained()

# NERecognizer expects pre-tokenized text, so the simple tokens are passed in.
sentence = t1s114a1["Tokenized"] # alternative input: simple_word_tokenize(t1s114a1["Text"])

# One label per input token.
labels = ner.predict_sentence(sentence)

# save each token paired with its NER label
zipped = list(zip(sentence, labels))


In [None]:
# Filter and glue named entities into dictionary
named_entities = {"LOC" : [], "ORG" : [], "PERS" : [], "MISC" : []}
for i, val in enumerate(zipped):
    if zipped[i][1][0] == "B":
        named_entities[zipped[i][1][2:]].append(zipped[i][0])
    if zipped[i][1][0] == "I":
        named_entities[zipped[i][1][2:]][-1] = named_entities[zipped[i][1][2:]][-1] + " " + zipped[i][0]
named_entities["MISC"]