**Python packages**

In [None]:
!pip install alphabet-detector
!pip install pyarabic
!pip install nltk
!pip install rapidfuzz
!pip install jarowinkler
!pip install jellyfish
!pip install xlsxwriter
import pandas
import numpy
import re
import hashlib
from alphabet_detector import AlphabetDetector
import pyarabic.araby as araby
import nltk
from nltk import word_tokenize
from nltk.tokenize.treebank import TreebankWordDetokenizer
nltk.download('punkt')
from rapidfuzz import process, fuzz
from rapidfuzz.process import cdist
from jarowinkler import jarowinkler_similarity
import jellyfish
import networkx as nx

**Normalization**

In [4]:
def normalize_hamza(text):
    """Unify the hamza-carrying Arabic letters in ``text``.

    Every alif variant matched by the first pattern is rewritten as a bare
    alif, while waw-hamza and yeh-hamza are both rewritten as a standalone
    hamza. Characters outside these sets are left untouched.

    Parameters
    ----------
    text : str
        The string to normalize.

    Returns
    -------
    str
        ``text`` with the substitutions applied.
    """
    # (pattern, replacement) pairs, applied in this order.
    substitutions = (
        ("[إأٱآا]", "ا"),  # Alif-Hamza
        ("ؤ", "ء"),        # Waw-Hamza
        ("ئ", "ء"),        # Yay-Hamza
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text

def remove_accents(raw_text):
    """Replace lower-case accented Latin letters with plain ASCII letters.

    Only the lower-case characters listed in the patterns below are
    rewritten; upper-case accented letters and every other character pass
    through unchanged. ``ß`` expands to the two letters ``ss``.

    Parameters
    ----------
    raw_text : str
        The string to clean.

    Returns
    -------
    str
        ``raw_text`` with the listed accented letters replaced.
    """
    # (pattern, replacement) pairs, applied in this order.
    accent_table = (
        (u"[àáâãäå]", 'a'),
        (u"[èéêë]", 'e'),
        (u"[ìíîï]", 'i'),
        (u"[òóôõö]", 'o'),
        (u"[ùúûü]", 'u'),
        (u"[ýÿ]", 'y'),
        (u"ß", 'ss'),
        (u"ñ", 'n'),
        (u"ç", 'c'),
    )
    cleaned = raw_text
    for accented, plain in accent_table:
        cleaned = re.sub(accented, plain, cleaned)
    return cleaned

def normalize(list_n):
    """Normalize a collection of names for matching.

    Each element goes through the following steps, in order:

    1. conversion to ``str`` and lower-casing,
    2. removal of Arabic diacritics (``araby.strip_diacritics``),
    3. removal of the Arabic tatweel (``araby.strip_tatweel``),
    4. hamza unification (``normalize_hamza``),
    5. replacement of accented Latin letters (``remove_accents``),
    6. replacement of every non-word character by a single space,
    7. removal of every run of digits.

    Runs of spaces produced by step 6 are not collapsed.

    Parameters
    ----------
    list_n : array-like
        The names to normalize. Elements that are not strings are converted
        with ``str``. An empty collection is accepted.

    Returns
    -------
    numpy.ndarray
        Array of strings with the same shape as ``list_n``.
    """
    # dtype=object keeps each element as the original Python object, so
    # str() sees the value the caller passed rather than a numpy-coerced one.
    names = numpy.asarray(list_n, dtype=object)

    cleaned = []
    # A plain loop replaces the chain of numpy.vectorize wrappers:
    # numpy.vectorize raises ValueError on size-0 input when ``otypes`` is
    # not set, and it is only a Python-level loop internally anyway.
    for raw in names.ravel():
        text = str(raw).lower()
        text = araby.strip_diacritics(text)
        text = araby.strip_tatweel(text)
        text = normalize_hamza(text)
        text = remove_accents(text)
        # Punctuation (any non-word character) becomes a space ...
        text = re.sub(r'\W', ' ', text)
        # ... and digits are dropped entirely.
        text = re.sub(r'\d+', '', text)
        cleaned.append(text)

    return numpy.array(cleaned, dtype=str).reshape(names.shape)

def sort(name):
    """Return ``name`` with its word tokens rearranged in sorted order.

    The string is split with NLTK's ``word_tokenize``, the tokens are sorted
    with Python's default string ordering, and the result is joined back
    into one string with ``TreebankWordDetokenizer``.
    """
    tokens = list(word_tokenize(name))
    tokens.sort()
    detokenizer = TreebankWordDetokenizer()
    return detokenizer.detokenize(tokens)

**Transliteration**

In [9]:
def load_():
    """Load the transliteration resources.

    Reads the ``arabic`` and ``roman`` columns of ``database.csv`` (path is
    relative to the current working directory) and builds a hard-coded
    Arabic-to-Latin substitution table.

    Returns
    -------
    tuple
        ``(arabic, roman, to_latin)`` where ``arabic`` and ``roman`` are
        numpy arrays built from the two CSV columns, and ``to_latin`` is a
        dict mapping Arabic letters / letter pairs to Latin strings.
    """
    # Read dictionary
    # NOTE(review): presumably row i of `roman` is the romanization of row i
    # of `arabic` — confirm against the contents of database.csv.
    dictionary = pandas.read_csv("database.csv", usecols=['arabic', 'roman'])
    arabic = numpy.array(dictionary['arabic'])
    roman = numpy.array(dictionary['roman'])
    # Define transliterator
    # Two-letter keys (consonant followed by waw / yeh) are listed before the
    # corresponding single-letter key. Dicts keep insertion order, so code
    # that applies the entries sequentially sees the longer key first.
    to_latin = {
        'ا': 'a', 'ء': 'a', 'إ': 'i', 'ى': 'a', 'بو': 'bou', 'ب': 'b', 'تو': 'tou', 'ت': 't', 'ثو': 'thou', 'ث': 'th',
        'جو': 'djou', 'ج': 'dj', 'حو': 'hou', 'حي': 'hy',
        'ح': 'h', 'خو': 'khou', 'خ': 'kh', 'دو': 'dou', 'د': 'd', 'ذو': 'dou', 'ذ': 'd', 'رو': 'rou', 'ري': 'ry',
        'ر': 'r', 'زو': 'zou', 'ز': 'z', 'سو': 'sou', 'س': 's',
        'شو': 'chou', 'ش': 'ch', 'صو': 'sou', 'ص': 's', 'ضو': 'dou', 'ض': 'd', 'طو': 'tou', 'ط': 't', 'ظو': 'dou',
        'ظ': 'd', 'غو': 'ghou', 'غ': 'gh', 'فو': 'fou', 'ف': 'f',
        'قو': 'qou', 'ق': 'q', 'كو': 'kou', 'ك': 'k', 'لو': 'lou', 'ل': 'l', 'مو': 'mou', 'م': 'm', 'نو': 'nou',
        'ن': 'n', 'هو': 'hou', 'ه': 'h', 'وو': 'wou', 'و': 'w',
        'يو': 'you', 'يي': 'yi', 'ي': 'i', 'ع': 'a', 'ة': 'a', 'ھ': 'h', 'ی': 'a', ' ':' '
    }
    return arabic, roman, to_latin

def translit_arabic(text, arabic, roman, to_latin):
    """Transliterate the space-separated tokens of ``text`` to Latin script.

    Each token is handled independently:

    * if the token appears in ``arabic``, it is replaced by the entry of
      ``roman`` at the first matching position;
    * otherwise every key of ``to_latin`` is substituted in the token, in the
      dict's insertion order (so multi-letter keys listed first take
      precedence over the single letters they contain).

    Tokens containing none of the keys (e.g. names already in Latin script)
    come back unchanged.

    Parameters
    ----------
    text : str
        The name to transliterate; tokens are separated by single spaces.
    arabic : numpy.ndarray
        Known Arabic spellings.
    roman : numpy.ndarray
        Romanizations, aligned element-wise with ``arabic``.
    to_latin : dict
        Maps Arabic letters / letter groups to Latin replacements. Keys are
        treated as literal text.

    Returns
    -------
    str
        The transliterated tokens joined by single spaces.
    """
    converted = []
    for part in text.split(' '):
        # Use dictionary
        if part in arabic:
            new = roman[arabic == part][0]
        # Use transliterator
        else:
            # Accumulate the substitutions on `new`. Restarting from `part`
            # on every key would keep only the effect of the last key.
            new = part
            for k, v in to_latin.items():
                new = new.replace(k, v)
        # Collect the result for this token only, so a token that happens to
        # be a substring of another token cannot alter that other token.
        converted.append(new)
    return ' '.join(converted)

****

**Arabic Name Disambiguation**

In [25]:
def main(list_n, threshold=0.85):
    """Group names that are likely to refer to the same person.

    Pipeline:

    1. ``normalize`` the names and sort the tokens of each name (``sort``).
    2. Transliterate every name with ``translit_arabic`` using the resources
       returned by ``load_``.
    3. Compute all pairwise Jaro-Winkler similarities and keep the pairs
       scoring at least ``threshold``.
    4. Among those, keep the pairs whose Match Rating codexes have a Jaro
       similarity of at least ``threshold``.
    5. Treat the surviving pairs as graph edges and return the connected
       components.

    Parameters
    ----------
    list_n : array-like
        The raw names.
    threshold : float, optional
        Minimum similarity, in [0, 1], applied in steps 3 and 4.
        Defaults to 0.85.

    Returns
    -------
    list of set
        Each set holds the positions (indices into ``list_n``) of names
        judged to belong together. A name only appears if at least one pair
        involving it — including the pair with itself — passed both filters.
    """
    # Nothing to compare.
    if numpy.size(list_n) == 0:
        return []

    # Normalization
    list_n = normalize(list_n)
    list_n = [sort(name) for name in list_n]
    # Transliteration
    arabic, roman, to_latin = load_()
    list_n = [translit_arabic(name, arabic, roman, to_latin) for name in list_n]

    # Fuzzy matching
    # The scorer returns floats in [0, 1], so the result matrix must be a
    # float type. An integer dtype such as uint8 would collapse the scores
    # to 0/1 before they are compared with the threshold.
    similarities = cdist(list_n, list_n, scorer=jarowinkler_similarity,
                         score_cutoff=threshold, dtype=numpy.float32, workers=-1)
    indx_l = numpy.argwhere(similarities >= threshold)

    # jellyfish exposes the Jaro similarity as `jaro_similarity`; older
    # releases only have the deprecated name `jaro_distance`.
    jaro = getattr(jellyfish, "jaro_similarity", None) or jellyfish.jaro_distance
    rates = numpy.array([jaro(jellyfish.match_rating_codex(list_n[i]),
                              jellyfish.match_rating_codex(list_n[j]))
                         for i, j in indx_l])
    indx_l = indx_l[rates >= threshold]

    # Find connected components
    g = nx.Graph()
    # Each row is one (i, j) pair, i.e. one edge. Self-pairs (i, i) add the
    # node itself, so unmatched names still form singleton components.
    g.add_edges_from(map(tuple, indx_l.tolist()))
    return list(nx.connected_components(g))