# <font size="8">Install Required Packages</font>

## Packages

In [1]:
!pip install pandas transformers unidecode transliterate langdetect metaphone gradio

Collecting unidecode
  Downloading Unidecode-1.3.8-py3-none-any.whl.metadata (13 kB)
Collecting transliterate
  Downloading transliterate-1.10.2-py2.py3-none-any.whl.metadata (14 kB)
Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting metaphone
  Downloading Metaphone-0.6.tar.gz (14 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting gradio
  Downloading gradio-5.3.0-py3-none-any.whl.metadata (15 kB)
Collecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.3-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.4.0-py3-none-any.whl.metadata (2.9 kB)
Collecting gradio-client==1.4.2 (from gradio)
  Downloadi

## Install Arabic Transliterator

In [2]:
!git clone https://github.com/MTG/ArabicTransliterator.git

Cloning into 'ArabicTransliterator'...
remote: Enumerating objects: 659, done.[K
remote: Total 659 (delta 0), reused 0 (delta 0), pack-reused 659 (from 1)[K
Receiving objects: 100% (659/659), 22.03 MiB | 11.48 MiB/s, done.
Resolving deltas: 100% (369/369), done.
Updating files: 100% (210/210), done.


In [3]:
# Install the cloned ArabicTransliterator package into the current environment.
# NOTE(review): this cell is not idempotent — `%cd` changes the kernel's working
# directory for all later cells, and the `mv` at the end will fail on a second run.
%cd ArabicTransliterator
# NOTE(review): the recorded output warns that running setup.py directly is deprecated;
# the `pip3 install -e .` line below presumably makes this step redundant — confirm.
!python3 setup.py install
!pip3 install -e .
# Rename the module file so it is imported as `transliterator`
# (presumably to avoid a clash with the `ArabicTransliterator` name — confirm).
!mv ArabicTransliterator.py transliterator.py

/content/ArabicTransliterator
running install
!!

        ********************************************************************************
        Please avoid running ``setup.py`` directly.
        Instead, use pypa/build, pypa/installer or other
        standards-based tools.

        See https://blog.ganssle.io/articles/2021/10/setup-py-deprecated.html for details.
        ********************************************************************************

!!
  self.initialize_options()
!!

        ********************************************************************************
        Please avoid running ``setup.py`` and ``easy_install``.
        Instead, use pypa/build, pypa/installer or other
        standards-based tools.

        See https://github.com/pypa/setuptools/issues/917 for details.
        ********************************************************************************

!!
  self.initialize_options()
running bdist_egg
running egg_info
creating arabictransliterator.egg-i

In [4]:
import ArabicTransliterator
from transliterator import ALA_LC_Transliterator
import mishkal.tashkeel.tashkeel as tashkeel
from text_unidecode import unidecode
#from pyarabic.araby import stripShadda, stripLastHaraka

transliterator = ALA_LC_Transliterator()

def transliterate_arabic(text, vocalize=True):
    """Transliterate Arabic text into ASCII Latin script.

    Args:
        text: Arabic text to transliterate.
        vocalize: When True (default), ``text`` is first passed through
            ``TashkeelClass.tashkeel`` and the vocalized result is transliterated;
            when False, ``text`` is transliterated as given.

    Returns:
        The result of ``transliterator.do`` on the stripped text, passed
        through ``unidecode``.
    """
    voc = text
    if vocalize:
        # Reuse a single vocalizer instead of constructing a new TashkeelClass
        # (and re-running its initialisation) for every name processed.
        vocalizer = getattr(transliterate_arabic, "_vocalizer", None)
        if vocalizer is None:
            vocalizer = tashkeel.TashkeelClass()
            transliterate_arabic._vocalizer = vocalizer
        voc = vocalizer.tashkeel(text)
    return unidecode(transliterator.do(voc.strip()))

## Import BERT

In [5]:
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F

# Load BERT tokenizer and model for English only
bert_model_name = "bert-base-uncased"  # English-only BERT model
bert_tokenizer = AutoTokenizer.from_pretrained(bert_model_name)
bert_model = AutoModel.from_pretrained(bert_model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

# <font size="8">Transliteration</font>

## Arabic Language

### Normalizing Transliterator Output

In [6]:
def apply_rules_ar(input_text):
    """Rewrite Arabic text word by word before it is handed to the transliterator.

    For every whitespace-separated word:
      1. a word that starts with alef (ا) or ain (ع) gets an extra alef (ا) prepended;
      2. every thal (ذ) is replaced by zay (ز);
      3. every ain (ع) is replaced by two hamza-on-alef characters (أأ).

    Returns:
        The rewritten words joined by single spaces.
    """
    def _rewrite(token):
        # Rule 1 runs first so it still sees a leading ain before rule 3 replaces it.
        if token.startswith(('ا', 'ع')):
            token = 'ا' + token
        # Rules 2 and 3 touch different characters, so they can simply be chained.
        return token.replace('ذ', 'ز').replace('ع', 'أأ')

    return ' '.join(_rewrite(token) for token in input_text.split())

In [7]:
import re

def normalize_text_ar(text):
    """Reduce a transliterated Arabic name to lowercase ASCII letters and single spaces.

    Steps, in order: lowercase; turn apostrophes into 'a'; collapse a doubled
    character at the start of a word into one; drop every character that is not
    a letter, whitespace or apostrophe; squeeze whitespace runs and trim the ends.
    """
    cleaned = text.lower().replace("'", "a")
    substitutions = (
        (r'\b(\w)\1', r'\1'),      # doubled leading character -> single
        (r"[^a-zA-Z\s']", ''),     # remove digits, punctuation and other symbols
        (r'\s+', ' '),             # collapse whitespace runs
    )
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

In [8]:
def transliterate_function_ar(text):
    """Run the full Arabic pipeline: pre-rules, transliteration, output normalisation."""
    prepared = apply_rules_ar(text)
    latin = transliterate_arabic(prepared)
    return normalize_text_ar(latin)

### Testing Transliterator

In [9]:
transliterate_function_ar('محمد هاني')

'muhammad hani'

## Latin Script Languages

### Function

In [10]:
from unidecode import unidecode

# Direct transliteration for Latin-script names
def transliterate_direct(name):
    """Return the ASCII transliteration of ``name`` produced by ``unidecode``."""
    ascii_name = unidecode(name)
    return ascii_name

In [11]:
# Latin-script languages, keyed by language code with the language name as value.
# detect_language_pipeline sends a name straight to transliterate_direct when the
# code returned by detect_script is one of these keys.
languages_similar_to_english = {
    'es': 'Spanish',
    'pt': 'Portuguese',
    'fr': 'French',
    'it': 'Italian',
    'de': 'German',
    'nl': 'Dutch',
    'sv': 'Swedish',
    'da': 'Danish',
    'no': 'Norwegian',
    'ro': 'Romanian',
    'pl': 'Polish',
    'cs': 'Czech',
    'sk': 'Slovak',
    'hu': 'Hungarian',
    'tr': 'Turkish',
    'vi': 'Vietnamese',
    'ms': 'Malay',
    'id': 'Indonesian'
}

### Test

In [12]:
# List of names with their languages and expected English versions
latin_with_translations = [
    ("José García", "Spanish", "Jose Garcia"),
    ("João Silva", "Portuguese", "Joao Silva"),
    ("François Dupont", "French", "Francois Dupont"),
    ("Giuseppe Rossi", "Italian", "Giuseppe Rossi"),
    ("Jürgen Müller", "German", "Jurgen Muller"),
    ("Maarten van Dijk", "Dutch", "Maarten van Dijk"),
    ("Björn Johansson", "Swedish", "Bjorn Johansson"),
    ("Søren Larsen", "Danish", "Soren Larsen"),
    ("Øyvind Hansen", "Norwegian", "Oyvind Hansen"),
    ("Ștefan Popescu", "Romanian", "Stefan Popescu"),
    ("Łukasz Nowak", "Polish", "Lukasz Nowak"),
    ("Jiří Novák", "Czech", "Jiri Novak"),
    ("Ľuboš Horváth", "Slovak", "Lubos Horvath"),
    ("Árpád Kovács", "Hungarian", "Arpad Kovacs"),
    ("Mehmet Yılmaz", "Turkish", "Mehmet Yilmaz"),
    ("Nguyễn Văn A", "Vietnamese", "Nguyen Van A"),
    ("Muhammad Amin", "Malay", "Muhammad Amin"),
    ("Putri Dewi", "Indonesian", "Putri Dewi")
]

In [13]:
import pandas as pd

# Column buffers for the comparison table, filled row by row below
languages, original_names = [], []
expected_english_versions, transliterated_outputs = [], []

# Transliterate each Latin-script sample and record it next to its expected spelling
for name, language, expected in latin_with_translations:
    transliterated_version = transliterate_direct(name)
    original_names.append(name)
    languages.append(language)
    transliterated_outputs.append(transliterated_version)
    expected_english_versions.append(expected)

# Assemble the table from the collected columns (column order is preserved)
df = pd.DataFrame(dict(zip(
    ["Language", "Original Name", "Expected English", "Transliterated Output"],
    [languages, original_names, expected_english_versions, transliterated_outputs],
)))

In [14]:
# Display the DataFrame
print(df)

      Language     Original Name  Expected English Transliterated Output
0      Spanish       José García       Jose Garcia           Jose Garcia
1   Portuguese        João Silva        Joao Silva            Joao Silva
2       French   François Dupont   Francois Dupont       Francois Dupont
3      Italian    Giuseppe Rossi    Giuseppe Rossi        Giuseppe Rossi
4       German     Jürgen Müller     Jurgen Muller         Jurgen Muller
5        Dutch  Maarten van Dijk  Maarten van Dijk      Maarten van Dijk
6      Swedish   Björn Johansson   Bjorn Johansson       Bjorn Johansson
7       Danish      Søren Larsen      Soren Larsen          Soren Larsen
8    Norwegian     Øyvind Hansen     Oyvind Hansen         Oyvind Hansen
9     Romanian    Ștefan Popescu    Stefan Popescu        Stefan Popescu
10      Polish      Łukasz Nowak      Lukasz Nowak          Lukasz Nowak
11       Czech        Jiří Novák        Jiri Novak            Jiri Novak
12      Slovak     Ľuboš Horváth     Lubos Horvath 

## Cyrillic Script Languages

### Transliteration function

In [15]:
from transliterate import translit

# Transliterate non-Latin-script names (Cyrillic and the other scripts routed here)
def transliterate_cyrillic(name):
    """Return the ASCII transliteration of ``name`` produced by ``unidecode``.

    An earlier variant based on ``translit(name, reversed=True)`` is disabled.
    It used to sit here inside a string literal, which Python treated as this
    function's docstring; it has been replaced by this description.
    """
    return unidecode(name)

In [16]:
# Languages routed to transliterate_cyrillic by detect_language_pipeline.
# The 'cyrillic' key is the pseudo-code detect_script returns for text containing
# characters in U+0400-U+04FF; the remaining keys are language codes.
# NOTE(review): Greek ('el') and Armenian ('hy') are not Cyrillic-script languages,
# so this dict covers more than its name suggests — consider renaming.
cyrillic_languagues = {
    'cyrillic': 'cyrillic script',
    'ru': 'Russian',
    'uk': 'Ukrainian',
    'bg': 'Bulgarian',
    'mn': 'Mongolian',
    'mk': 'Macedonian',
    'sr': 'Serbian',
    'el': 'Greek',
    'hy': 'Armenian'
}

### Test

In [17]:
# List of names with their languages and expected English versions
cyrillic_with_translations = [
    ("Алексей Иванов", "Russian", "Aleksei Ivanov"),
    ("Олександр Шевченко", "Ukrainian", "Oleksandr Shevchenko"),
    ("Георги Петров", "Bulgarian", "Georgi Petrov"),
    ("Батбаяр Сэргэлэн", "Mongolian", "Batbayar Sergelen"),
    ("Александар Стојанов", "Macedonian", "Aleksandar Stojanov"),
    ("Никола Тесла", "Serbian", "Nikola Tesla"),
    ("Νικόλαος Παπαδόπουλος", "Greek", "Nikolaos Papadopoulos"),
    ("Արամ Մարտիրոսյան", "Armenian", "Aram Martirosyan")
]

In [18]:
# Column buffers for the comparison table, filled row by row below
languages, original_names = [], []
expected_english_versions, transliterated_outputs = [], []

# Transliterate each non-Latin sample and record it next to its expected spelling
for name, language, expected in cyrillic_with_translations:
    transliterated_version = transliterate_cyrillic(name)
    original_names.append(name)
    languages.append(language)
    transliterated_outputs.append(transliterated_version)
    expected_english_versions.append(expected)

# Assemble the table from the collected columns (column order is preserved)
df = pd.DataFrame(dict(zip(
    ["Language", "Original Name", "Expected English", "Transliterated Output"],
    [languages, original_names, expected_english_versions, transliterated_outputs],
)))

In [19]:
# Display the DataFrame
print(df)

     Language          Original Name       Expected English  \
0     Russian         Алексей Иванов         Aleksei Ivanov   
1   Ukrainian     Олександр Шевченко   Oleksandr Shevchenko   
2   Bulgarian          Георги Петров          Georgi Petrov   
3   Mongolian       Батбаяр Сэргэлэн      Batbayar Sergelen   
4  Macedonian    Александар Стојанов    Aleksandar Stojanov   
5     Serbian           Никола Тесла           Nikola Tesla   
6       Greek  Νικόλαος Παπαδόπουλος  Nikolaos Papadopoulos   
7    Armenian       Արամ Մարտիրոսյան       Aram Martirosyan   

   Transliterated Output  
0         Aleksei Ivanov  
1   Oleksandr Shevchenko  
2          Georgi Petrov  
3      Batbaiar Sergelen  
4    Aleksandar Stojanov  
5           Nikola Tesla  
6  Nikolaos Papadopoulos  
7       Aram Martirosyan  


## Other Special Script Languages (Work in Progress)

In [20]:
# Languages that have a dedicated transliteration function; detect_language_pipeline
# hands names in these languages to special_transliterate.
special_languages = {
    'ar': 'Arabic',
}

# Languages whose transliteration functions are not enabled yet (their draft
# implementations exist only inside a string literal in the next cell).
# This dict is not referenced by any code in the visible part of the notebook.
unsupported_special_languages = {
    'zh': 'Chinese',
    'ko': 'Korean',
    'ja': 'Japanese',
    'hi': 'Hindi',
    'bn': 'Bengali'
}

In [21]:
'''
from pypinyin import pinyin, Style
from jamo import h2j, j2hcj
from korean_romanizer.romanizer import Romanizer
import romkan
from indic_transliteration import sanscript
from indic_transliteration.sanscript import transliterate

# Transliteration functions

# Arabic
def transliterate_function_ar(name):
    # Transliterate Arabic name to Latin script
    return normalize_text_ar(transliterate_only(name))

# Chinese
def transliterate_function_zh(name):
    # Transliterate Chinese name to Pinyin without tone numbers
    pinyin_list = pinyin(name, style=Style.NORMAL)
    # Join the Pinyin list into a single string with spaces
    transliterated_name = ' '.join([''.join(word) for word in pinyin_list])
    return transliterated_name

# Korean
def transliterate_function_ko(name):
    # Use korean-romanizer to transliterate Korean to Latin script
    r = Romanizer(name)
    transliterated_name = r.romanize()
    return transliterated_name

# Japanese
def transliterate_function_ja(name):
    # Use romkan to transliterate Japanese Kana to Romaji
    transliterated_name = romkan.to_roma(name)
    return transliterated_name

# Hindi
def transliterate_function_hi(name):
    # Use indic_transliteration to transliterate Hindi to Latin script
    transliterated_name = transliterate(name, sanscript.DEVANAGARI, sanscript.ITRANS)
    return transliterated_name

# Bengali
def transliterate_function_bn(name):
    # Use indic_transliteration to transliterate Bengali to Latin script
    transliterated_name = transliterate(name, sanscript.BENGALI, sanscript.ITRANS)
    return transliterated_name

'''

"\nfrom pypinyin import pinyin, Style\nfrom jamo import h2j, j2hcj\nfrom korean_romanizer.romanizer import Romanizer\nimport romkan\nfrom indic_transliteration import sanscript\nfrom indic_transliteration.sanscript import transliterate\n\n# Transliteration functions\n\n# Arabic\ndef transliterate_function_ar(name):\n    # Transliterate Arabic name to Latin script\n    return normalize_text_ar(transliterate_only(name))\n\n# Chinese\ndef transliterate_function_zh(name):\n    # Transliterate Chinese name to Pinyin without tone numbers\n    pinyin_list = pinyin(name, style=Style.NORMAL)\n    # Join the Pinyin list into a single string with spaces\n    transliterated_name = ' '.join([''.join(word) for word in pinyin_list])\n    return transliterated_name\n\n# Korean\ndef transliterate_function_ko(name):\n    # Use korean-romanizer to transliterate Korean to Latin script\n    r = Romanizer(name)\n    transliterated_name = r.romanize()\n    return transliterated_name\n\n# Japanese\ndef transl

In [22]:
# Special transliteration functions for languages not similar to English

def special_transliterate(name, language_code):
    """Transliterate ``name`` with the function registered for ``language_code``.

    The lookup is case-insensitive. Only Arabic ('ar') is registered; entries for
    'zh', 'ko', 'ja', 'hi' and 'bn' are planned but not enabled.

    Raises:
        ValueError: if no function is registered for ``language_code``.
    """
    handlers = {
        'ar': transliterate_function_ar,  # Arabic
    }

    handler = handlers.get(language_code.lower())
    if not handler:
        raise ValueError(f"Transliteration function for language code '{language_code}' not found.")
    return handler(name)

## Combined Pipeline

### List of Supported Languages

In [23]:
# Combine all lists into one list of supported languages
list_of_supported_languages = {**special_languages,**languages_similar_to_english,**cyrillic_languagues}
print('Number of Supported Languages:' , len(list_of_supported_languages))

Number of Supported Languages: 28


In [24]:
list_of_supported_languages.values()

dict_values(['Arabic', 'Spanish', 'Portuguese', 'French', 'Italian', 'German', 'Dutch', 'Swedish', 'Danish', 'Norwegian', 'Romanian', 'Polish', 'Czech', 'Slovak', 'Hungarian', 'Turkish', 'Vietnamese', 'Malay', 'Indonesian', 'cyrillic script', 'Russian', 'Ukrainian', 'Bulgarian', 'Mongolian', 'Macedonian', 'Serbian', 'Greek', 'Armenian'])

### Detect Language

In [25]:
from langdetect import detect, DetectorFactory

# Ensure consistent language detection
DetectorFactory.seed = 0

def detect_script(text):
    """Return a script/language code for ``text``.

    Character ranges are tested in a fixed order and the first hit wins:
    'ar' (U+0600-U+06FF), 'ja' (hiragana U+3040-U+309F or katakana U+30A0-U+30FF),
    'zh' (U+4E00-U+9FFF), 'ko' (U+AC00-U+D7AF), 'cyrillic' (U+0400-U+04FF).
    Text matching none of these is passed to langdetect's ``detect`` and its
    result is returned.

    NOTE(review): ``detect`` is reported to raise on text with no detectable
    features (e.g. an empty string); that case is not handled here — confirm.
    """
    if re.search(r'[\u0600-\u06ff]', text):
        return 'ar'
    # Kana is tested before the CJK ideograph block: Japanese text usually mixes
    # kanji (which live in that block) with kana, so testing ideographs first
    # labelled such text as Chinese.
    elif re.search(r'[\u3040-\u309f]', text) or re.search(r'[\u30a0-\u30ff]', text):
        return 'ja'
    elif re.search(r'[\u4e00-\u9fff]', text):
        return 'zh'
    elif re.search(r'[\uac00-\ud7af]', text):
        return 'ko'
    elif re.search(r'[\u0400-\u04ff]', text):
        return 'cyrillic'
    else:
        return detect(text)

In [26]:

### Main Function to Detect Language and Apply the Correct Pipeline

def detect_language_pipeline(name):
    """Detect the script/language of ``name`` and return its Latin transliteration.

    Routing, by the code ``detect_script`` returns:
      * a key of ``languages_similar_to_english`` -> ``transliterate_direct``
      * a key of ``cyrillic_languagues``          -> ``transliterate_cyrillic``
      * a key of ``special_languages``            -> ``special_transliterate``
      * anything else                             -> ``transliterate_direct``
    """
    code = detect_script(name)

    if code in languages_similar_to_english:
        return transliterate_direct(name)
    if code in cyrillic_languagues:
        return transliterate_cyrillic(name)
    if code in special_languages:
        return special_transliterate(name, code)

    # Unknown or unsupported codes fall back to direct transliteration
    return transliterate_direct(name)

### Test

In [27]:
detect_script('أحمد ماجد')

'ar'

In [28]:
detect_language_pipeline('مصطفى رؤوف')

'mustafa raauf'

In [29]:
# List of names with their languages and expected English versions
special_with_translations = [
    ("محمد هاني", "Arabic", "Mohamed Hany")
]

# Combine all lists into one
names_with_translations = special_with_translations + latin_with_translations + cyrillic_with_translations

In [30]:
import pandas as pd

# Column buffers for the comparison table, filled row by row below
languages, original_names = [], []
expected_english_versions, transliterated_outputs = [], []

# Run every sample through the full detection + transliteration pipeline
for name, language, expected in names_with_translations:
    transliterated_version = detect_language_pipeline(name)
    original_names.append(name)
    languages.append(language)
    transliterated_outputs.append(transliterated_version)
    expected_english_versions.append(expected)

# Assemble the table from the collected columns (column order is preserved)
df = pd.DataFrame(dict(zip(
    ["Language", "Original Name", "Expected English", "Transliterated Output"],
    [languages, original_names, expected_english_versions, transliterated_outputs],
)))

In [31]:
# Display the DataFrame
df

Unnamed: 0,Language,Original Name,Expected English,Transliterated Output
0,Arabic,محمد هاني,Mohamed Hany,muhammad hani
1,Spanish,José García,Jose Garcia,Jose Garcia
2,Portuguese,João Silva,Joao Silva,Joao Silva
3,French,François Dupont,Francois Dupont,Francois Dupont
4,Italian,Giuseppe Rossi,Giuseppe Rossi,Giuseppe Rossi
5,German,Jürgen Müller,Jurgen Muller,Jurgen Muller
6,Dutch,Maarten van Dijk,Maarten van Dijk,Maarten van Dijk
7,Swedish,Björn Johansson,Bjorn Johansson,Bjorn Johansson
8,Danish,Søren Larsen,Soren Larsen,Soren Larsen
9,Norwegian,Øyvind Hansen,Oyvind Hansen,Oyvind Hansen


# <font size="8">Matching</font>

## BERT Embeddings Matching

In [32]:
def get_bert_token_embeddings(name):
    """Run ``name`` through the BERT model and return its token vectors and token ids.

    Returns:
        (embeddings, input_ids): the model's last hidden state and the tokenizer's
        input ids, each with ``squeeze(0)`` applied to drop the leading dimension.
    """
    encoded = bert_tokenizer(name, return_tensors="pt", padding=True, truncation=True)

    # No gradients are needed for this forward pass
    with torch.no_grad():
        model_output = bert_model(**encoded)

    token_vectors = model_output.last_hidden_state.squeeze(0)
    token_ids = encoded.input_ids.squeeze(0)
    return token_vectors, token_ids

def position_aware_similarity(name1, name2):
    """Score two names by comparing their BERT token embeddings position by position.

    Both names are lowercased, embedded, and compared token-by-token (cosine
    similarity) over the length of the shorter token sequence. Two penalties are
    subtracted from the mean similarity: the relative difference in token count,
    and ``1 - mean similarity``. The result is therefore
    ``2 * mean - 1 - length_penalty``, floored at 0.
    """
    emb_a, ids_a = get_bert_token_embeddings(name1.lower())
    emb_b, ids_b = get_bert_token_embeddings(name2.lower())

    len_a, len_b = len(ids_a), len(ids_b)
    shared = min(len_a, len_b)  # only positions present in both sequences are compared

    per_token = F.cosine_similarity(emb_a[:shared], emb_b[:shared], dim=1)
    mean_similarity = per_token.mean()

    overall_similarity = mean_similarity.item()
    length_penalty = abs(len_a - len_b) / max(len_a, len_b)
    order_penalty = (1 - mean_similarity).item()

    return max(0, overall_similarity - length_penalty - order_penalty)

In [33]:
def bert_match(name1, name2):
    """Compare two names with the position-aware BERT similarity.

    Returns:
        (is_match, score): ``is_match`` is True when the unrounded similarity
        exceeds 0.75; ``score`` is the similarity rounded to two decimals.
    """
    score = position_aware_similarity(name1.lower(), name2.lower())
    return score > 0.75, round(score, 2)


In [34]:
# Column buffers for the comparison table, filled row by row below
languages, original_names = [], []
expected_english_versions, transliterated_outputs = [], []
match_result, match_scores = [], []

# Transliterate each sample, then score it against its expected English spelling
for name, language, expected in names_with_translations:
    transliterated_version = detect_language_pipeline(name)
    match_output, match_score = bert_match(expected, transliterated_version)

    original_names.append(name)
    languages.append(language)
    expected_english_versions.append(expected)
    transliterated_outputs.append(transliterated_version)
    match_scores.append(match_score)
    match_result.append(match_output)

# Assemble the table from the collected columns (column order is preserved)
df = pd.DataFrame(dict(zip(
    ["Language", "Original Name", "Expected English", "Transliterated Output",
     "Match Result", "Match Score"],
    [languages, original_names, expected_english_versions, transliterated_outputs,
     match_result, match_scores],
)))

In [35]:
df

Unnamed: 0,Language,Original Name,Expected English,Transliterated Output,Match Result,Match Score
0,Arabic,محمد هاني,Mohamed Hany,muhammad hani,False,0.51
1,Spanish,José García,Jose Garcia,Jose Garcia,True,1.0
2,Portuguese,João Silva,Joao Silva,Joao Silva,True,1.0
3,French,François Dupont,Francois Dupont,Francois Dupont,True,1.0
4,Italian,Giuseppe Rossi,Giuseppe Rossi,Giuseppe Rossi,True,1.0
5,German,Jürgen Müller,Jurgen Muller,Jurgen Muller,True,1.0
6,Dutch,Maarten van Dijk,Maarten van Dijk,Maarten van Dijk,True,1.0
7,Swedish,Björn Johansson,Bjorn Johansson,Bjorn Johansson,True,1.0
8,Danish,Søren Larsen,Soren Larsen,Soren Larsen,True,1.0
9,Norwegian,Øyvind Hansen,Oyvind Hansen,Oyvind Hansen,True,1.0


## Phonetics Matching

In [36]:
from metaphone import doublemetaphone

def metaphone_encode(name):
    """Encode ``name`` word by word with Double Metaphone.

    Each whitespace-separated word is replaced by the first element of
    ``doublemetaphone(word)``; the codes are joined by single spaces.
    """
    primary_codes = []
    for word in name.split():
        primary_codes.append(doublemetaphone(word)[0])
    return ' '.join(primary_codes)

def compare_name(name1, name2):
    """Return True when both names produce the same Metaphone encoding.

    The arguments are plain names; this function encodes them itself.
    """
    code1 = metaphone_encode(name1)
    code2 = metaphone_encode(name2)
    return code1 == code2

In [37]:
# Create lists to store data for the DataFrame
languages = []
original_names = []
expected_english_versions = []
encoded_expected = []
transliterated_outputs = []
encoded_outputs = []
match_result = []

# Loop through each name, transliterate it, and compare it phonetically
# with its expected English spelling
for name, language, expected in names_with_translations:
    # The encodings are computed here only so they can be shown in the table
    encoded_original = metaphone_encode(expected)
    transliterated_version = detect_language_pipeline(name)
    encoded_transliterated = metaphone_encode(transliterated_version)
    # compare_name encodes its arguments itself, so it must receive the plain names;
    # passing the already-encoded strings would Metaphone-encode them a second time.
    match_output = compare_name(expected, transliterated_version)

    languages.append(language)
    original_names.append(name)
    expected_english_versions.append(expected)
    encoded_expected.append(encoded_original)
    transliterated_outputs.append(transliterated_version)
    encoded_outputs.append(encoded_transliterated)
    match_result.append(match_output)


# Create a Pandas DataFrame
df = pd.DataFrame({
    "Language": languages,
    "Original Name": original_names,
    "Expected English": expected_english_versions,
    "Encoded Expected": encoded_expected,
    "Transliterated Output": transliterated_outputs,
    "Encoded Transliterated": encoded_outputs,
    "Match Result": match_result
})

In [38]:
df

Unnamed: 0,Language,Original Name,Expected English,Encoded Expected,Transliterated Output,Encoded Transliterated,Match Result
0,Arabic,محمد هاني,Mohamed Hany,MHMT HN,muhammad hani,MHMT HN,True
1,Spanish,José García,Jose Garcia,JS KRS,Jose Garcia,JS KRS,True
2,Portuguese,João Silva,Joao Silva,J SLF,Joao Silva,J SLF,True
3,French,François Dupont,Francois Dupont,FRNK TPNT,Francois Dupont,FRNK TPNT,True
4,Italian,Giuseppe Rossi,Giuseppe Rossi,JSP RS,Giuseppe Rossi,JSP RS,True
5,German,Jürgen Müller,Jurgen Muller,JRJN MLR,Jurgen Muller,JRJN MLR,True
6,Dutch,Maarten van Dijk,Maarten van Dijk,MRTN FN TK,Maarten van Dijk,MRTN FN TK,True
7,Swedish,Björn Johansson,Bjorn Johansson,PJRN JHNSN,Bjorn Johansson,PJRN JHNSN,True
8,Danish,Søren Larsen,Soren Larsen,SRN LRSN,Soren Larsen,SRN LRSN,True
9,Norwegian,Øyvind Hansen,Oyvind Hansen,AFNT HNSN,Oyvind Hansen,AFNT HNSN,True


## Combined Matching

In [39]:
def combined_match(name1, name2):
    """Compare two names by the BERT similarity of their Metaphone encodings.

    The arguments are plain names; this function Metaphone-encodes them itself
    before scoring.

    Returns:
        (is_match, score): ``is_match`` is True when the unrounded similarity
        exceeds 0.75; ``score`` is the similarity rounded to two decimals.
    """
    code1, code2 = metaphone_encode(name1), metaphone_encode(name2)
    score = position_aware_similarity(code1, code2)
    return score > 0.75, round(score, 2)


In [40]:
# Create lists to store data for the DataFrame
languages = []
original_names = []
expected_english_versions = []
encoded_expected = []
transliterated_outputs = []
encoded_outputs = []
match_result = []
match_scores = []

# Loop through each name, transliterate it, and score it against
# its expected English spelling
for name, language, expected in names_with_translations:
    # The encodings are computed here only so they can be shown in the table
    encoded_original = metaphone_encode(expected)
    transliterated_version = detect_language_pipeline(name)
    encoded_transliterated = metaphone_encode(transliterated_version)
    # combined_match encodes its arguments itself, so it must receive the plain names;
    # passing the already-encoded strings would Metaphone-encode them a second time.
    match_output, match_score = combined_match(expected, transliterated_version)
    # Fill the columns
    languages.append(language)
    original_names.append(name)
    expected_english_versions.append(expected)
    encoded_expected.append(encoded_original)
    transliterated_outputs.append(transliterated_version)
    encoded_outputs.append(encoded_transliterated)
    match_result.append(match_output)
    match_scores.append(match_score)


# Create a Pandas DataFrame
df = pd.DataFrame({
    "Language": languages,
    "Original Name": original_names,
    "Expected English": expected_english_versions,
    "Encoded Expected": encoded_expected,
    "Transliterated Output": transliterated_outputs,
    "Encoded Transliterated": encoded_outputs,
    "Match Result": match_result,
    "Match Score": match_scores
})

In [41]:
df

Unnamed: 0,Language,Original Name,Expected English,Encoded Expected,Transliterated Output,Encoded Transliterated,Match Result,Match Score
0,Arabic,محمد هاني,Mohamed Hany,MHMT HN,muhammad hani,MHMT HN,True,1.0
1,Spanish,José García,Jose Garcia,JS KRS,Jose Garcia,JS KRS,True,1.0
2,Portuguese,João Silva,Joao Silva,J SLF,Joao Silva,J SLF,True,1.0
3,French,François Dupont,Francois Dupont,FRNK TPNT,Francois Dupont,FRNK TPNT,True,1.0
4,Italian,Giuseppe Rossi,Giuseppe Rossi,JSP RS,Giuseppe Rossi,JSP RS,True,1.0
5,German,Jürgen Müller,Jurgen Muller,JRJN MLR,Jurgen Muller,JRJN MLR,True,1.0
6,Dutch,Maarten van Dijk,Maarten van Dijk,MRTN FN TK,Maarten van Dijk,MRTN FN TK,True,1.0
7,Swedish,Björn Johansson,Bjorn Johansson,PJRN JHNSN,Bjorn Johansson,PJRN JHNSN,True,1.0
8,Danish,Søren Larsen,Soren Larsen,SRN LRSN,Soren Larsen,SRN LRSN,True,1.0
9,Norwegian,Øyvind Hansen,Oyvind Hansen,AFNT HNSN,Oyvind Hansen,AFNT HNSN,True,1.0


# <font size="8">Input Processing</font>

## Text Normalization

In [42]:
#using Regular Expression to normalize the names
import re
def normalize_text(text):
    """Normalise an ASCII name to lowercase letters separated by single spaces.

    Text containing any non-ASCII character is returned unchanged. Otherwise the
    text is lowercased, every character other than a letter or whitespace is
    removed, whitespace runs are collapsed and the ends are trimmed.
    """
    # Non-ASCII input is passed through untouched
    if not text.isascii():
        return text

    letters_only = re.sub(r'[^a-zA-Z\s]', '', text.lower())
    return re.sub(r'\s+', ' ', letters_only).strip()

## Handling Last Names

In [43]:
def rearrange_name(name):
    """Turn "Last, First Middle" into "First Middle Last".

    Args:
        name: The name as entered.

    Returns:
        * ``name`` unchanged if it contains any non-ASCII character;
        * the stripped name if it contains no comma;
        * otherwise the text after the first comma followed by the text before
          it, separated by one space. An empty side (as in "Smith," or
          ", John") is dropped rather than leaving a stray space.
    """
    # Non-ASCII input is passed through untouched
    if not name.isascii():
        return name
    if ',' not in name:
        return name.strip()

    last_name, first_names = name.split(',', 1)
    # Join only the non-empty parts so a missing side does not produce
    # a leading or trailing space.
    parts = (first_names.strip(), last_name.strip())
    return ' '.join(part for part in parts if part)

input_name = "Loutfi, Ahmed Maged"
corrected_name = rearrange_name(input_name)
print("Rearranged name:", corrected_name)

Rearranged name: Ahmed Maged Loutfi


In [44]:
def print_ex(name1, name2):
    """Print a step-by-step phonetic comparison of an input name and a banned name.

    For each name this prints the raw value, the rearranged value, the detected
    script, the transliteration and its Metaphone encoding, followed by the
    result of ``compare_name`` on the two transliterations.
    """
    # Stage 1: put "Last, First" inputs into natural order
    r_name1, r_name2 = rearrange_name(name1), rearrange_name(name2)

    # Stage 2: bring both names into Latin script
    translated_name1 = detect_language_pipeline(r_name1)
    translated_name2 = detect_language_pipeline(r_name2)

    # Report every intermediate value for both names
    print(f" Input Name: {name1}\n", f"Rearranged: {r_name1}\n", f"Script: {detect_script(r_name1)}\n", f"Translated: {translated_name1}\n",  f"Encoded: {metaphone_encode(translated_name1)}\n")
    print(f" Banned Name: {name2}\n", f"Rearranged: {r_name2}\n", f"Script: {detect_script(r_name2)}\n", f"Translated: {translated_name2}\n", f"Encoded: {metaphone_encode(translated_name2)}\n")

    # Stage 3: phonetic comparison of the two transliterations
    is_match = compare_name(translated_name1, translated_name2)
    print(f" Match Result: {is_match}")

In [45]:
print_ex('Андрей  Иванов',
         'اندري ايفانوف	')

 Input Name: Андрей  Иванов
 Rearranged: Андрей  Иванов
 Script: cyrillic
 Translated: Andrei  Ivanov
 Encoded: ANTR AFNF

 Banned Name: اندري ايفانوف	
 Rearranged: اندري ايفانوف	
 Script: ar
 Translated: andri ayfanuf
 Encoded: ANTR AFNF

 Match Result: True


# <font size="8">Banned Names</font>

In [46]:
# Predefined list of banned names
banned_names = [row[-1] for row in names_with_translations]
processed_banned_names = [detect_language_pipeline(normalize_text(rearrange_name(name))) for name in banned_names]
processed_banned_names

['mohamed hany',
 'jose garcia',
 'joao silva',
 'francois dupont',
 'giuseppe rossi',
 'jurgen muller',
 'maarten van dijk',
 'bjorn johansson',
 'soren larsen',
 'oyvind hansen',
 'stefan popescu',
 'lukasz nowak',
 'jiri novak',
 'lubos horvath',
 'arpad kovacs',
 'mehmet yilmaz',
 'nguyen van a',
 'muhammad amin',
 'putri dewi',
 'aleksei ivanov',
 'oleksandr shevchenko',
 'georgi petrov',
 'batbayar sergelen',
 'aleksandar stojanov',
 'nikola tesla',
 'nikolaos papadopoulos',
 'aram martirosyan']

# <font size="8">GUI Functions</font>

## Combined Matching

In [47]:
def is_match_combined(input):
    """Find the banned name that best matches `input` using `combined_match`.

    The input is preprocessed with rearrange_name -> normalize_text ->
    detect_language_pipeline and then scored against every banned name.

    Args:
        input: The raw name to check. (The parameter name shadows the
            built-in `input`; it is kept so existing callers keep working.)

    Returns:
        tuple: `(is_match_flag, [banned_name, match_score])` for the banned
        name with the highest score. When `banned_names` is empty the result
        is `(False, [None, 0.0])`.
    """
    input_name = detect_language_pipeline(normalize_text(rearrange_name(input)))

    # The banned names were already pushed through the same preprocessing
    # pipeline when `processed_banned_names` was built, so reuse those values
    # instead of re-running the pipeline for every banned name on every query.
    # Fall back to recomputing if the two lists have drifted apart.
    if len(processed_banned_names) == len(banned_names):
        prepared_names = processed_banned_names
    else:
        prepared_names = [
            detect_language_pipeline(normalize_text(rearrange_name(name)))
            for name in banned_names
        ]

    results = []
    for name, banned_name in zip(banned_names, prepared_names):
        is_match_flag, match_score = combined_match(input_name, banned_name)
        results.append([name, match_score, is_match_flag])

    # Highest score wins; `default` avoids a ValueError on an empty banned list.
    best_match = max(results, key=lambda x: x[1], default=[None, 0.0, False])

    # Flag of the best candidate, plus its [name, score] pair.
    return best_match[2], best_match[:2]

In [48]:
def match_names_combined(input_name, csv_file=None):
    """Gradio handler for the combined matcher: check one name or a CSV of names.

    A non-blank `input_name` takes priority over `csv_file`.

    Args:
        input_name: Name typed by the user; may be empty or None.
        csv_file: Anything `pd.read_csv` accepts (path or file object) holding
            a `name` column; may be None.

    Returns:
        tuple: `(message, result_csv_path_or_None)`. A tuple is always
        returned so the two interface outputs are always populated.
    """
    # Single-name mode. A whitespace-only value counts as "no name given";
    # the original (unstripped) value is what gets matched.
    if input_name and input_name.strip():
        flag, output = is_match_combined(input_name)
        if flag:
            return f"Match Found.\nName: {output[0]}.", None
        return "No Match Found.", None

    # Nothing to do: previously this fell through and returned None, which
    # does not satisfy the two declared outputs.
    if not csv_file:
        return "Please enter a name or upload a CSV file.", None

    # Batch mode.
    names_df = pd.read_csv(csv_file)
    if 'name' not in names_df.columns:
        # Report the problem instead of failing with a KeyError.
        return "The uploaded CSV must contain a 'name' column.", None

    results = []
    for name in names_df['name']:
        bol, match_result = is_match_combined(name)
        results.append([name, bol, match_result[0]])

    # Save results as CSV
    result_df = pd.DataFrame(results, columns=["Name", "Match Result", "Best Match"])
    result_csv = "/tmp/result.csv"
    result_df.to_csv(result_csv, index=False)

    # Count how many True matches
    true_matches = result_df['Match Result'].sum()
    return f"Total Matches Found: {true_matches}", result_csv

In [49]:
import gradio as gr

# Gradio UI for the combined matcher: a name textbox and a CSV upload go in,
# a status message and a result CSV come out.
interface_combined = gr.Interface(
    fn=match_names_combined,
    title="Name Matcher Model",
    description="Enter a name or upload a CSV file with names to check if there is a match against the banned names.",
    inputs=[
        # Single-name query
        gr.Textbox(
            label="Input Name (leave blank if uploading CSV)",
            placeholder="Enter a name",
            lines=1,
        ),
        # Batch query
        gr.File(label="Upload CSV with Names"),
    ],
    outputs=[
        # Message returned by the handler
        gr.Textbox(label="Output"),
        # Result file returned by the handler for CSV uploads
        gr.File(label="Download Result CSV"),
    ],
)

## Phonetics Matching

In [50]:
def double_metaphone_encoding(name):
    """Return the Double Metaphone encodings of `name`.

    Thin wrapper that forwards `name` to `doublemetaphone` and hands its
    result back unchanged; callers in this notebook unpack it as
    `(primary, secondary)`.
    """
    encodings = doublemetaphone(name)
    return encodings

def compare_double_metaphone(name1, name2):
    """Score the phonetic similarity of two single name parts.

    Args:
        name1: First name part.
        name2: Second name part.

    Returns:
        float: 1.0 when the primary encodings are equal, 0.75 when only the
        secondary encodings are equal, otherwise 0.0. An empty encoding never
        counts as a match.
    """
    # Get Double Metaphone encodings for both names
    name1_primary, name1_secondary = double_metaphone_encoding(name1)
    name2_primary, name2_secondary = double_metaphone_encoding(name2)

    # Exact primary match. An empty code carries no phonetic information
    # (e.g. for the "" padding added by compare_full_name), so it is ignored.
    if name1_primary and name1_primary == name2_primary:
        return 1.0

    # Secondary match, only considered when the primaries differ. Requiring a
    # non-empty code matters here: two parts that both lack a secondary
    # encoding would otherwise compare equal ('' == '') and score 0.75 even
    # though they sound nothing alike.
    if name1_secondary and name1_secondary == name2_secondary:
        return 0.75

    return 0.0

def compare_full_name(full_name1, full_name2):
    """Average the per-part phonetic scores of two whitespace-separated names.

    Parts are compared position by position with `compare_double_metaphone`;
    the shorter name is padded with empty strings so both have the same
    number of parts.

    Args:
        full_name1: First full name.
        full_name2: Second full name.

    Returns:
        float: Sum of the per-part scores divided by the number of parts, or
        0.0 when neither name contains any parts.
    """
    # Split names into parts (assuming format: First Middle Last)
    name_parts1 = full_name1.split()
    name_parts2 = full_name2.split()

    parts_count = max(len(name_parts1), len(name_parts2))

    # Two empty/whitespace-only names have nothing to compare; returning here
    # also avoids dividing by zero below.
    if parts_count == 0:
        return 0.0

    # Pad the shorter name with empty strings so the parts line up.
    name_parts1 += [""] * (parts_count - len(name_parts1))
    name_parts2 += [""] * (parts_count - len(name_parts2))

    # Compare each part (first, middle, last names) using Double Metaphone
    total_score = sum(
        (compare_double_metaphone(part1, part2)
         for part1, part2 in zip(name_parts1, name_parts2)),
        0.0,
    )

    # Average match score across all parts
    return total_score / parts_count

In [51]:
def is_match(input, threshold=0.9):
    """Find the banned name that phonetically best matches `input`.

    The input is preprocessed with rearrange_name -> normalize_text ->
    detect_language_pipeline and scored against every banned name with
    `compare_full_name`.

    Args:
        input: The raw name to check. (The parameter name shadows the
            built-in `input`; it is kept so existing callers keep working.)
        threshold: Minimum score for a candidate to be flagged as a match.
            Defaults to 0.9, the value previously hard-coded here.

    Returns:
        tuple: `(is_match_flag, [banned_name, match_score])` for the banned
        name with the highest score. When `banned_names` is empty the result
        is `(False, [None, 0.0])`.
    """
    input_name = detect_language_pipeline(normalize_text(rearrange_name(input)))

    # The banned names were already pushed through the same preprocessing
    # pipeline when `processed_banned_names` was built, so reuse those values
    # instead of re-running the pipeline for every banned name on every query.
    # Fall back to recomputing if the two lists have drifted apart.
    if len(processed_banned_names) == len(banned_names):
        prepared_names = processed_banned_names
    else:
        prepared_names = [
            detect_language_pipeline(normalize_text(rearrange_name(name)))
            for name in banned_names
        ]

    results = []
    for name, banned_name in zip(banned_names, prepared_names):
        match_score = compare_full_name(input_name, banned_name)
        is_match_flag = match_score >= threshold
        results.append([name, match_score, is_match_flag])

    # Highest score wins; `default` avoids a ValueError on an empty banned list.
    best_match = max(results, key=lambda x: x[1], default=[None, 0.0, False])

    # Flag of the best candidate, plus its [name, score] pair.
    return best_match[2], best_match[:2]

In [52]:
def match_names(input_name, csv_file=None):
    """Gradio handler for the phonetic matcher: check one name or a CSV of names.

    A non-blank `input_name` takes priority over `csv_file`.

    Args:
        input_name: Name typed by the user; may be empty or None.
        csv_file: Anything `pd.read_csv` accepts (path or file object) holding
            a `name` column; may be None.

    Returns:
        tuple: `(message, result_csv_path_or_None)`. A tuple is always
        returned so the two interface outputs are always populated.
    """
    # Single-name mode. A whitespace-only value counts as "no name given";
    # the original (unstripped) value is what gets matched.
    if input_name and input_name.strip():
        flag, output = is_match(input_name)
        if flag:
            return f"Match Found.\nName: {output[0]}.", None
        return "No Match Found.", None

    # Nothing to do: previously this fell through and returned None, which
    # does not satisfy the two declared outputs.
    if not csv_file:
        return "Please enter a name or upload a CSV file.", None

    # Batch mode.
    names_df = pd.read_csv(csv_file)
    if 'name' not in names_df.columns:
        # Report the problem instead of failing with a KeyError.
        return "The uploaded CSV must contain a 'name' column.", None

    results = []
    for name in names_df['name']:
        bol, match_result = is_match(name)
        results.append([name, bol, match_result[0]])

    # Save results as CSV
    result_df = pd.DataFrame(results, columns=["Name", "Match Result", "Best Match"])
    result_csv = "/tmp/result.csv"
    result_df.to_csv(result_csv, index=False)

    # Count how many True matches
    true_matches = result_df['Match Result'].sum()
    return f"Total Matches Found: {true_matches}", result_csv

In [53]:
import gradio as gr

# Gradio UI for the phonetic matcher: a name textbox and a CSV upload go in,
# a status message and a result CSV come out.
interface = gr.Interface(
    fn=match_names,
    title="Name Matcher Model",
    description="Enter a name or upload a CSV file with names to check if there is a match against the banned names.",
    inputs=[
        # Single-name query
        gr.Textbox(
            label="Input Name (leave blank if uploading CSV)",
            placeholder="Enter a name",
            lines=1,
        ),
        # Batch query
        gr.File(label="Upload CSV with Names"),
    ],
    outputs=[
        # Message returned by the handler
        gr.Textbox(label="Output"),
        # Result file returned by the handler for CSV uploads
        gr.File(label="Download Result CSV"),
    ],
)

# <font size="8">Gradio Interface & API (Combined Matching)</font>
# <font size="5">More accurate, but processing takes too long — not suitable for production use</font>


In [54]:
# Launch the interface
# share=True: the recorded output of this cell shows the app being served on a
# public *.gradio.live URL, so anyone holding the link can query the matcher.
interface_combined.launch(share=True)

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://12f07a8b55846a7d84.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [55]:
# Show the first element of every row in `names_with_translations`
# (the recorded output lists the names in their original scripts).
[entry[0] for entry in names_with_translations]

['محمد هاني',
 'José García',
 'João Silva',
 'François Dupont',
 'Giuseppe Rossi',
 'Jürgen Müller',
 'Maarten van Dijk',
 'Björn Johansson',
 'Søren Larsen',
 'Øyvind Hansen',
 'Ștefan Popescu',
 'Łukasz Nowak',
 'Jiří Novák',
 'Ľuboš Horváth',
 'Árpád Kovács',
 'Mehmet Yılmaz',
 'Nguyễn Văn A',
 'Muhammad Amin',
 'Putri Dewi',
 'Алексей Иванов',
 'Олександр Шевченко',
 'Георги Петров',
 'Батбаяр Сэргэлэн',
 'Александар Стојанов',
 'Никола Тесла',
 'Νικόλαος Παπαδόπουλος',
 'Արամ Մարտիրոսյան']

# <font size="8">Gradio Interface & API (Phonetics Matching)</font>
# <font size="5">Slightly less accurate, but much faster — suitable for production use</font>

In [56]:
# Launch the interface
# share=True: the recorded output of this cell shows the app being served on a
# public *.gradio.live URL, so anyone holding the link can query the matcher.
interface.launch(share=True)

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://e08b9e949cce89e9d5.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [64]:
from gradio_client import Client, handle_file

# Query the phonetics interface through its HTTP API.
# The URL is taken from the interface launched above instead of a hardcoded
# share link: share links are temporary and change on every launch, so a
# pasted URL stops working as soon as the notebook is re-run. If no share URL
# is available, fall back to the local URL.
client = Client(interface.share_url or interface.local_url)
result = client.predict(
    input_name="  اليكسي ايفانوف ",
    csv_file=None,
    api_name="/predict"
)
print(result)

Loaded as API: https://e08b9e949cce89e9d5.gradio.live/ ✔
('Match Found.\nName: Aleksei Ivanov.', None)
