In [2]:
import json
import pandas as pd
import unicodedata
import re

In [3]:
def normalize_digits(affiliation):
    """Replace spelled-out, ordinal and Roman-numeral forms of 1, 2 and 5 with digits.

    Matching is whole-word (``\\b`` anchored) and case-sensitive, so only
    lowercase forms such as ``first``, ``1er``, ``ii`` or ``v`` are rewritten.

    Args:
        affiliation: The text to normalize.

    Returns:
        A new string with every matching word replaced by '1', '2' or '5'.
    """
    # Rules are applied in this order; each pass works on the previous result.
    ordinal_rules = (
        (r'\b(first|1st|1er|i)\b', '1'),  # spellings of 1
        (r'\b(second|2nd|ii)\b', '2'),    # spellings of 2
        (r'\b(fifth|5th|v)\b', '5'),      # spellings of 5
    )

    result = affiliation
    for regex, digit in ordinal_rules:
        result = re.sub(regex, digit, result)
    return result

In [4]:
def remove_accents(text):
    """Strip diacritical marks from ``text``.

    The text is decomposed with Unicode NFKD normalization, then every
    combining character (the accent marks split off by the decomposition)
    is dropped. Non-combining characters are kept as they are, whether or
    not they are ASCII.

    Args:
        text: The string to clean.

    Returns:
        The decomposed string without combining characters.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    kept = []
    for symbol in decomposed:
        # unicodedata.combining() is non-zero for combining marks.
        if unicodedata.combining(symbol):
            continue
        kept.append(symbol)
    return ''.join(kept)

In [5]:
# Example input
# input_text = "Département des Sciences de la Terre, Marrakech, B.P. 549, Morocco | aïn chock"
# output_text = remove_accents(input_text)
# print('outp:', output_text)

In [6]:
def generate_fr_affiliations_variations(affiliation_fr):
    """Return alternative spellings for a French affiliation name.

    Placeholder: no French variation rules are implemented, so the result is
    always a new empty list regardless of ``affiliation_fr``.

    Args:
        affiliation_fr: The French affiliation name (currently unused).

    Returns:
        An empty list.
    """
    return []

def generate_en_affiliations_variations(affiliation_en):
    """Return alternative spellings for an English affiliation name.

    The rules below are tried in order and only the first phrase found in
    ``affiliation_en`` is used (matching is a case-sensitive substring test).
    Every occurrence of that phrase is replaced by each of its substitutes,
    producing one variation per substitute.

    Args:
        affiliation_en: The English affiliation name.

    Returns:
        A list of variation strings; empty when no rule matches.
    """
    # The longer "sciences and techniques" phrase must be tested before the
    # shorter "faculty of sciences" phrase it contains.
    substitution_rules = (
        (
            'faculty of sciences and techniques',
            (
                'faculty of science and techniques',
                'faculty of sciences and technology',
                'faculty of science and technology',
            ),
        ),
        ('faculty of sciences', ('faculty of science',)),
        ('polydisciplinary', ('multidisciplinary',)),
    )

    for phrase, substitutes in substitution_rules:
        if phrase in affiliation_en:
            return [affiliation_en.replace(phrase, alt) for alt in substitutes]
    return []

In [47]:
def update_affiliation(df):
    """Return ``df`` extended with generic-name alias rows for selected ids.

    For each hard-coded record id listed below, the rows of ``df`` whose
    'id' equals it are copied, their 'Abbreviation', 'Affiliation' and
    'Affiliation En Name' columns are overwritten with a generic faculty
    name, and the copies are appended after the original rows. ``df`` itself
    is left untouched.

    Args:
        df: DataFrame with at least an 'id' column.

    Returns:
        A new DataFrame (fresh 0..n-1 index) holding the original rows
        followed by the alias rows.
    """
    renamed_columns = ('Abbreviation', 'Affiliation', 'Affiliation En Name')
    generic_names = (
        # FS Semlalia -> FS
        (23, ('FS', 'Faculté des Sciences', 'Faculty of Sciences')),
        # FS Dhar El Mahraz -> FS
        (153, ('FS', 'Faculté des Sciences', 'Faculty of Sciences')),
        # FST Sais -> FST
        (155, ('FST', 'Faculté des Sciences et Techniques', 'Faculty of Sciences and Techniques')),
    )

    alias_frames = []
    for record_id, new_values in generic_names:
        # Work on a copy so the caller's frame keeps its original names.
        alias = df.loc[df['id'] == record_id].copy()
        for column, value in zip(renamed_columns, new_values):
            alias.loc[:, column] = value
        alias_frames.append(alias)

    # Original rows first, then the aliases, re-indexed from zero.
    return pd.concat([df, *alias_frames], ignore_index=True)

In [None]:
# Load the transformed affiliations table and append the generic alias rows
# produced by update_affiliation().
affiliations_df = update_affiliation(
    pd.read_csv('../data/transformed/affiliations.csv')
)
# Optional export for manual inspection:
# affiliations_df.to_csv('output/aff.csv', index=False)

In [50]:
from collections import defaultdict

# Nested mappings: city -> record id -> list of lowercase name variants.
# The nested defaultdicts create missing city / id entries on first access.
affiliations_by_city = defaultdict(lambda: defaultdict(list))
universities_by_city = defaultdict(lambda: defaultdict(list))

for _, row in affiliations_df.iterrows():
    city = row['City'].strip()
    # Named `affiliation_id` (not `id`) so the builtin id() is not shadowed
    # for every cell that runs afterwards.
    affiliation_id = row['id']
    raw_abbreviation = row['Abbreviation'].strip()
    abbreviation = raw_abbreviation.lower()

    # French names: lowercase, strip accents, then normalize numerals.
    affiliation_fr = normalize_digits(remove_accents(row['Affiliation'].strip().lower()))
    # English names: lowercase, then normalize numerals.
    affiliation_en = normalize_digits(row['Affiliation En Name'].strip().lower())

    # Rows whose abbreviation starts with an uppercase 'U' are universities.
    # startswith() on the stripped value tolerates leading whitespace and an
    # empty abbreviation instead of raising IndexError.
    if raw_abbreviation.startswith('U'):
        names = universities_by_city[city][affiliation_id]
        names.extend([abbreviation, affiliation_fr, affiliation_en])

        # Add re-ordered forms that move the last word to the front:
        # "<head> <tail>" -> "<tail> <head>" and "<tail> of <head>".
        # Single-word names have nothing to re-order, so they are skipped.
        head_and_tail = affiliation_en.rsplit(' ', 1)
        if len(head_and_tail) == 2:
            head, tail = head_and_tail
            names.append(f'{tail} {head}')
            names.append(f'{tail} of {head}')
    else:
        names = affiliations_by_city[city][affiliation_id]
        names.extend([abbreviation, affiliation_fr, affiliation_en])
        # Add spelling variations of the French and English names.
        names.extend(generate_fr_affiliations_variations(affiliation_fr))
        names.extend(generate_en_affiliations_variations(affiliation_en))
        

In [53]:
import os

# Convert the nested defaultdicts to plain dicts so the exported objects no
# longer create keys on lookup.
universities_by_city_dict = {k: dict(v) for k, v in universities_by_city.items()}
affiliations_by_city_dict = {k: dict(v) for k, v in affiliations_by_city.items()}

# Make sure the output directory exists so the cell also works on a fresh checkout.
os.makedirs('mappers', exist_ok=True)

# Dump the converted plain dicts (previously they were built but the
# defaultdicts were written instead). ensure_ascii=False keeps any non-ASCII
# characters readable in the files.
with open('mappers/affiliations_by_city.json', 'w', encoding='utf-8') as json_file:
    json.dump(affiliations_by_city_dict, json_file, ensure_ascii=False, indent=4)

with open('mappers/universities_by_city.json', 'w', encoding='utf-8') as json_file:
    json.dump(universities_by_city_dict, json_file, ensure_ascii=False, indent=4)

print("Data saved to JSON files!")

Data saved to JSON files!


In [11]:
import Levenshtein

# Two sample names that differ only in their final character ('j' vs 'i').
string1, string2 = "university of moahmmed j", "university of moahmmed i"

# Levenshtein (edit) distance between the two samples.
distance = Levenshtein.distance(string1, string2)

# Report the computed distance.
print(f"The Levenshtein distance between the strings is: {distance}")


The Levenshtein distance between the strings is: 1
