#### Read CSV

In [2]:
import pandas as pd
import re

In [3]:
# Load the exported publication records; the path is relative to the working directory.
df = pd.read_csv('data/test.csv')

In [4]:
# Show the column labels of the loaded frame.
df.columns

Index(['Authors', 'Author full names', 'Author(s) ID', 'Title', 'Year',
       'Source title', 'Cited by', 'DOI', 'Link', 'Affiliations',
       'Authors with affiliations', 'Author Keywords', 'Index Keywords',
       'Editors', 'Publisher', 'ISSN', 'ISBN', 'CODEN', 'PubMed ID',
       'Language of Original Document', 'Document Type', 'Source', 'EID'],
      dtype='object')

In [7]:
def extract_ids(text):
    """Return every run of digits that appears wrapped in parentheses in ``text``.

    The digits are returned as strings, in order of appearance and without the
    surrounding parentheses. An empty list is returned when nothing matches.
    """
    parenthesised_digits = re.compile(r'\((\d+)\)')
    return [hit.group(1) for hit in parenthesised_digits.finditer(text)]

In [45]:
def duplicate_rows_by_author(df):
    """Build one output row per author whose paired affiliation is in Morocco.

    For every record in ``df`` the ``"Author full names"`` and ``"Affiliations"``
    cells are split on ``';'`` and paired position-by-position (``zip`` stops at
    the shorter of the two lists). A pair is kept only when:

    * the author entry contains both a ``Surname, Given`` name and a numeric id
      in parentheses, and
    * the text after the last comma of the affiliation, stripped and
      lower-cased, is ``"morocco"`` or ``"maroc"``.

    Pairs that cannot be parsed are skipped instead of raising: records whose
    author or affiliation cell is not a string (e.g. NaN for a missing value)
    and affiliations that contain no comma.

    NOTE(review): the pairing is purely positional; confirm that the
    ``"Affiliations"`` column lists one affiliation per author in author order.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``"Author full names"`` and ``"Affiliations"``.

    Returns
    -------
    pandas.DataFrame
        Columns ``author_id``, ``author_name``, ``affiliation_full_name`` and
        ``country`` (all strings) with a fresh 0..n-1 index. Empty, but with
        the same columns, when nothing qualifies.
    """
    columns = ["author_id", "author_name", "affiliation_full_name", "country"]

    # search() returns the first "letters, letters" run, so only the part of the
    # name made of the listed letters on either side of the comma is captured.
    name_pattern = re.compile(r"[A-Za-záéíóúÁÉÍÓÚñÑ]+,\s[A-Za-záéíóúÁÉÍÓÚñÑ]+")
    id_pattern = re.compile(r'\((\d+)\)')

    # Collect plain records and build the frame once at the end; concatenating
    # inside the loop copies the accumulated frame on every iteration.
    records = []

    for authors_cell, affiliations_cell in zip(df["Author full names"], df["Affiliations"]):
        # Missing cells are read by pandas as float NaN, which has no .split().
        if not isinstance(authors_cell, str) or not isinstance(affiliations_cell, str):
            continue

        for auth_id_name, aff in zip(authors_cell.split(';'), affiliations_cell.split(';')):
            name_match = name_pattern.search(auth_id_name)
            id_match = id_pattern.search(auth_id_name)
            if not (id_match and name_match):
                continue

            # The country is whatever follows the last comma; an affiliation
            # without any comma has no country part and is skipped.
            _, comma, tail = aff.rpartition(',')
            if not comma:
                continue
            country = tail.strip().lower()
            if country not in ["morocco", "maroc"]:
                continue

            records.append({
                "author_id": id_match.group(1),
                "author_name": name_match.group(),
                "affiliation_full_name": aff,
                "country": country,
            })

    # Passing `columns` keeps the expected schema even when `records` is empty.
    return pd.DataFrame(records, columns=columns)

In [47]:
# Build the per-author table from the loaded records.
trans_df = duplicate_rows_by_author(df)
# Save it as CSV without the index column, then display it as the cell output.
trans_df.to_csv("metadata.csv", index=False)
trans_df

Unnamed: 0,author_id,author_name,affiliation_full_name,country
0,23004567900,"Khachane, M","Laboratoire de Chimie du Solide Minéral, dépar...",morocco
1,6701835701,"Villain, S",Laboratoire de physique du Solide et couches ...,morocco
2,12782205700,"Mjahed, Mostafa","Ecole Royale de l'Air, Maths and Systems Dept....",morocco
3,56698346700,"Sáenz, L","Departement de Biologie, Université Ibn Tofaï...",morocco
4,56615820600,"Bahi, S","Mohammed v University, Faculty of Sciences, Ra...",morocco
...,...,...,...,...
150,23978044400,"Elghazi, Haddou","Solid State Physics Laboratory, Faculty of Sci...",morocco
151,7006857659,"Jorio, Anouar","Regional Center of Interface, Sidi Mohammed B...",morocco
152,12239828500,"Ouarsal, Rachid",Laboratoire DIngénierie des Matériaux Organom...,morocco
153,8415913200,"Bali, Brahim","Department of Chemistry, Faculty of Sciences,...",morocco


In [7]:
# Load the Moroccan affiliations CSV; its 'City' column is read in the next cell.
affiliations_df = pd.read_csv('data/Universities-Affiliations/Moroccan-Affiliations.csv')

In [26]:
# Distinct, whitespace-trimmed city names from the affiliations table, in order
# of first appearance. Missing City values are dropped first: calling
# x.strip() on a NaN (float) would raise AttributeError.
french_cit_names = affiliations_df['City'].dropna().str.strip().unique().tolist()

In [25]:
# English spellings of the city names. Order matters: a later cell zips this
# list position-by-position with french_cit_names to build the `cities` mapping.
english_city_names = [
    "Tetouan",
    "Tangier",
    "Larache",
    "Ksar el-Kebir",
    "Al Hoceima",
    "Marrakesh",
    "El Kelaa of Sraghna",
    "Safi",
    "Essaouira",
    "El Jadida",
    "Sidi Bennour",
    "Settat",
    "Berrechid",
    "Casablanca",
    "Mohammedia",
    "Kenitra",
    "Agadir",
    "Ait Melloul",
    "Guelmim",
    "Ouarzazate",
    "Taroudant",
    "Smara",
    "Laayoune",
    "Dakhla",
    "Oujda",
    "Nador",
    "Berkane",
    "Rabat",
    "Sale",
    "Meknes",
    "Errachidia",
    "Fez",
    "Taza",
    "Beni Mellal",
    "Khouribga",
    "Khenifra",
    "Fquih Ben Salah"
]

In [27]:
# Map each French city name to its English spelling. The two lists are paired
# purely by position, so a length mismatch would make zip() silently drop the
# unmatched tail; fail loudly instead of building a truncated mapping.
if len(french_cit_names) != len(english_city_names):
    raise ValueError(
        "City lists differ in length: {} French vs {} English names".format(
            len(french_cit_names), len(english_city_names)
        )
    )
cities = dict(zip(french_cit_names, english_city_names))
cities
    

{'Tétouan': 'Tetouan',
 'Tanger': 'Tangier',
 'Larache': 'Larache',
 'Kssar El Kébir': 'Ksar el-Kebir',
 'Al Hoceima': 'Al Hoceima',
 'Marrakech': 'Marrakesh',
 'EL Kelaâ des Sraghna': 'El Kelaa of Sraghna',
 'Safi': 'Safi',
 'Essaouira': 'Essaouira',
 'El Jadida': 'El Jadida',
 'Sidi Bennour': 'Sidi Bennour',
 'Settat': 'Settat',
 'Berrechid': 'Berrechid',
 'Casablanca': 'Casablanca',
 'Mohammedia': 'Mohammedia',
 'Kénitra': 'Kenitra',
 'Agadir': 'Agadir',
 'Ait Melloul': 'Ait Melloul',
 'Guelmim': 'Guelmim',
 'Ouarzazate': 'Ouarzazate',
 'Taroudant': 'Taroudant',
 'Es-Semara': 'Smara',
 'Laayoune': 'Laayoune',
 'Dakhla': 'Dakhla',
 'Oujda': 'Oujda',
 'Nador': 'Nador',
 'Berkane': 'Berkane',
 'Rabat': 'Rabat',
 'Salé': 'Sale',
 'Meknès': 'Meknes',
 'Errachidia': 'Errachidia',
 'Fès': 'Fez',
 'Taza': 'Taza',
 'Béni Mellal': 'Beni Mellal',
 'Khouribga': 'Khouribga',
 'Khénifra': 'Khenifra',
 'Fquih Ben Salah': 'Fquih Ben Salah'}

In [29]:
from unidecode import unidecode

# Alias for the French city names collected earlier
french_words = french_cit_names

# Run every name through unidecode to strip the accents
plain_words = list(map(unidecode, french_words))

print(plain_words)

['Tetouan', 'Tanger', 'Larache', 'Kssar El Kebir', 'Al Hoceima', 'Marrakech', 'EL Kelaa des Sraghna', 'Safi', 'Essaouira', 'El Jadida', 'Sidi Bennour', 'Settat', 'Berrechid', 'Casablanca', 'Mohammedia', 'Kenitra', 'Agadir', 'Ait Melloul', 'Guelmim', 'Ouarzazate', 'Taroudant', 'Es-Semara', 'Laayoune', 'Dakhla', 'Oujda', 'Nador', 'Berkane', 'Rabat', 'Sale', 'Meknes', 'Errachidia', 'Fes', 'Taza', 'Beni Mellal', 'Khouribga', 'Khenifra', 'Fquih Ben Salah']
