In [34]:
import pandas as pd
import json
import re
import unicodedata

In [35]:
# Read the raw 2023 Scopus CSV into a DataFrame (path is relative to the working directory).
df = pd.read_csv(filepath_or_buffer='../data/raw/scopus/2023.csv')

In [36]:
def extract_ids(text):
    """Collect every run of digits that is wrapped in parentheses.

    Args:
        text: String to scan, e.g. ``"Doe, J. (123); Roe, K. (456)"``.

    Returns:
        list[str]: The digit strings without their parentheses, in order of
        appearance; an empty list when nothing matches.
    """
    parenthesised_digits = re.compile(r'\((\d+)\)')
    return [found.group(1) for found in parenthesised_digits.finditer(text)]

In [37]:
def extract_country_name(affiliation_full_name):
    """Return the lower-cased text after the last comma of an affiliation string.

    Args:
        affiliation_full_name: Comma-separated affiliation string, e.g.
            ``"Some University, Rabat, Morocco"``.

    Returns:
        str | None: The last comma-separated field, stripped and lower-cased
        (the whole string when it contains no comma), or ``None`` when the
        input is not a string (for example a missing value).
    """
    # An explicit type check replaces the former bare ``except:``, which also
    # swallowed KeyboardInterrupt/SystemExit and hid genuine programming errors.
    if not isinstance(affiliation_full_name, str):
        return None

    parts = affiliation_full_name.lower().split(',')
    return parts[-1].strip()

In [38]:
# Parse the city lookup table from its JSON file; extract_city_name reads it as a module-level dict.
with open('mappers/cities_mapping.json', mode='r', encoding='utf-8') as json_file:
    cities_mapping = json.loads(json_file.read())

In [39]:
def extract_city_name(affiliation_full_name):
    """Map an affiliation string to a city using the module-level ``cities_mapping``.

    Two strategies are tried in order:

    1. Exact match: the second-to-last comma-separated field is looked up
       as a key of ``cities_mapping``.
    2. Substring match: the first key of ``cities_mapping`` (in dict order)
       that occurs anywhere in the lower-cased string wins.

    Args:
        affiliation_full_name: Comma-separated affiliation string.

    Returns:
        The value stored in ``cities_mapping`` for the matching key, or
        ``None`` when no key matches or the input is not a string.
    """
    # Mirror extract_country_name: non-string input (e.g. a missing value) yields None.
    if not isinstance(affiliation_full_name, str):
        return None

    affiliation = affiliation_full_name.lower()
    parts = affiliation.split(',')

    if len(parts) > 1:
        expected_city = parts[-2].strip()
        # Direct dict lookup; equivalent to scanning every key for equality.
        if expected_city in cities_mapping:
            return cities_mapping[expected_city]

    # Fallback is a plain substring test, so a short key can also match
    # inside a longer word of the affiliation.
    for key, value in cities_mapping.items():
        if key in affiliation:
            return value

    return None

In [40]:
# Parse the per-city affiliation lookup from its JSON file.
with open('mappers/affiliations_by_city.json', mode='r', encoding='utf-8') as json_file:
    affiliations_by_city = json.loads(json_file.read())

# Parse the per-city university lookup from its JSON file.
with open('mappers/universities_by_city.json', mode='r', encoding='utf-8') as json_file:
    universities_by_city = json.loads(json_file.read())

In [41]:
def extract_author_id_name(auth_id_name):
    """Split one author entry such as ``"Doe, John (57190000000)"`` into id and name.

    Args:
        auth_id_name: A single author entry containing a ``"Last, First"``
            style name and a numeric id wrapped in parentheses.

    Returns:
        tuple: ``(author_id, author_name)`` as strings, or ``(None, None)``
        when the id or the name cannot be found or the input is not a string.
    """
    if not isinstance(auth_id_name, str):
        return None, None

    # One name part: starts with a letter and continues with letters, spaces,
    # periods, apostrophes or hyphens. ``[^\W\d_]`` means "any Unicode letter",
    # so diacritics beyond the Spanish ones are accepted and multi-word or
    # hyphenated names are no longer cut down to their last ASCII fragment.
    name_part = r"[^\W\d_](?:[^\W\d_]|[ .'\u2019\-])*"
    name_match = re.search(rf"{name_part},\s{name_part}", auth_id_name)

    # The id is the first run of digits enclosed in parentheses.
    id_match = re.search(r'\((\d+)\)', auth_id_name)

    if id_match and name_match:
        # The name part may end with the space that precedes "(id)"; trim it.
        return id_match.group(1), name_match.group().strip()
    return None, None

In [42]:
def remove_accents(text):
    """Strip combining marks (accents) from ``text``.

    The string is decomposed with NFKD normalisation and every character whose
    Unicode combining class is non-zero is dropped. Characters without a
    decomposition are kept as they are, so the result is not guaranteed to be
    pure ASCII.

    Args:
        text: String to clean.

    Returns:
        str: The decomposed string without combining characters.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    base_characters = [ch for ch in decomposed if unicodedata.combining(ch) == 0]
    return ''.join(base_characters)

In [43]:
def normalize_digits(affiliation):
    """Rewrite selected ordinal words and Roman numerals as the digits 1, 2 and 5.

    Only whole-word, lower-case occurrences are replaced (the patterns are
    case-sensitive); this includes a standalone ``i`` or ``v``.

    Args:
        affiliation: String to normalise.

    Returns:
        str: The string with every matching token replaced by its digit.
    """
    # (pattern, digit) pairs, applied in this order.
    substitutions = (
        (r'\b(first|1st|1er|i)\b', '1'),  # variations of 1
        (r'\b(second|2nd|ii)\b', '2'),    # variations of 2
        (r'\b(fifth|5th|v)\b', '5'),      # variations of 5
    )

    result = affiliation
    for regex, digit in substitutions:
        result = re.compile(regex).sub(digit, result)
    return result

In [44]:
def normalize_affiliation(affiliation):
    """Bring an affiliation string into the canonical form used for lookups.

    The steps run in this order: lower-case the text, strip accents with
    ``remove_accents`` (to simplify the mapping), then rewrite ordinals with
    ``normalize_digits``.

    Args:
        affiliation: Raw affiliation string.

    Returns:
        str: The normalised string.
    """
    lowered = affiliation.lower()
    return normalize_digits(remove_accents(lowered))

In [45]:
def extract_affiliation_name(affiliation_full_name, city):
    """Find the known affiliation mentioned in an affiliation string for a given city.

    The string is normalised with ``normalize_affiliation`` and then searched,
    as whole words, for each alias listed under ``city`` first in
    ``affiliations_by_city`` and then in ``universities_by_city``. The first
    alias found wins.

    Args:
        affiliation_full_name: Raw affiliation string.
        city: Key selecting the per-city sub-dictionary; a falsy value
            short-circuits to ``(None, None)``.

    Returns:
        tuple: ``(key, alias)`` where ``key`` is the dictionary key whose alias
        list contained the match and ``alias`` is the matched text, or
        ``(None, None)`` when nothing matches.
    """
    if not city:
        return None, None

    affiliation = normalize_affiliation(affiliation_full_name)

    # Order matters: the affiliation table is consulted before the university table.
    for lookup in (affiliations_by_city, universities_by_city):
        # ``.get`` so that a city absent from one table is simply skipped
        # instead of raising KeyError (previously only the university table
        # was guarded).
        for key, values in lookup.get(city, {}).items():
            for val in values:
                if re.search(rf'\b{re.escape(val)}\b', affiliation):
                    return key, val

    return None, None

In [46]:
def duplicate_rows_by_author(df):
    """Explode publication rows into one row per (author, affiliation) pair.

    For every input row the ``"Author full names"`` and ``"Affiliations"``
    cells are split on ``;`` and paired positionally. A pair is kept only when
    an author id and name can be extracted and the affiliation's country is
    ``"morocco"`` or ``"maroc"``.

    Args:
        df: DataFrame with the string columns ``"Author full names"`` and
            ``"Affiliations"``.

    Returns:
        pandas.DataFrame: Columns ``author_id``, ``author_name``,
        ``affiliation_full_name``, ``city``, ``affiliation`` and
        ``affiliation_id``; ``city``, ``affiliation`` and ``affiliation_id``
        are ``None`` where no match was found.
    """
    columns = ["author_id", "author_name", "affiliation_full_name", "city", "affiliation", "affiliation_id"]
    # Collect plain records and build the frame once; concatenating inside the
    # loop re-copies the accumulated frame for every input row.
    records = []

    for authors_cell, affiliations_cell in zip(df["Author full names"], df["Affiliations"]):
        # Missing cells are not strings (e.g. NaN); skip the row instead of
        # failing on ``.split``.
        if not (isinstance(authors_cell, str) and isinstance(affiliations_cell, str)):
            continue

        # NOTE(review): zip pairs entries by position and stops at the shorter
        # list, so extra authors or affiliations are silently dropped. Confirm
        # that "Affiliations" really holds one entry per author in author order.
        for auth_id_name, aff in zip(authors_cell.split(';'), affiliations_cell.split(';')):
            author_id, author_name = extract_author_id_name(auth_id_name)
            if not (author_id and author_name):
                continue

            # Keep Moroccan affiliations only.
            country = extract_country_name(aff)
            if country not in ["morocco", "maroc"]:
                continue

            city = extract_city_name(aff)
            aff_id, affiliation = extract_affiliation_name(aff, city)

            records.append({
                "author_id": author_id,
                "author_name": author_name,
                # Stored exactly as split, i.e. without stripping whitespace.
                "affiliation_full_name": aff,
                "city": city,
                "affiliation": affiliation,
                "affiliation_id": aff_id,
            })

    return pd.DataFrame(records, columns=columns)

In [47]:
# Build the per-author table from the raw frame loaded earlier in the notebook.
trans_df = duplicate_rows_by_author(df=df)

In [48]:
# Persist the per-author table as CSV without writing the index column.
trans_df.to_csv(path_or_buf="../data/transformed/2023.csv", index=False)

In [50]:
# Non-null count per column: shows how many rows received a city and an affiliation match.
trans_df.count()

author_id                28948
author_name              28948
affiliation_full_name    28948
city                     26112
affiliation              19522
affiliation_id           19522
dtype: int64

In [49]:
# Export only the raw affiliation text next to the matched affiliation, without the index column.
transformed = trans_df.loc[:, ['affiliation_full_name', 'affiliation']]
transformed.to_csv(path_or_buf="../data/transformed/result.csv", index=False)