## Datasets Matching (Scopus - Google Scholar - Web Of Science)

In [1]:
import pandas as pd
import numpy as np
import unidecode

In [155]:
scop = pd.read_csv("datasets/scopus_dataset.csv")
scop = scop.drop_duplicates()
google_sch = pd.read_csv("datasets/scholar_dataset.csv")
wos= pd.read_csv("datasets/authors_data_wos.csv")


- Harmoniser les noms de colonnes pour simplifier le traitement

In [156]:
# Harmoniser les noms de colonnes pour `wos`
scop.rename(columns={
    "Author_ID": "Author_ID_scop",
    "Author_Name": "Author_Name_scop",
    "Author_Affiliation": "Author_Affiliation_scop",
    "Author_Citations": "Author_Citations_scop",
    "Author_h-index": "Author_h-index_scop",
    "Co_authors_Names": "Co_authors_Names_scop"
}, inplace=True)
wos.rename(columns={
    "ID de l'Auteur": "Author_ID_wos",
    "nom_complet": "Author_Name_wos",
    "pays_affiliation": "Author_Affiliation_wos",
    "Sum of Times Cited": "Author_Citations_wos",
    "H-index": "Author_h-index_wos",
    "co_auteurs": "Co_authors_Names_wos",
    'Articles' : 'Articles_wos' ,
    'Journal' : 'Journal_wos'
}, inplace=True)
google_sch.rename(columns={
    "ID de l'Auteur": "Author_ID_gosch",
    "Nom Complet": "Author_Name_gosch",
    "Pays d'Affiliation": "Author_Affiliation_gosch",
    "Citations Totales": "Author_Citations_gosch",
    "H-index": "Author_h-index_gosch",   
    "FWCI": "Author_FWCI_gosch",
    "Co-auteurs": "Co_authors_Names_gosch"
}, inplace=True)

In [157]:
print(scop.columns)
print(wos.columns)
print(google_sch.columns)

Index(['Author_ID_scop', 'Author_Name_scop', 'Author_Affiliation_scop',
       'Author_Citations_scop', 'Author_Documents', 'Author_h-index_scop',
       'Author_FWCI', 'Co_Authors_IDs', 'Co_authors_Names_scop'],
      dtype='object')
Index(['Author_ID_wos', 'Author_Name_wos', 'Author_Affiliation_wos',
       'Co_authors_Names_wos', 'H-Index', 'Author_Citations_wos',
       'Articles_wos', 'Journal_wos'],
      dtype='object')
Index(['Author_ID_gosch', 'Author_Name_gosch', 'Author_Affiliation_gosch',
       'Author_Citations_gosch', 'Author_h-index_gosch', 'Author_FWCI_gosch',
       'Co_authors_Names_gosch'],
      dtype='object')


In [158]:
scop["Author_Name_scop"][:1]

0    Imad, Hafidi
Name: Author_Name_scop, dtype: object

In [159]:
wos["Author_Name_wos"][:1]

0    Aghriche, Ahmed
Name: Author_Name_wos, dtype: object

In [160]:
google_sch["Author_Name_gosch"][:1]

0    Abdelghani Ghazdali
Name: Author_Name_gosch, dtype: object

**Remplir les valeurs manquantes.**

**Avec les règles:** 
- Priorité à scop.
- Utiliser wos si scop est vide.
- Si les deux sont nulles, insérer NaN.


- Effectuer la jointure entre scop, wos, et google scholar

In [161]:
import re
def name_to_word_set(name):
    if pd.isnull(name):
        return set()  # Gérer les valeurs manquantes
    # Supprimer les caractères spécifiques et normaliser
    cleaned_name = re.sub(r'[^\w\s]', '', name)  # Garder uniquement les lettres, chiffres et espaces
    normalized_name = unidecode.unidecode(cleaned_name.lower().strip())  # Supprimer les accents, passer en minuscule
    return set(normalized_name.split()) 
name_to_word_set("Imad, Hafidi")

{'hafidi', 'imad'}

In [162]:
name_to_word_set("imad hafidi"), name_to_word_set(" Hafidi, Imad"), name_to_word_set("imad hafidi")

({'hafidi', 'imad'}, {'hafidi', 'imad'}, {'hafidi', 'imad'})

In [163]:
def name_to_word_set(name):
    if pd.isnull(name):
        return set() 
    # Supprimer les caractères spécifiques
    cleaned_name = re.sub(r'[^\w\s]', '', name)  
    normalized_name = unidecode.unidecode(cleaned_name.lower().strip())  # Normalisation
    return set(normalized_name.split())  


def find_matches(df1, col1, df2, col2):
    matches = []
    for idx1, name1 in enumerate(df1[col1]):
        set1 = name_to_word_set(name1)
        match_found = None
        for idx2, name2 in enumerate(df2[col2]):
            set2 = name_to_word_set(name2)
            if set1 == set2:
                match_found = df2.iloc[idx2].to_dict()
                print(f"Match found: {name1} ({set1}) == {name2} ({set2})")
                break
#         if match_found is None:
#             print(f"No match for: {name1} ({set1})")
        matches.append(match_found)
    return pd.DataFrame(matches)


# Jointure entre Scopus et WoS
scop_wos_matched = find_matches(scop, 'Author_Name_scop', wos, 'Author_Name_wos')

# Ajouter les résultats de WoS à Scopus
scop_wos_merged = pd.concat([scop, scop_wos_matched.add_suffix('_wos')], axis=1)

# Jointure entre le résultat précédent et Google Scholar
final_matched = find_matches(scop_wos_merged, 'Author_Name_scop', google_sch, 'Author_Name_gosch')

# Ajouter les résultats de Google Scholar
final_merged = pd.concat([scop_wos_merged, final_matched.add_suffix('_gosch')], axis=1)

Match found: Imad, Hafidi ({'hafidi', 'imad'}) == Hafidi, Imad ({'hafidi', 'imad'})
Match found: Aboutabit, Noureddine ({'noureddine', 'aboutabit'}) == noureddine aboutabit ({'noureddine', 'aboutabit'})
Match found: Lamghari, Nidal ({'lamghari', 'nidal'}) == Lamghari, Nidal ({'lamghari', 'nidal'})
Match found: Hadri, Aissam ({'aissam', 'hadri'}) == Hadri, Aissam ({'aissam', 'hadri'})
Match found: Rochd, Yassir ({'yassir', 'rochd'}) == Rochd, Yassir ({'yassir', 'rochd'})
Match found: Nachaoui, Mourad ({'mourad', 'nachaoui'}) == Mourad Nachaoui ({'mourad', 'nachaoui'})
Match found: Aboutabit, Noureddine ({'noureddine', 'aboutabit'}) == noureddine aboutabit ({'noureddine', 'aboutabit'})
Match found: Aghriche, Ahmed ({'aghriche', 'ahmed'}) == Aghriche, Ahmed ({'aghriche', 'ahmed'})
Match found: Dargham, Abdelmajid ({'abdelmajid', 'dargham'}) == Dargham, Abdelmajid ({'abdelmajid', 'dargham'})
Match found: Ghazdali, Abdelghani ({'ghazdali', 'abdelghani'}) == Ghazdali, Abdelghani ({'ghazdali'

AttributeError: 'NoneType' object has no attribute 'keys'

In [165]:
final_merged.head(50)

Unnamed: 0,Author_ID_scop,Author_Name_scop,Author_Affiliation_scop,Author_Citations_scop,Author_Documents,Author_h-index_scop,Author_FWCI,Co_Authors_IDs,Co_authors_Names_scop,0_wos,0_gosch,Author_ID_wos,Author_Name_wos,Author_Affiliation_wos,Co_authors_Names_wos,H-Index,Author_Citations_wos,Articles,Journal
0,15753326700,"Imad, Hafidi",Beni Mellal - Morocco,163,53,7,1.17,"['8907520500', '58184510600', '57194031595', '...","['Ciupercǎ, Ionel Sorin', 'El Bahy, Siham', 'A...",,,,,,,,,,
1,8907520500,"Ciupercǎ, Ionel Sorin",Villeurbanne - France,400,50,12,0.34,"['6603450560', '7003407848', '8907520500', '65...","['Arnaud Heibig', 'Eduard Feireisl', 'Ciupercǎ...",,,,,,,,,,
2,58184510600,"El Bahy, Siham",Beni Mellal - Morocco,0,3,0,0.0,"['58183809000', '58184510600', '16067923100']","['Hind Ait Mait', 'El Bahy, Siham', 'Aboutabit...",,,,,,,,,,
3,57194031595,"Abdellatif, Lasbahani",Beni Mellal - Morocco,13,11,2,0.21,"['57194031595', '57191969067', '6506730355', '...","['Abdellatif, Lasbahani', 'Mostafa Chhiba', 'A...",,,,,,,,,,
4,57202849162,"Khalfi, Hamza",Beni Mellal - Morocco,29,11,3,0.38,"['57222063445', '6701382955', '57202849162', '...","['Ourdou, Amal', 'Guedda, Mohammed', 'Khalfi, ...",,,,,,,,,,
5,57345703800,"Labdiad, Fatah",Beni Mellal - Morocco,0,2,0,0.0,"['15753326700', '57345703800', '57210186235']","['Imad, Hafidi', 'Labdiad, Fatah', 'Nasri, Meh...",,,,,,,,,,
6,56433548800,"Idrissi, Nadia",Khouribga - Morocco,26,6,3,1.16,"['57205406993', '56433548800']","['Ifzarne, Samir', 'Idrissi, Nadia']",,,,,,,,,,
7,16067923100,"Aboutabit, Noureddine",Beni Mellal - Morocco,343,41,7,0.57,"['55505648000', '57208248277', '58184510600', ...","['Said El Kafhali', 'Mohammed Srati', 'El Bahy...",,,,,,,,,,
8,57194380165,"Lamghari, Nidal",Beni Mellal - Morocco,82,9,3,4.87,"['56168902600', '57202849162', '16067923100', ...","['Ghazdali, Abdelghani', 'Khalfi, Hamza', 'Abo...","{'Author_ID_wos': 63349824, 'Author_Name_wos':...",,63349824.0,"Lamghari, Nidal",MOROCCO,[],0.0,0.0,[{'Titre de l’article': 'Subword recognition i...,"[{'issn': '1433-2833', 'scope': 'The large num..."
9,58184222300,"Jbel, Mouad",Beni Mellal - Morocco,0,2,0,0.0,"['15753326700', '58183671900', '58184222300']","['Imad, Hafidi', 'Jabrane, Mourad', 'Jbel, Mou...",,,,,,,,,,


In [118]:
final_merged.head(50)

Unnamed: 0,Author_ID_scop,Author_Name_scop,Author_Affiliation_scop,Author_Citations_scop,Author_Documents,Author_h-index_scop,Author_FWCI,Co_Authors_IDs,Co_authors_Names_scop,0_wos,0_gosch,Author_ID_wos,Author_Name_wos,Author_Affiliation_wos,Co_authors_Names_wos,H-Index,Author_Citations_wos,Articles,Journal
0,15753326700,"Imad, Hafidi",Beni Mellal - Morocco,163,53,7,1.17,"['8907520500', '58184510600', '57194031595', '...","['Ciupercǎ, Ionel Sorin', 'El Bahy, Siham', 'A...",,,,,,,,,,
1,8907520500,"Ciupercǎ, Ionel Sorin",Villeurbanne - France,400,50,12,0.34,"['6603450560', '7003407848', '8907520500', '65...","['Arnaud Heibig', 'Eduard Feireisl', 'Ciupercǎ...",,,,,,,,,,
2,58184510600,"El Bahy, Siham",Beni Mellal - Morocco,0,3,0,0.0,"['58183809000', '58184510600', '16067923100']","['Hind Ait Mait', 'El Bahy, Siham', 'Aboutabit...",,,,,,,,,,
3,57194031595,"Abdellatif, Lasbahani",Beni Mellal - Morocco,13,11,2,0.21,"['57194031595', '57191969067', '6506730355', '...","['Abdellatif, Lasbahani', 'Mostafa Chhiba', 'A...",,,,,,,,,,
4,57202849162,"Khalfi, Hamza",Beni Mellal - Morocco,29,11,3,0.38,"['57222063445', '6701382955', '57202849162', '...","['Ourdou, Amal', 'Guedda, Mohammed', 'Khalfi, ...",,,,,,,,,,
5,57345703800,"Labdiad, Fatah",Beni Mellal - Morocco,0,2,0,0.0,"['15753326700', '57345703800', '57210186235']","['Imad, Hafidi', 'Labdiad, Fatah', 'Nasri, Meh...",,,,,,,,,,
6,56433548800,"Idrissi, Nadia",Khouribga - Morocco,26,6,3,1.16,"['57205406993', '56433548800']","['Ifzarne, Samir', 'Idrissi, Nadia']",,,,,,,,,,
7,16067923100,"Aboutabit, Noureddine",Beni Mellal - Morocco,343,41,7,0.57,"['55505648000', '57208248277', '58184510600', ...","['Said El Kafhali', 'Mohammed Srati', 'El Bahy...",,,,,,,,,,
8,57194380165,"Lamghari, Nidal",Beni Mellal - Morocco,82,9,3,4.87,"['56168902600', '57202849162', '16067923100', ...","['Ghazdali, Abdelghani', 'Khalfi, Hamza', 'Abo...","{'Author_ID_wos': 63349824, 'Author_Name_wos':...",,63349824.0,"Lamghari, Nidal",MOROCCO,[],0.0,0.0,[{'Titre de l’article': 'Subword recognition i...,"[{'issn': '1433-2833', 'scope': 'The large num..."
9,58184222300,"Jbel, Mouad",Beni Mellal - Morocco,0,2,0,0.0,"['15753326700', '58183671900', '58184222300']","['Imad, Hafidi', 'Jabrane, Mourad', 'Jbel, Mou...",,,,,,,,,,


In [167]:
final_merged = final_merged.drop(columns=["0_wos", "0_gosch"])
final_merged.head(1)

Unnamed: 0,Author_ID_scop,Author_Name_scop,Author_Affiliation_scop,Author_Citations_scop,Author_Documents,Author_h-index_scop,Author_FWCI,Co_Authors_IDs,Co_authors_Names_scop,Author_ID_wos,Author_Name_wos,Author_Affiliation_wos,Co_authors_Names_wos,H-Index,Author_Citations_wos,Articles,Journal
0,15753326700,"Imad, Hafidi",Beni Mellal - Morocco,163,53,7,1.17,"['8907520500', '58184510600', '57194031595', '...","['Ciupercǎ, Ionel Sorin', 'El Bahy, Siham', 'A...",,,,,,,,


In [168]:
final_merged.isnull().sum()

Author_ID_scop               0
Author_Name_scop             0
Author_Affiliation_scop      0
Author_Citations_scop        0
Author_Documents             0
Author_h-index_scop          0
Author_FWCI                  0
Co_Authors_IDs               0
Co_authors_Names_scop        0
Author_ID_wos              199
Author_Name_wos            199
Author_Affiliation_wos     199
Co_authors_Names_wos       199
H-Index                    199
Author_Citations_wos       199
Articles                   199
Journal                    199
dtype: int64

In [170]:
import re 

# Fonction pour extraire le nom du pays
def extract_country(affiliation):
    if pd.isnull(affiliation):  # Vérifie si la valeur est NaN
        return None
    match = re.search(r'-\s*(\w+)$', affiliation)  # Recherche le texte après le tiret
    return match.group(1) if match else affiliation  # Retourne le pays s'il est trouvé

# Appliquer la fonction sur la colonne 'Author_Affiliation_scop'
final_merged['Author_Affiliation_scop'] = final_merged['Author_Affiliation_scop'].apply(extract_country)

final_merged = final_merged.drop(columns=["Author_ID_wos", "Author_Name_wos", "Author_Affiliation_wos", "Co_authors_Names_wos", "H-Index","Author_Citations_wos","Articles", "Journal"])

# Afficher les premières lignes pour validation
final_merged["Author_Affiliation_scop"].head(2)

0    Morocco
1     France
Name: Author_Affiliation_scop, dtype: object

In [171]:
final_merged.isnull().sum()

Author_ID_scop             0
Author_Name_scop           0
Author_Affiliation_scop    0
Author_Citations_scop      0
Author_Documents           0
Author_h-index_scop        0
Author_FWCI                0
Co_Authors_IDs             0
Co_authors_Names_scop      0
dtype: int64

In [173]:
try:
    final_merged.to_csv("datasets/merged_authors_datasets.csv", index=False)
    print("Dataset saved")
except: 
    print("Error..")

Dataset saved


In [174]:
df = pd.read_csv("datasets/merged_authors_datasets.csv")
df.head(3)

Unnamed: 0,Author_ID_scop,Author_Name_scop,Author_Affiliation_scop,Author_Citations_scop,Author_Documents,Author_h-index_scop,Author_FWCI,Co_Authors_IDs,Co_authors_Names_scop
0,15753326700,"Imad, Hafidi",Morocco,163,53,7,1.17,"['8907520500', '58184510600', '57194031595', '...","['Ciupercǎ, Ionel Sorin', 'El Bahy, Siham', 'A..."
1,8907520500,"Ciupercǎ, Ionel Sorin",France,400,50,12,0.34,"['6603450560', '7003407848', '8907520500', '65...","['Arnaud Heibig', 'Eduard Feireisl', 'Ciupercǎ..."
2,58184510600,"El Bahy, Siham",Morocco,0,3,0,0.0,"['58183809000', '58184510600', '16067923100']","['Hind Ait Mait', 'El Bahy, Siham', 'Aboutabit..."


In [179]:
df.rename(columns={
    "Author_ID_scop": "Author_ID",
    "Author_Name_scop": "Author_Name",
    "Author_Affiliation_scop" : "Author_Affiliation",
    "Author_Citations_scop": "Author_Citations",
    "Author_Documents": "Author_Documents_number",
    "Author_h-index_scop" : "H-index",
    "Author_FWCI" : "FWCI",
    "Co_authors_Names_scop" : "Co_authors_Names"
}, inplace=True)

In [180]:
df.columns


Index(['Author_ID', 'Author_Name', 'Author_Affiliation', 'Author_Citations',
       'Author_Documents_number', 'H-index', 'FWCI', 'Co_Authors_IDs',
       'Co_authors_Names'],
      dtype='object')

In [182]:
try:
    df.to_csv("datasets/merged_authors_datasets.csv", index=False)
    print("Dataset saved")
except: 
    print("Error..")

Dataset saved


**Documents/Journals Matching**

In [2]:
df1 = pd.read_csv("datasets/merged_authors_datasets.csv")
df2 = pd.read_json('datasets/scopus_docs_journals.json')
df3_articles = pd.read_csv("datasets/partial_article_info_gosch.csv")
df4_journals = pd.read_csv('datasets/partial_journal_info_gosch.csv')

In [3]:
df1.head(1)

Unnamed: 0,Author_ID,Author_Name,Author_Affiliation,Author_Citations,Author_Documents_number,H-index,FWCI,Co_Authors_IDs,Co_authors_Names
0,15753326700,"Imad, Hafidi",Morocco,163,53,7,1.17,"['8907520500', '58184510600', '57194031595', '...","['Ciupercǎ, Ionel Sorin', 'El Bahy, Siham', 'A..."


In [4]:
df2.head(1)

Unnamed: 0,author id,Nom_Complet,Affiliation,Citations,h-index,FWCI,co authors,articles
0,14054072000,"Kabbaj, Adil",Rabat - Morocco,95.0,6.0,0.33,"[Rosso, Paolo, Frasson, Claude, Moulin, Bernar...",[{'title': 'Characterizing land use-land cover...


In [5]:
df3_articles.head(1)

Unnamed: 0,Titre de l'article,Auteurs,Année de publication,Titre de source,Nombre de citations,Résumé,DOI,Mots-clés,Type de document,ID de l'Auteur,ISSN
0,A multi-frame super-resolution using diffusion...,"Amine Laghrib, Abdelghani Ghazdali, Abdelilah ...",2016.0,Computers & Mathematics with Applications,57.0,"In this paper, we present a new approach of mu...",https://www.sciencedirect.com/science/article/...,,Article,M_pAZvwAAAAJ,8981221


In [6]:
df4_journals.head(1)

Unnamed: 0,Nom,Editeur,ISSN,Index,H-index,Quartile,SJR,Impact factor,Portee thematique
0,Computers and Mathematics with Applications,Elsevier Ltd,8981221,1975-2023,154.0,Q1,3.084,2.944,Computers & Mathematics with Applications prov...


In [7]:
df3_articles.shape ,df4_journals.shape

((7849, 11), (7422, 9))

In [15]:
df3_articles.duplicated().sum(), df4_journals.duplicated().sum()

(1, 6331)

In [16]:
print(df3_articles['ISSN'].nunique())  # Nombre d'ISSN uniques dans df3_articles
print(df4_journals['ISSN'].nunique())  # Nombre d'ISSN uniques dans df4_journals

835
835


In [17]:
print(df3_articles['ISSN'].value_counts())  # Vérifie combien de fois chaque ISSN apparaît
print(df4_journals['ISSN'].value_counts())

ISSN
02181274, 17936551    351
-                     238
09600779              191
24700045, 24700053    115
03784371              106
                     ... 
03064573, 18735371      1
15407063, 15577023      1
02194937, 17936799      1
15628353, 18137385      1
15587916, 15587924      1
Name: count, Length: 835, dtype: int64
ISSN
02181274, 17936551    351
-                     238
09600779              191
24700045, 24700053    115
03784371              106
                     ... 
03064573, 18735371      1
15407063, 15577023      1
02194937, 17936799      1
15628353, 18137385      1
15587916, 15587924      1
Name: count, Length: 835, dtype: int64


In [29]:
df3_articles_unique = df3_articles.drop_duplicates(subset=['ISSN'])
df4_journals_unique = df4_journals.drop_duplicates(subset=['ISSN'])

merged_df_gosch = pd.merge(df3_articles_unique, df4_journals_unique, on='ISSN', how='inner')
print(merged_df_gosch.shape)  # Cela devrait limiter le nombre de lignes


(836, 19)


In [30]:
merged_df_gosch.head(2)

Unnamed: 0,Titre de l'article,Auteurs,Année de publication,Titre de source,Nombre de citations,Résumé,DOI,Mots-clés,Type de document,ID de l'Auteur,ISSN,Nom,Editeur,Index,H-index,Quartile,SJR,Impact factor,Portee thematique
0,A multi-frame super-resolution using diffusion...,"Amine Laghrib, Abdelghani Ghazdali, Abdelilah ...",2016.0,Computers & Mathematics with Applications,57.0,"In this paper, we present a new approach of mu...",https://www.sciencedirect.com/science/article/...,,Article,M_pAZvwAAAAJ,8981221,Computers and Mathematics with Applications,Elsevier Ltd,1975-2023,154.0,Q1,3.084,2.944,Computers & Mathematics with Applications prov...
1,A new method for the extraction of fetal ECG f...,"Abdelghani Ghazdali, Abdelilah Hakim, Amine La...",2015.0,Theoretical Biology and Medical Modelling,31.0,Background\nThe electrocardiogram (ECG) is a d...,https://link.springer.com/article/10.1186/s129...,,Article,M_pAZvwAAAAJ,17424682,Theoretical Biology and Medical Modelling,BioMed Central Ltd,2004-2021,56.0,Q3,1.389,1.484,Theoretical Biology and Medical Modelling is a...


In [31]:
merged_df_gosch.duplicated().sum()

0

In [32]:
merged_df_gosch.isna().sum()

Titre de l'article        5
Auteurs                   0
Année de publication     19
Titre de source           0
Nombre de citations     136
Résumé                   57
DOI                       5
Mots-clés               836
Type de document          0
ID de l'Auteur            0
ISSN                      1
Nom                       1
Editeur                  17
Index                     1
H-index                   1
Quartile                  1
SJR                       1
Impact factor             1
Portee thematique         1
dtype: int64

In [131]:
merged_df_gosch.columns = [col + "_gosch" for col in merged_df_gosch.columns]

- Les Articles de Scopus

In [34]:
df2["articles"]

0      [{'title': 'Characterizing land use-land cover...
1      [{'title': 'Artificial intelligence for assess...
2      [{'title': 'Enhancing Entity Resolution with a...
3      [{'title': 'A HYBRID MODEL FOR ARABIC SCRIPT R...
4      [{'title': 'AN INVERSE PROBLEM OF IDENTIFYING ...
                             ...                        
135    [{'title': 'Hyperspectral Image Completion Via...
136    [{'title': 'Existence and uniqueness for a cou...
137    [{'title': 'What distinguishes conspiracy from...
138    [{'title': 'Zero-day attack detection: a syste...
139    [{'title': 'Machine learning analysis of breas...
Name: articles, Length: 140, dtype: object

In [46]:
df2["articles"][0]


[{'title': 'Characterizing land use-land cover changes in N’fis watershed, Western High Atlas, Morocco (1984–2022)',
  'pub year': '2024',
  'citations': '0',
  'issn': '18669298',
  'DOI': '10.1007/s12518-024-00549-8',
  'document type': 'Article',
  'source type': 'Journal',
  'abstract': 'The examination of changes in land use and land cover (LULC) holds a pivotal role in advancing our comprehension of underlying processes and mechanisms. The advancement of sophisticated earth observation programs has opened unprecedented opportunities to meticulously observe geographical areas, courtesy of the vast array of satellite imagery available across time. However, effectively analyzing this wealth of data to process LULC information remains a significant challenge within remote sensing. Recent times have witnessed the introduction of diverse techniques for scrutinizing satellite images, encompassing remote sensing technologies and machine/deep learning (M/DL) methods. This research endeavo

In [39]:
df2["articles"].isnull().sum()

0

**Résultat attendu : Chaque ligne représente un article avec ses informations détaillées.**

- title
- pub_year
- citations
- issn
- DOI
- document type
- source type
- abstract
- authors
- author_keywords
- Nom de la revue
- H-index
- Editeur
- journal_issn
- index
- Portee thematique
- Quartile.quartile_value
- Score SJR
- Impact Factor.impact_factor_value


In [133]:
# Assuming df2 is the DataFrame containing the 'articles' column
data = df2["articles"]

processed_data = []

# Iterate over each list in the 'articles' column
for article_list in data:
    if isinstance(article_list, list):  # Ensure the entry is a list
        for article in article_list:  # Iterate over each dictionary (article)
            if isinstance(article, dict):  # Ensure the element is a dictionary
                row = {
                    "title": article.get("title"),
                    "pub_year": article.get("pub year"),
                    "citations": article.get("citations"),
                    "issn": article.get("issn"),
                    "DOI": article.get("DOI"),
                    "document_type": article.get("document type"),
                    "source_type": article.get("source type"),
                    "abstract": article.get("abstract"),
                    "authors": ", ".join(article.get("authors", [])),
                    # Ensure 'author keywords' is a list or empty list before joining
                    "author_keywords": ", ".join(article.get("author keywords", [])) if isinstance(article.get("author keywords", []), list) else ""
                }

                # Check if 'journal info' exists and is not None before processing
                journal_info = article.get("journal info")
                if journal_info:
                    row.update({
                        "journal_name": journal_info.get("name"),
                        "journal_publisher": journal_info.get("publisher"),
                        "journal_issn": journal_info.get("issn"),
                        "journal_scope": journal_info.get("scope"),
                        "journal_quartile": journal_info.get("quartile"),
                    })
                
                processed_data.append(row)
            else:
                print(f"Unexpected article type: {type(article)}, article: {article}")
    else:
        print(f"Unexpected entry type in data: {type(article_list)}, entry: {article_list}")

# Convert the processed data to a DataFrame for easier analysis
processed_df = pd.DataFrame(processed_data)

In [134]:
processed_df.head(2)

Unnamed: 0,title,pub_year,citations,issn,DOI,document_type,source_type,abstract,authors,author_keywords,journal_name,journal_publisher,journal_issn,journal_scope,journal_quartile
0,Characterizing land use-land cover changes in ...,2024,0,18669298,10.1007/s12518-024-00549-8,Article,Journal,The examination of changes in land use and lan...,"Salhi, Wiam, Heddoun, Ouissal, Honnit, Bouchra...","Deep learning, GIS, Land use-land cover, Machi...",,,18669298,,
1,A review & analysis of current IoT maturity & ...,2023,6,24682276,10.1016/j.sciaf.2023.e01748,Article•,Journal,Internet of Things (IoT) environments are char...,"Benotmane, Meryem, Elhari, Kaoutar, Kabbaj, Adil","Digital transformation, Industry 4.0, IoT matu...",,,24682276,,


- renommer les colonnes

In [135]:
processed_df.columns = [col + "_scopus" for col in processed_df.columns]

In [136]:
processed_df.isnull().sum()

title_scopus                   0
pub_year_scopus               30
citations_scopus               0
issn_scopus                  335
DOI_scopus                   650
document_type_scopus           0
source_type_scopus             0
abstract_scopus              224
authors_scopus                 0
author_keywords_scopus         0
journal_name_scopus         5801
journal_publisher_scopus    5801
journal_issn_scopus          463
journal_scope_scopus        5801
journal_quartile_scopus     5801
dtype: int64

In [137]:
print(processed_df.shape, processed_df.duplicated().sum())
processed_df = processed_df.drop_duplicates()

(5801, 15) 905


In [138]:
processed_df.duplicated().sum()

0

In [139]:
processed_df.isna().sum()

title_scopus                   0
pub_year_scopus               30
citations_scopus               0
issn_scopus                  294
DOI_scopus                   566
document_type_scopus           0
source_type_scopus             0
abstract_scopus              185
authors_scopus                 0
author_keywords_scopus         0
journal_name_scopus         4896
journal_publisher_scopus    4896
journal_issn_scopus          411
journal_scope_scopus        4896
journal_quartile_scopus     4896
dtype: int64

In [140]:
merged_df_gosch.isna().sum()


Titre de l'article_gosch        5
Auteurs_gosch                   0
Année de publication_gosch     19
Titre de source_gosch           0
Nombre de citations_gosch     136
Résumé_gosch                   57
DOI_gosch                       5
Mots-clés_gosch               836
Type de document_gosch          0
ID de l'Auteur_gosch            0
ISSN_gosch                      1
Nom_gosch                       1
Editeur_gosch                  17
Index_gosch                     1
H-index_gosch                   1
Quartile_gosch                  1
SJR_gosch                       1
Impact factor_gosch             1
Portee thematique_gosch         1
dtype: int64

**Explications des mappages :**

- title_scopus ↔ Titre de l'article_gosch
- pub_year_scopus ↔ Année de publication_gosch
- citations_scopus ↔ Nombre de citations_gosch
- issn_scopus ↔ ISSN_gosch
- DOI_scopus ↔ DOI_gosch
- abstract_scopus ↔ Résumé_gosch
- journal_name_scopus ↔ Nom_gosch
- journal_publisher_scopus ↔ Editeur_gosch
- journal_issn_scopus ↔ ISSN_gosch
- journal_scope_scopus ↔ Portee thematique_gosch
- journal_quartile_scopus ↔ Quartile_gosch

In [141]:
# Colonnes correspondantes à synchroniser entre les deux DataFrames
columns_to_sync = {
    "title_scopus" : "Titre de l'article_gosch",
    "pub_year_scopus" : "Année de publication_gosch",
    "citations_scopus" : "Nombre de citations_gosch",
    "issn_scopus" : "ISSN_gosch",
    "DOI_scopus" : "DOI_gosch",
    "abstract_scopus" : "Résumé_gosch",
    "journal_name_scopus" : "Nom_gosch",
    "journal_publisher_scopus" : "Editeur_gosch",
    "journal_issn_scopus" : "ISSN_gosch",
    "journal_scope_scopus" : "Portee thematique_gosch",
    "journal_quartile_scopus" : "Quartile_gosch"
}

In [142]:
# Effectuer une jointure sur la colonne ISSN
merged_result = processed_df.merge(
    merged_df_gosch,
    left_on="issn_scopus",
    right_on="ISSN_gosch",
    how="outer"
)

In [143]:
# Synchroniser les colonnes
for processed_col, gosch_col in columns_to_sync.items():
    merged_result[processed_col] = merged_result[processed_col].combine_first(merged_result[gosch_col])
    merged_result[gosch_col] = merged_result[gosch_col].combine_first(merged_result[processed_col])

In [145]:
merged_result.head()

Unnamed: 0,title_scopus,pub_year_scopus,citations_scopus,issn_scopus,DOI_scopus,document_type_scopus,source_type_scopus,abstract_scopus,authors_scopus,author_keywords_scopus,...,ID de l'Auteur_gosch,ISSN_gosch,Nom_gosch,Editeur_gosch,Index_gosch,H-index_gosch,Quartile_gosch,SJR_gosch,Impact factor_gosch,Portee thematique_gosch
0,Numerical result on nonlinear filtering for co...,1995.0,2.0,-,https://scholar.google.com/scholar?cluster=174...,,,,,,...,DVdULsAAAAAJ,-,Proceedings - 2nd Asia International Conferenc...,,-,17.0,0.000,119.0,184.0,Join the conversation about this journal
1,A panoramic survey of natural language process...,2021.0,76.0,00010782,10.1145/3447735,Review•,Journal,"Natural language processing (NLP), called comp...","Darwish, Kareem, Habash, Nizar, Abbas, Mourad,...",,...,,00010782,,,,,,,,
2,Recovering deterministic behavior from experim...,1997.0,52.0,"00011541, 15475905",https://aiche.onlinelibrary.wiley.com/doi/abs/...,,,The velocity field in a standard mixing reacto...,,,...,il6x-fUAAAAJ,"00011541, 15475905",AICHE Journal,American Institute of Chemical Engineers,1955-2023,188.0,Q2,3.739,3.847,The AIChE Journal is the premier research mont...
3,Résultats du premier essai randomisé de phase ...,2016.0,1.0,00014079,https://www.sciencedirect.com/science/article/...,,,Introduction\nLe suivi des symptômes des patie...,,,...,il6x-fUAAAAJ,00014079,Bulletin de l'Academie Nationale de Medecine,Elsevier Masson s.r.l.,"1947-1949, 1951-2023",26.0,Q4,0.202,0.207,Rédigé par des spécialistes à l'intention d'un...
4,Advances in Intelligent System and Smart Techn...,2024.0,,"00014826, 15587967",https://books.google.com/books?hl=en&lr=&id=5W...,,,This book is a collection of high-quality peer...,,,...,mrQFmUkAAAAJ,"00014826, 15587967",Accounting Review,American Accounting Association,1996-2023,192.0,Q1,4.668,5.826,According to the policies set by the Publicati...


In [146]:
# Diviser à nouveau en deux DataFrames avec les colonnes mises à jour
updated_processed_df = merged_result[[col for col in processed_df.columns]]
updated_merged_df_gosch = merged_result[[col for col in merged_df_gosch.columns]]

In [148]:
updated_processed_df.isnull().sum()

title_scopus                   4
pub_year_scopus               47
citations_scopus             124
issn_scopus                  294
DOI_scopus                   451
document_type_scopus         732
source_type_scopus           732
abstract_scopus              218
authors_scopus               732
author_keywords_scopus       732
journal_name_scopus         4128
journal_publisher_scopus    4145
journal_issn_scopus          294
journal_scope_scopus        4128
journal_quartile_scopus     4128
dtype: int64

In [149]:
processed_df["title_scopus"]
title_scopus, document_type_scopus

title_scopus                   0
pub_year_scopus               30
citations_scopus               0
issn_scopus                  294
DOI_scopus                   566
document_type_scopus           0
source_type_scopus             0
abstract_scopus              185
authors_scopus                 0
author_keywords_scopus         0
journal_name_scopus         4896
journal_publisher_scopus    4896
journal_issn_scopus          411
journal_scope_scopus        4896
journal_quartile_scopus     4896
dtype: int64

Fin.