In [1]:
import pandas as pd
from ast import literal_eval
import texthero as hero
import nltk
from nltk import bigrams
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from wordcloud import WordCloud
import itertools
import networkx as nx
from networkx.readwrite import json_graph
import pylab
import matplotlib.pyplot as plt
from pyvis.network import Network
import plotly.express as px
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import os
import json

*Ce notebook suit le plan du notebook Quarto final en donnant le code des fonctions et des opérations réalisées pour chaque étape.*

# Constitution du corpus

## Scopus

In [32]:
# Load the raw Scopus export.
scopus_data = pd.read_csv("data/0_exported_raw_data/scopus-subset-2023-05-26.csv", sep=",", encoding="utf-8")
# Step: keep only publications with a DOI (otherwise too many duplicates), one row per DOI
scopus_data = scopus_data[scopus_data.DOI.notna()].drop_duplicates(subset=['DOI'])
# Step: remove rows with an empty title.
# The result must be assigned back to `scopus_data`; a misspelled target name
# previously discarded this filter.
scopus_data = scopus_data.loc[scopus_data['Title'].notna()]
# Step: drop columns
scopus_data = scopus_data.drop(columns=['Author full names', 'Author(s) ID', 'Link'])
# Step: rename multiple columns
scopus_data = scopus_data.rename(columns={'Year': 'publicationYear', 'Source title': 'sourceTitle', 'Document Type': 'documentType'})
# Step: keep the corpus columns, in the corpus order
scopus_data = scopus_data[['Source', 'DOI', 'Title', 'publicationYear', 'documentType', 'Authors', 'Publisher', 'sourceTitle', 'Abstract']]
# Step: save
scopus_data.to_csv("data/1_primary_corpus/scopus_data.csv", index=False, encoding="utf-8")

## Istex

In [11]:
# Starting with the downloaded folder of json files from API
def get_all_json_files(directory):
    """Return the paths of every ``.json`` file found under ``directory``.

    The tree is walked recursively with ``os.walk``; each returned path is the
    containing folder joined with the file name. A directory that yields no
    entries produces an empty list.
    """
    return [
        os.path.join(folder, name)
        for folder, _subfolders, names in os.walk(directory)
        for name in names
        if name.endswith(".json")
    ]

# Provide the root directory containing the subfolders and JSON files
root_directory = "data/0_exported_raw_data/istex-subset-2023-05-30"
all_json_files = get_all_json_files(root_directory)

# Column layout of the Istex table; fixing it up front keeps the frame
# well-formed (DOI column present) even when no usable record is found.
istex_columns = ["Source", "DOI", "Title", "publicationYear", "documentType",
                 "Authors", "sourceTitle", "Publisher", "Abstract"]

# Process data, keeping only documents with a DOI.
# Records are collected as plain dicts and turned into a DataFrame once,
# instead of building and concatenating one DataFrame per file.
istex_records = []
for json_file in all_json_files:
    # Paths containing 'manifest' are not document records: skip them
    if 'manifest' in json_file:
        continue
    with open(json_file, 'r', encoding='utf-8') as file:
        json_content = json.load(file)
    # No DOI (missing key or empty list): the document cannot be deduplicated, skip it
    if not json_content.get("doi"):
        continue
    result = {
        "Source": "Istex",
        "DOI": json_content["doi"][0],
        "Title": json_content["title"],
        "publicationYear": json_content["publicationDate"],
        "documentType": json_content["genre"][0],
        "Publisher": json_content["corpusName"],
    }
    # Optional fields: left empty when the record does not provide them
    if "author" in json_content:
        result["Authors"] = "; ".join([a["name"] for a in json_content["author"]])
    host = json_content.get("host") or {}
    if "title" in host:
        result["sourceTitle"] = host["title"]
    if "abstract" in json_content:
        result["Abstract"] = json_content["abstract"]
    istex_records.append(result)
istex_data = pd.DataFrame(istex_records, columns=istex_columns)
# Step: one row per DOI (every kept record has a DOI)
istex_data = istex_data.drop_duplicates(subset=['DOI'])
# Save
istex_data.to_csv("data/1_primary_corpus/istex_data.csv", index=False, encoding="utf-8")
# Optional: write the DataFrame to an Excel sheet
#istex_data.to_excel('istex_data.xlsx')

## Dédoublonnage et corpus final

In [34]:
# Stack both sources (Scopus rows first) and cast every column to the pandas 'string' dtype
df_tmp = pd.concat([scopus_data, istex_data]).astype('string')
# Keep the first row seen for each DOI, then renumber the rows.
# NOTE(review): DOIs are compared as exact strings, so the same DOI written
# with a different letter case is not treated as a duplicate.
df_final = df_tmp.drop_duplicates(subset=['DOI']).reset_index(drop=True)
# Step: replace the literal substring 'article' with 'Article' in 'documentType'
harmonised_types = df_final["documentType"].str.replace('article', 'Article', regex=False)
df_final["documentType"] = harmonised_types
# Step (disabled): filter on the main document types
#df_final = df_final.loc[df_final['documentType'].isin(['Article', 'Review', 'Data paper', 'Book chapter', 'Book', 'Conference paper'])]
df_final.to_csv("data/1_primary_corpus/extracted_corpus.csv", index=False, encoding="utf-8")

# Analyse lexicométrique sur les titres

In [7]:
## Split a string into a list of word tokens
tokenizer = RegexpTokenizer(r'\w+')
def tokenize(x):
    """Tokenize ``x`` and keep only the tokens longer than two characters.

    ``x`` is stripped of surrounding whitespace, split by the module-level
    ``tokenizer`` (pattern ``\\w+``), and every non-empty token whose length
    exceeds 2 is returned, in order, as a list.
    """
    kept = []
    for token in tokenizer.tokenize(x.strip()):
        if token != u"" and len(token) > 2:
            kept.append(token)
    return kept

## Lemmatization groups the inflected forms of a word so they can be analysed
## as a single item. It is similar to stemming, but maps each word to a
## dictionary form (lemma) rather than a truncated stem.
lemmatizer = WordNetLemmatizer()
def lemmatize(x):
    """Return the list of lemmas for the iterable of words ``x``.

    Each word is passed to ``lemmatizer.lemmatize`` without a part-of-speech
    argument, so the library default applies.
    """
    return list(map(lemmatizer.lemmatize, x))

## List of words to string
def list_to_string(x):
    """Join the words of ``x`` into one string, separated by single spaces."""
    separator = ' '
    return separator.join(x)


def clean_text(df,col):
    """Return a copy of ``df`` with two NLP columns derived from ``df[col]``.

    Added columns:
      * ``{col}_token_list`` -- list of lemmatized tokens
        (``hero.clean`` -> ``tokenize`` -> ``lemmatize``);
      * ``{col}_cleaned`` -- the same tokens joined back into one string.

    The input frame is left untouched: intermediate results are kept in local
    Series instead of being written to (and later dropped from) ``df``, so no
    temporary column leaks into the caller's DataFrame.
    """
    normalised = df[col].pipe(hero.clean)
    token_lists = normalised.apply(tokenize).apply(lemmatize)
    return df.assign(**{
        f"{col}_token_list": token_lists,
        f"{col}_cleaned": token_lists.apply(list_to_string),
    })

## Count vectorizer -> unigram frequencies
### Shared CountVectorizer object; it is re-fitted on every call below
count = CountVectorizer()
## fit_transform builds the document-term matrix
def bag_of_word_to_freq(df,col):
    """Return ``(word, total_count)`` pairs for ``df[col]``, most frequent first.

    Rows where ``col`` is missing are ignored. The module-level ``count``
    vectorizer is fitted on the remaining texts, and the document-term matrix
    is summed over documents to get one total per vocabulary entry.
    """
    texts = df.loc[df[col].notna(), col]
    term_matrix = count.fit_transform(texts)
    totals = term_matrix.sum(axis=0)
    pairs = []
    for term, position in count.vocabulary_.items():
        pairs.append((term, totals[0, position]))
    pairs.sort(key=lambda pair: pair[1], reverse=True)
    return pairs

## Bigram frequency DataFrame
def get_bigrams(df,col):
    """Count the bigrams of the token lists stored in ``df[col]``.

    Parameters
    ----------
    df : pandas.DataFrame
    col : str
        Column whose cells are sequences of tokens (one sequence per document).

    Returns
    -------
    pandas.DataFrame
        Columns ``bigram`` (tuple of two consecutive tokens) and ``count``,
        sorted by decreasing count. Empty (but with both columns) when there
        is nothing to count.

    Bigrams are built document by document, so the last token of one document
    is never paired with the first token of the next one. Missing cells
    (``None`` / NaN) are skipped.
    """
    from collections import Counter

    pair_counts = Counter()
    for tokens in df[col]:
        # A missing cell holds no tokens; iterating over it would fail
        if tokens is None or isinstance(tokens, float):
            continue
        tokens = list(tokens)
        pair_counts.update(zip(tokens, tokens[1:]))
    return pd.DataFrame(pair_counts.most_common(), columns=['bigram', 'count'])

## Traitement NLP

In [4]:
df_final = pd.read_csv("data/1_primary_corpus/extracted_corpus.csv", sep=",", encoding="utf-8")
# Missing values must become empty strings *before* the cast to str:
# casting NaN directly yields the literal text "nan", which would then be
# tokenized and counted as a real word for every record without an abstract.
df_final = df_final.fillna("").astype(str)
# Strip the "Abstract: " / "ABSTRACT: " markers and drop everything from the
# first copyright sign onwards
df_final["Abstract"] = (df_final["Abstract"]
                        .str.replace("Abstract: ", "", regex=False)
                        .str.replace("ABSTRACT: ", "", regex=False)
                        .str.split("©")
                        .str[0]
                       )
# Add the *_token_list and *_cleaned columns for both text fields
df_final = clean_text(df_final, "Abstract")
df_final = clean_text(df_final, "Title")
df_final.to_csv("data/2_results/nlp_corpus.csv", index=False, encoding="utf-8")
df_final.to_excel('data/2_results/nlp_corpus.xlsx')

## n-grammes

In [5]:
# Reload the NLP corpus saved to data/2_results/nlp_corpus.csv.
# NOTE: the corpus was written with to_csv, so list-valued columns
# (the *_token_list columns) come back as their string representation.
df = pd.read_csv("data/2_results/nlp_corpus.csv", sep=",", encoding="utf-8")

In [9]:
# Unigram and bigram frequency tables for each text field, keyed by
# "df_<field>_unigrams" / "df_<field>_bigrams"
meta_df = {}
for x in ["Title", "Abstract"]:
    token_col = f"{x}_token_list"
    # `df` was loaded with pd.read_csv, and to_csv stored every token list as
    # its string representation. Parse it back into a real list, otherwise the
    # bigrams are computed over single characters. Cells that already hold a
    # list (frame not round-tripped through CSV) are left as they are, which
    # also makes this cell safe to re-run.
    tokens_df = df.assign(**{
        token_col: df[token_col].apply(lambda v: literal_eval(v) if isinstance(v, str) else v)
    })
    # Unigrams
    unigram_table = pd.DataFrame(bag_of_word_to_freq(df, f'{x}_cleaned'), columns=['word', 'count'])
    # Bigrams
    bigram_table = get_bigrams(tokens_df, token_col)
    bigram_table['bigram_to_string'] = bigram_table['bigram'].apply(lambda y: ','.join(y))
    meta_df[f"df_{x}_unigrams"] = unigram_table
    meta_df[f"df_{x}_bigrams"] = bigram_table
    # Step: save
    unigram_table.to_csv(f"data/2_results/{x}_unigrams.csv", index=False, encoding="utf-8")
    unigram_table.to_excel(f'data/2_results/{x}_unigrams.xlsx')
    bigram_table.to_csv(f"data/2_results/{x}_bigrams.csv", index=False, encoding="utf-8")
    bigram_table.to_excel(f'data/2_results/{x}_bigrams.xlsx')