# Organizing PoP PopCites csv files 
In PoP export as CSV the first N most cited papers (with h-markers) in Revue Finance

For each of these papers, gather "Citing works" data (keep top cited papers with h-markers)

Export these (N files) as CSV: RF papers as PopCites and then citing works for each RF paper as PopCites1..N

### Importing data in csv from PoP

In [1]:
import pandas as pd

# Load the base papers data
# base_papers = pd.read_csv(r"C:\Users\GODLEWSKI\OneDrive - unistra.fr\EDITORIAL & REFEREEING\Revue FINANCE\ranking\Pop\PopCites.csv") # base file with 9 most cited papers
base_papers = pd.read_csv(r"C:\Users\GODLEWSKI\OneDrive - unistra.fr\EDITORIAL & REFEREEING\Revue FINANCE\ranking\PoP\PopCites_.csv") # larger file with 21 most cited papers

# Assign unique IDs to the base papers
base_papers['Unique ID'] = range(1, len(base_papers) + 1)

# Initialize an empty DataFrame for the edges
edges = pd.DataFrame(columns=['Citing Paper Unique ID', 'Cited Paper Unique ID'])

# Loop through each of the citing papers files
for i in range(1, 22): # 10 for base file or 22 for larger file
    # Load the citing papers data
    citing_papers = pd.read_csv(fr"C:\Users\GODLEWSKI\OneDrive - unistra.fr\EDITORIAL & REFEREEING\Revue FINANCE\ranking\PoP\PoPCites{i}_updated.csv", encoding='Windows-1252', delimiter=';')
    
    # Sort the citing papers by their citation count and select the top 3
    citing_papers = citing_papers.sort_values(by='Cites', ascending=False).head(3)

    # Assign unique IDs to the citing papers, continuing the numbering from the base papers
    citing_papers['Unique ID'] = range(len(base_papers) + 1, len(base_papers) + 1 + len(citing_papers))
    
    # Concatenate the citing papers to the base papers DataFrame
    base_papers = pd.concat([base_papers, citing_papers], ignore_index=True)
    
    # Create edges for the citing relationships
    edges = pd.concat([edges, pd.DataFrame({
        'Citing Paper Unique ID': citing_papers['Unique ID'],
        'Cited Paper Unique ID': [i] * len(citing_papers)  # Assuming the base paper has the same ID as the file number
    })], ignore_index=True)

# Save the combined papers data and the edges to new CSV files
base_papers.to_csv('UnifiedPapers.csv', index=False)
edges.to_csv('Edges.csv', index=False)


## Combine all files with citing papers into a single file and generate a file with only top 3 papers

In [2]:
import pandas as pd

# List to store all the DataFrames
dfs = []

# Loop through the 21 CSV files
for i in range(1, 22):
    # Read the CSV file into a DataFrame
    df = pd.read_csv(fr"C:\Users\GODLEWSKI\OneDrive - unistra.fr\EDITORIAL & REFEREEING\Revue FINANCE\ranking\PoP\PoPCites{i}_updated.csv", encoding='Windows-1252', delimiter=';')
    # Append the DataFrame to the list
    dfs.append(df)

# Concatenate all the DataFrames into one
all_citing_papers = pd.concat(dfs, ignore_index=True)

# Write the DataFrame to a CSV file
all_citing_papers.to_csv('AllCitingPapers.csv', index=False)

# Convert the 'Cites' column to integers
all_citing_papers['Cites'] = all_citing_papers['Cites'].astype(int)

# Define a function that sorts a DataFrame and returns the top 3 rows
def top_3(df):
    return df.sort_values('Cites', ascending=False).head(3)

# Group the DataFrame by the 'Source' column and apply the top_3 function
top_3_citing_papers = all_citing_papers.groupby('Source').apply(top_3)

# Reset the index
top_3_citing_papers.reset_index(drop=True, inplace=True)

# Write the DataFrame to a CSV file
top_3_citing_papers.to_csv('Top3CitingPapersPerSource.csv', index=False)

  top_3_citing_papers = all_citing_papers.groupby('Source').apply(top_3)


## Generate 2 graphs: 1 for most cited papers in RF & 1 for most citing papers

In [3]:
import csv
import networkx as nx
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from plotly.offline import plot

def standardize_authors(name):
    # Remove asterisks and standardize the format
    name = name.replace('*', '').strip()  # Strip to remove any leading/trailing whitespace
    name = name.replace('1', '')
    name = name.replace('2', '')
    name = name.replace('3', '')
    # split by spaces
    parts = name.split()
    # Remove middle initials and suffixes if needed
    # Reorder to 'Lastname, Firstname' if needed
    return ' '.join(parts)

def read_papers_from_csv(file_path):
    with open(file_path, mode='r', newline='', encoding='utf-8') as file:
        reader = csv.DictReader(file)
        papers = []
        for row in reader:
            paper = row.copy()
            if 'Authors' in paper:
                paper['Authors'] = standardize_authors(paper['Authors'])
            papers.append(paper)
        return papers

def format_authors(authors_string):
    authors_list = authors_string.split(", ")
    if len(authors_list) > 2:
        return f"{authors_list[0]} et al."
    elif len(authors_list) == 2:
        return f"{authors_list[0]} & {authors_list[1]}"
    else:
        return authors_list[0]

def networkx_to_plotly(G, title, label_type, footnote1, footnote2):
    # Generate positions for the nodes
    pos = nx.spring_layout(G)

    # Add 'pos' attribute to the nodes
    for node in G.nodes():
        G.nodes[node]['pos'] = pos[node]

    # Create edge trace
    edge_trace = go.Scatter(
        x=[],
        y=[],
        line=dict(width=0.5, color='#888'),
        hoverinfo='none',
        mode='lines'
    )

    for edge in G.edges():
        x0, y0 = G.nodes[edge[0]]['pos']
        x1, y1 = G.nodes[edge[1]]['pos']
        edge_trace['x'] += (x0, x1, None)
        edge_trace['y'] += (y0, y1, None)

    # Create node trace
    node_trace = go.Scatter(
        x=[],
        y=[],
        text=[],
        mode='markers',
        hoverinfo='text',
        marker=dict(
            showscale=True,
            colorscale='OrRd',
            color=[],
            size=[],
            sizemode='diameter',
            sizeref=3*max([G.nodes[node]['size'] for node in G.nodes()])/(30.**2),
            sizemin=4,
            colorbar=dict(
                thickness=15,
                title='# Citations',
                xanchor='left',
                titleside='right'
            ),
            line=dict(width=2)
        )
    )

    x = []
    y = []
    text = []
    marker_size = []
    marker_color = []

    for node in G.nodes():
        x_, y_ = G.nodes[node]['pos']
        x.append(x_)
        y.append(y_)

        if 'label' in G.nodes[node]:
            label = G.nodes[node]['label']
            text.append(label)
        elif 'Title' in G.nodes[node] and 'Authors' in G.nodes[node] and 'Year' in G.nodes[node]:
            title = G.nodes[node]['Title']
            authors = G.nodes[node]['Authors']
            year = G.nodes[node]['Year']
            formatted_authors = format_authors(authors)
            text.append(f"<b>{title}</b>{formatted_authors} ({year})")
        elif 'Source' in G.nodes[node] and label_type == 'journal':
            text.append(G.nodes[node]['Source'])
        else:
            text.append(node)

        marker_size.append(G.nodes[node]['size'])
        marker_color.append(G.nodes[node]['size'])  # Set color based on number of citations

    node_trace['x'] = tuple(x)
    node_trace['y'] = tuple(y)
    node_trace['text'] = tuple(text)
    node_trace['hovertemplate'] = '%{text}<extra></extra>'
    node_trace['marker']['size'] = tuple(marker_size)
    node_trace['marker']['color'] = tuple(marker_color)

    # Create a figure
    fig = go.Figure(data=[edge_trace, node_trace],
                    layout=go.Layout(
                        title=title,
                        titlefont=dict(size=16),
                        showlegend=False,
                        hovermode='closest',
                        autosize=True,  # Automatically adjust the size of the graph
                        margin=dict(b=20, l=5, r=5, t=40),
                        xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
                        yaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
                        annotations=[
                            dict(
                                x=0,
                                y=0,
                                showarrow=False,
                                text=footnote2,
                                xref="paper",
                                yref="paper",
                                xanchor='left'
                            ),
                            dict(
                                x=0,
                                y=1,
                                showarrow=False,
                                text=footnote1,
                                xref="paper",
                                yref="paper",
                                xanchor='left'
                            )
                        ]
                    ))

    # Show the figure
    # fig.show()
    return fig

# Read the most cited papers from the CSV file
most_cited_papers = read_papers_from_csv(r"C:\Users\GODLEWSKI\OneDrive - unistra.fr\EDITORIAL & REFEREEING\Revue FINANCE\ranking\PoP\PopCites_.csv")

# Create a graph for the most cited papers
G1 = nx.Graph()
for paper in most_cited_papers:
    if '\ufeffCites' in paper and paper['\ufeffCites'].isdigit():
        if 'Authors' in paper and 'Year' in paper:
            G1.add_node(paper['Title'], size=int(paper['\ufeffCites']), authors=paper['Authors'], year=paper['Year'])
            G1.nodes[paper['Title']]['label'] = f"{paper['Title']}\n{paper['Authors']}\n({paper['Year']})"
        else:
            print(f"Skipping paper due to missing 'Authors' or 'Year': {paper}")
    else:
        print(f"Skipping paper: {paper}")

# Draw the graph
# if len(G1) > 0:
#     plt.figure(figsize=(10, 10))

    # Create custom labels
#     labels = {node: f"{node}\n{data['authors']}\n{data['year']}" for node, data in G1.nodes(data=True)}

#     nx.draw(G1, with_labels=True, labels=labels, node_size=[attr['size'] for _, attr in G1.nodes(data=True)])
#     plt.show()
# else:
#     print("No nodes to draw in graph G1")

# Read the citing papers from the CSV file
#all_citing_papers = read_papers_from_csv('AllCitingPapers.csv') #all citing papers
all_citing_papers = read_papers_from_csv('Top3CitingPapersPerSource.csv') #top 3 papers only

# Create a graph for the citing papers
G2 = nx.Graph()
for paper in all_citing_papers:
    if 'Cites' in paper and paper['Cites'].isdigit() and 'Source' in paper:
        G2.add_node(paper['Source'], size=int(paper['Cites']))
    else:
        print(f"Skipping paper: {paper}")

# Draw the graph
# if len(G2) > 0:
#     plt.figure(figsize=(10, 10))
#     nx.draw(G2, with_labels=True, node_size=[attr['size'] for _, attr in G2.nodes(data=True)])
#     plt.show()
# else:
#     print("No nodes to draw in graph G2")

# Convert NetworkX graphs to Plotly and display them
fig1 = networkx_to_plotly(G1,
                           'Network graph of citation data for most cited papers (2008-2023) in <a href="https://www.cairn-int.info/journal-finance.htm">Revue Finance</a> [ISSN 0752-6180]',
                           'author',
                           'To interact with the graph, you can select a portion of the screen to zoom in, hover over nodes to see details, and drag the graph to move around. Use the toolbar at the top right corner to zoom in/out, pan, or reset the view.',
                           'Generated by <a href=\"https://cgodlewski.github.io\">C. Godlewski</a>. [Based on Google Scholar data from Harzing PoP. 21 most cited papers (with h-markers) from Revue Finance (2008-2023).]')
fig2 = networkx_to_plotly(G2,
                           'Network graph of citation data for top citing journals of most cited papers (2008-2023) in <a href="https://www.cairn-int.info/journal-finance.htm">Revue Finance</a> [ISSN 0752-6180]',
                           'journal',
                           'To interact with the graph, you can select a portion of the screen to zoom in, hover over nodes to see details, and drag the graph to move around. Use the toolbar at the top right corner to zoom in/out, pan, or reset the view.',
                           'Generated by <a href=\"https://cgodlewski.github.io\">C. Godlewski</a>. [Based on Google Scholar data from Harzing PoP. Top 3 most cited citing papers (with h-markers) for 21 most cited papers (2008-2023) in <a href="https://www.cairn-int.info/journal-finance.htm">Revue Finance</a> [ISSN 0752-6180].]')

# Plot the figures
plot(fig1, filename='RF_network1.html')
plot(fig2, filename='RF_network2.html')


'RF_network2.html'

In [None]:
pip install networkx

In [None]:
pip install plotly

### Cleaning data and generating Interactive visualisation 
(with paper title, authors, year for RF papers and journal names & citations score for citing works)

In [4]:
import pandas as pd
import networkx as nx
import plotly.graph_objs as go
from plotly.offline import plot

# Load the data
base_papers = pd.read_csv('UnifiedPapers.csv')
edges = pd.read_csv('Edges.csv')

# Function to standardize author names
def standardize_authors(name):
    # Remove asterisks and standardize the format
    name = name.replace('*', '').strip()  # Strip to remove any leading/trailing whitespace
    name = name.replace('1', '')
    name = name.replace('2', '')
    name = name.replace('3', '')
    # split by spaces
    parts = name.split()
    # Remove middle initials and suffixes if needed
    # Reorder to 'Lastname, Firstname' if needed
    return ' '.join(parts)

# Apply the standardization function to the Authors column
base_papers['Authors'] = base_papers['Authors'].apply(standardize_authors)

def standardize_journal(name):
    # Check if 'name' is a string instance to avoid AttributeError
    if isinstance(name, str) and not pd.isna(name):
        # replace '&' with 'and', remove punctuation, etc.
        name = name.replace('&', 'and ').replace('.', '').replace(',', '').strip()
        # Use consistent abbreviations
        # Example: 'journal of' -> 'j'
        # name = name.replace('journal of', 'j')
        return name
    else:
        # If 'name' is not a string (e.g., NaN), return it as is
        return None

# Apply the standardization function to the Source column
base_papers['Source'] = base_papers['Source'].apply(standardize_journal)
base_papers = base_papers.dropna(subset=['Source'])

# Filter out citing papers with an empty Source
edges = edges[edges['Citing Paper Unique ID'].isin(base_papers[base_papers['Source'] != '']['Unique ID'])]

# Initialize a graph
G = nx.Graph()

# Function to format the authors string
def format_authors(authors_list):
    if len(authors_list) > 2:
        return f"{authors_list[0]} et al."
    else:
        return ', '.join(authors_list)

# Add nodes with labels, sizes, and additional details for base papers
for _, row in base_papers.iterrows():
    authors = row['Authors'].split(', ')  # Assuming authors are separated by semicolons
    authors_formatted = format_authors(authors)
    label = f"<b>{row['Title']}</b>\n{authors_formatted} ({row['Year']})"
    G.add_node(row['Unique ID'], label=label, size=row['Cites'], type='cited')

# Add edges and labels for citing papers
for _, edge in edges.iterrows():
    citing_paper = base_papers.loc[base_papers['Unique ID'] == edge['Citing Paper Unique ID']].iloc[0]
    label = f"{citing_paper['Source']} ({citing_paper['Cites']} citations)"
    G.add_node(edge['Citing Paper Unique ID'], label=label, size=citing_paper['Cites'], type='citing')
    G.add_edge(edge['Citing Paper Unique ID'], edge['Cited Paper Unique ID'])

# Generate positions for each node using a layout algorithm
pos = nx.spring_layout(G, seed=24)

# Create edge trace
edge_trace = go.Scatter(
    x=[],
    y=[],
    line=dict(width=0.5, color='#888'),
    hoverinfo='none',
    mode='lines')

for edge in G.edges():
    x0, y0 = pos[edge[0]]
    x1, y1 = pos[edge[1]]
    edge_trace['x'] += (x0, x1, None)
    edge_trace['y'] += (y0, y1, None)

# Create node trace
node_trace = go.Scatter(
    x=[],
    y=[],
    text=[],
    mode='markers',
    hoverinfo='text',
    marker=dict(
        showscale=False,  # Disable / Enable the color scale legend
        colorscale='OrRd',  # Color scale for the nodes
        size=[],  # Sizes of the nodes
        color=[],  # Colors of the nodes
        opacity=[],  # Adjusted opacity for better visibility
        line=dict(width=2)))

for node in G.nodes():
    x, y = pos[node]
    node_trace['x'] += (x,)
    node_trace['y'] += (y,)
    size = G.nodes[node]['size']
    if G.nodes[node]['type'] == 'citing':
        size *= 1  # Adjust the size of citing nodes 
        opacity = 0.2  # More transparent for citing nodes
        color = 'orange'
    else:
        opacity = 0.8  # Less transparent for cited nodes
        color = 'firebrick'
    node_trace['marker']['size'] += (size,)
    node_trace['marker']['color'] += (color,)
    node_trace['marker']['opacity'] += (opacity,)
    node_trace['text'] += (G.nodes[node]['label'],)

# Create the figure
fig = go.Figure(data=[edge_trace, node_trace],
                layout=go.Layout(
                    title='Network graph of citation data for top cited papers (2008-2023) in <a href="https://www.cairn-int.info/journal-finance.htm">Revue Finance</a> [ISSN 0752-6180]',
                    titlefont_size=20,
                    showlegend=False,
                    hovermode='closest',
                    autosize=True,  # Automatically adjust the size of the graph
                    margin=dict(b=20, l=5, r=5, t=40),
                    annotations=[
                        #dict(
                        #    text='This graph represents the citation network of (selected) papers published in the Revue Finance (RF). '
                        #         'Each circle (node) in the graph represents a paper. '
                        #         'The firebrick circles represent papers that were published in RF. '
                        #         '(the titles of the papers are in bold).<br>'
                        #         'The size of each firebrick circle corresponds to the number of times that paper has been cited by other papers. '
                        #         'The orange circles represent papers that have cited the papers from RF. '
                        #        '(only the names of the journals where they were published are indicated).<br>'
                        #         'The size of each orange circle corresponds to the number of times that citing paper itself has been cited.<br>'
                        #         'In simpler terms, the bigger the firebrick circle, the more influential that paper has been within the RF. '
                        #         'The bigger the orange circle, the more influential that paper has been outside of RF.'
                        #         ,
                        #    showarrow=False,
                        #    xref='paper', yref='paper',
                        #    x=0.001, y=0.97,  # Position of the annotation box
                        #    font=dict(
                        #        size=11,  # Font size
                        #        color='darkblue',  # Font color
                        #        family='Courier New'  # Font family
                        #    ),
                        #    align='left',
                        #    bgcolor='white',
                        #    bordercolor='black',
                         #   borderwidth=1,
                        #    ),
                        dict(
                            text="Generated by <a href=\"https://cgodlewski.github.io\">C. Godlewski</a>. [Based on Google Scholar data from Harzing PoP. 21 most cited papers (with h-markers) from Revue Finance (2008-2023) and top 3 most cited citing papers (with h-markers).]",
                            showarrow=False,
                            xref="paper", yref="paper",
                            x=0.005, y=-0.002,
                             font=dict(
                                size=10,  # Font size
                                color='black',  # Font color
                                family='Arial'  # Font family
                            )),
                        dict(
                            text="To interact with the graph, you can select a portion of the screen to zoom in, hover over nodes to see details, and drag the graph to move around. Use the toolbar at the top right corner to zoom in/out, pan, or reset the view.",
                            showarrow=False,
                            xref="paper", yref="paper",
                            x=0.001, y=0.99,
                            font=dict(
                                size=12,  # Font size
                                #color='firebrick',  # Font color
                                family='Courier New'  # Font family
                            )),
                        dict(
                            text="The firebrick circles represent the citations for papers that were published in Revue Finance. The orange circles represent the citations for papers that have cited the papers from Revue Finance.",
                            showarrow=False,
                            xref="paper", yref="paper",
                            x=0.001, y=0.97,
                            font=dict(
                                size=12,  # Font size
                                #color='firebrick',  # Font color
                                family='Courier New'  # Font family
                            ))
                    ],
                    xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
                    yaxis=dict(showgrid=False, zeroline=False, showticklabels=False)))

# Plot the figure
#plot(fig, filename='RF_citations_network.html') #file with 21 most cited papers in RF
plot(fig, filename='RF_citations3_network.html') #file with 21 most cited papers in RF by top 3 most cited citing papers

'RF_citations3_network.html'