In [1]:
import pandas as pd
from numpy import nan
import numpy as np
import re

In [2]:
def openData():
    """
    Opens the csv with the data and cleans the dataset and returns a dataframe.
    """
    # Sort data in descending order by page visibility. This ensures duplicates without page followers are dropped.
    data = pd.read_csv("data/output_facebook_data.csv").sort_values('visibility', ascending=False)
    data = data.drop_duplicates(['altmetric_id'], keep='first')
    
    data['fb_wall_urls'] = data['fb_wall_urls'].apply(lambda x: eval(x))
    
    def func(inp):
        if inp is nan:
            return inp
        try:
            res = eval(inp)
        except:
            res = [inp]
        return res
    
    data['subjects'] = data['subjects'].map(func)
    data['scopus_subjects'] = data['scopus_subjects'].map(func)
    data['publisher_subjects'] = data['publisher_subjects'].map(func)
    
    data['fb_wall_count'] = data['fb_wall_count'].astype(int)
    
    return data


def makePageData(df):
    """
    Takes the dataframe of articles and makes a new dataset of pages.
    """
    df['fb_wall_count'] = df['fb_wall_count'].astype(int)
    res_dict = dict()
    
    for i in df.index.values:
        for j in range(len(df.loc[i, 'fb_wall_urls'])):
            try:
                page_id = re.search(r'fbid=(.+?)&', df['fb_wall_urls'][i][j]['link']).group(1)
            except:
                page_id = re.search(r'/(.+?)/', df['fb_wall_urls'][i][j]['link']).group(1)
            if page_id in res_dict:
                res_dict[page_id]['num_articles'] += 1
                res_dict[page_id]['articles'].append(df.loc[i, 'title'])
            else:
                res_dict[page_id] = {'page_id': page_id, 
                                     'num_articles': 1, 
                                     'articles': [df.loc[i, 'title']]}
    
    return pd.DataFrame(res_dict).transpose()


def allSubjects(df):
    """
    Makes a new column that is a list of all subjects associated with an article.
    """
    res = []
    
    for i in df.index.values:
        subj = []
        
        if df.loc[i, 'subjects'] is not nan:
            subj += df.loc[i, 'subjects']
        if df.loc[i, 'scopus_subjects'] is not nan:
            subj += df.loc[i, 'scopus_subjects']
        if df.loc[i, 'publisher_subjects'] is not nan:
            subj += df.loc[i, 'publisher_subjects']
        
        res.append(subj)
    return res
        

In [3]:
data = openData()

In [4]:
df = data[data['fb_wall_count'] > 5].sort_values(by='fb_wall_count', ascending=False)

In [5]:
df['all_subjects'] = allSubjects(df)

In [6]:
def makeNodes(data):
    """
    Takes the dataframe of all articles with their subjects and returns a list of subjects
    """
    res = dict()
    
    for i in data.index.values:
        
        for val in data.loc[i, 'all_subjects']:
            subj = val.lower()
            if subj in res:
                res[subj]['count'] += 1
                res[subj]['reactions'] += data.loc[i, 'total_love':'total_angry'].values
                res[subj]['shares'] += data.loc[i, 'shares']
            else:
                res[subj] = {'count': 1, 
                             'reactions': data.loc[i, 'total_love':'total_angry'].values, 
                             'shares':data.loc[i, 'shares']}
    
    nodes = pd.DataFrame()
    i = 0
    
    for subj in res:
        nodes = nodes.append({'ID': i, 
                              'Label': subj, 
                              'Count': res[subj]['count'], 
                              'shares': res[subj]['shares'],
                              'total_love': res[subj]['reactions'][0], 
                              'total_wow': res[subj]['reactions'][1],
                              'total_haha': res[subj]['reactions'][2],
                              'total_sad': res[subj]['reactions'][3],
                              'total_angry': res[subj]['reactions'][4],}, 
                             ignore_index=True)
        i+=1
        
    # Now add a column for the total number of reactions
    nodes['total_reactions'] = (nodes['total_love'] + nodes['total_wow'] + 
                                nodes['total_haha'] + nodes['total_sad'] + 
                                nodes['total_angry'])
    
    # Add columns for positive and negative reaction totals
    nodes['positive_reactions'] = nodes['total_love'] + nodes['total_wow']
    nodes['negative_reactions'] = nodes['total_sad'] + nodes['total_angry']
    
    # Add a calculated sentiment column
    nodes['sentiment'] = (nodes['positive_reactions'] - nodes['negative_reactions']) / nodes['total_reactions'] 

        
    return nodes.fillna(0)

def makeEdges(data, nodes_df):
    """
    Takes the dataframe of all articles with their subjects and returns an edge list of subjects.
    The nodes are given as a dictionary, where the key is the name and the value is the ID.
    """
    # Make nodes_df into a dictionary 
    node_dict = dict()
    node_dict_opp = dict()
    
    for i in nodes_df.index.values:
        node_dict[nodes_df.loc[i, 'Label']] = nodes_df.loc[i, 'ID']
        node_dict_opp[nodes_df.loc[i, 'ID']] = nodes_df.loc[i, 'Label']
        
    # Now make an edge list as dictionaries within dictionary
    edges = dict()
    
    for lis in data['all_subjects'].values:
        i = 0
        while i < len(lis):
            j = i+1
            
            while j < len(lis):
                source = node_dict[lis[i].lower()]
                dest   = node_dict[lis[j].lower()]
                
                if (source, dest) in edges:
                    edges[(source, dest)] += 1
                elif (dest, source) in edges:
                    edges[(dest, source)] +=1
                else:
                    edges[(source, dest)] = 1
                    
                j+=1
            i+=1
    
    # now we create the edge list as a dataframe 
    edge_list = pd.DataFrame()
    for nodes in edges:
        edge_list = edge_list.append({'Source': nodes[0], 
                                      'Target': nodes[1], 
                                      'Weight': edges[nodes], 
                                      'Source_name': node_dict_opp[nodes[0]], 
                                      'Target_name': node_dict_opp[nodes[1]], 
                                      'Type': 'Undirected'}, 
                                     ignore_index=True)

    return edge_list
                

In [23]:
def addLabelClasses(nodes):
    # We need to add class labels for the subject fields and super-class labels

    for i in nodes.index.values:
        subj_label = nodes.loc[i, 'Label']

        if ('health' in subj_label or 'medici' in subj_label or 
            'cardio' in subj_label or 'psychiatry' in subj_label or 
            'nursing' in subj_label or 'neuro' in subj_label or 
            'pharma' in subj_label or 'hospital' in subj_label or
            'ortho' in subj_label or 'clinical' in subj_label or 
            'pediatric' in subj_label or 'neur' in subj_label or 
            'autis' in subj_label or 'alzheim' in subj_label or 
            'diabet' in subj_label or 'inflamm' in subj_label or 
            'nephro' in subj_label or 'life sci' in subj_label or 
            'cancer' in subj_label or 'disord' in subj_label or 
            'neoplasm' in subj_label or 'oncol' in subj_label or 
            'therap' in subj_label or 'diseas' in subj_label or 
            'drug' in subj_label or 'sleep' in subj_label or 
            'immuno' in subj_label or 'physio' in subj_label or 
            'tumour' in subj_label or 'endocrin' in subj_label or 
            'metabol' in subj_label or 'patholog' in subj_label or 
            'brain' in subj_label or 'epidemio' in subj_label or 
            'surgery' in subj_label or 'ophthalmol' in subj_label or 
            'sports sci' in subj_label or 'gastroent' in subj_label or 
            'risk fact' in subj_label or 'obstet' in subj_label or 
            'veterin' in subj_label or 'perinat' in subj_label or 
            'allerg' in subj_label or 'geriatric' in subj_label or 
            'dyslipidaemias' in subj_label or 'rheumato' in subj_label or 
            'obesity' in subj_label or 'care' in subj_label or 
            'anesthes' in subj_label or 'radiol' in subj_label or 
            'sax' in subj_label or 'anxiety' in subj_label or 
            'diagnostic' in subj_label or 'otolaryng' in subj_label or 
            'dementia' in subj_label or 'embryology' in subj_label or 
            'derma' in subj_label or 'audio' in subj_label or 
            'viral' in subj_label):
            nodes.loc[i, 'subject_class'] = 'Medicine and Health'
            nodes.loc[i, 'subject_superclass'] = 'Applied Sciences'

        elif ('nutrition' in subj_label or 'food' in subj_label):
            nodes.loc[i, 'subject_class'] = 'Nutritional science'
            nodes.loc[i, 'subject_superclass'] = 'Applied Sciences'

        elif ('bio' in subj_label or 'zoolog' in subj_label or 
              'botany' in subj_label or 'animal' in subj_label or 
              'sequen' in subj_label or 'genet' in subj_label or 
              'ecolo' in subj_label or 'taxon' in subj_label or 
              'protein' in subj_label):
            nodes.loc[i, 'subject_class'] = 'Biology'
            nodes.loc[i, 'subject_superclass'] = 'Natural Sciences'

        elif ('chem' in subj_label):
            nodes.loc[i, 'subject_class'] = 'Chemistry'
            nodes.loc[i, 'subject_superclass'] = 'Natural Sciences'

        elif ('carbon' in subj_label or 'environ' in subj_label):
            nodes.loc[i, 'subject_class'] = 'Environmental sciences'
            nodes.loc[i, 'subject_superclass'] = 'Natural Sciences'

        elif ('behavior' in subj_label or 'psych' in subj_label or 
              'cognitive' in subj_label):
            nodes.loc[i, 'subject_class'] = 'Psychology'
            nodes.loc[i, 'subject_superclass'] = 'Social Sciences'

        elif ('archae' in subj_label):
            nodes.loc[i, 'subject_class'] = 'Archaeology'
            nodes.loc[i, 'subject_superclass'] = 'Social Sciences'

        elif ('social science' in subj_label or 'socialscience' in subj_label):
            nodes.loc[i, 'subject_class'] = 'Social Sciences'
            nodes.loc[i, 'subject_superclass'] = 'Social Sciences'

        elif ('econom' in subj_label or 'finance' in subj_label):
            nodes.loc[i, 'subject_class'] = 'Economics'
            nodes.loc[i, 'subject_superclass'] = 'Social Sciences'

        elif ('sociolog' in subj_label):
            nodes.loc[i, 'subject_class'] = 'Sociology'
            nodes.loc[i, 'subject_superclass'] = 'Social Sciences'

        elif ('planetary' in subj_label or 'cosmol' in subj_label 
              or 'astron' in subj_label):
            nodes.loc[i, 'subject_class'] = 'Astrophysics'
            nodes.loc[i, 'subject_superclass'] = 'Natural Sciences'

        elif ('atom' in subj_label or 'particle' in subj_label or 
              'physic' in subj_label or 'gravit' in subj_label):
            nodes.loc[i, 'subject_class'] = 'Physics'
            nodes.loc[i, 'subject_superclass'] = 'Natural Sciences'

        elif ('electr' in subj_label or 'engineer' in subj_label or 
              'material' in subj_label or 'mechanism' in subj_label):
            nodes.loc[i, 'subject_class'] = 'Engineering'
            nodes.loc[i, 'subject_superclass'] = 'Applied Sciences'

        elif ('artificial int' in subj_label or 'computer' in subj_label):
            nodes.loc[i, 'subject_class'] = 'Computer science'
            nodes.loc[i, 'subject_superclass'] = 'Formal Sciences'

        elif ('math' in subj_label):
            nodes.loc[i, 'subject_class'] = 'Mathematics'
            nodes.loc[i, 'subject_superclass'] = 'Formal Sciences'

        elif ('demograph' in subj_label):
            nodes.loc[i, 'subject_class'] = 'Statistics'
            nodes.loc[i, 'subject_superclass'] = 'Formal Sciences'

        elif ('educat' in subj_label):
            nodes.loc[i, 'subject_class'] = 'Education'
            nodes.loc[i, 'subject_superclass'] = 'Applied Sciences'

        elif ('politica' in subj_label):
            nodes.loc[i, 'subject_class'] = 'Political science'
            nodes.loc[i, 'subject_superclass'] = 'Social Sciences'
            
        else:
            nodes.loc[i, 'subject_class'] = 'general'
            nodes.loc[i, 'subject_superclass'] = 'general'


In [24]:
nodes = makeNodes(df)
addLabelClasses(nodes)

In [25]:
edges = makeEdges(df, nodes)

In [26]:
nodes

Unnamed: 0,Count,ID,Label,shares,total_angry,total_haha,total_love,total_sad,total_wow,total_reactions,positive_reactions,negative_reactions,sentiment,subject_class,subject_superclass
0,12.0,0.0,cardiology,1293.0,2.0,14.0,106.0,3.0,108.0,233.0,214.0,5.0,0.896996,Medicine and Health,Applied Sciences
1,322.0,1.0,medicine,47120.0,988.0,202.0,3117.0,1475.0,4558.0,10340.0,7675.0,2463.0,0.504062,Medicine and Health,Applied Sciences
2,363.0,2.0,health sciences,42632.0,779.0,210.0,3356.0,1010.0,3428.0,8783.0,6784.0,1789.0,0.568712,Medicine and Health,Applied Sciences
3,9.0,3.0,cardiovascular medicine and haematology,1050.0,2.0,14.0,88.0,3.0,108.0,215.0,196.0,5.0,0.888372,Medicine and Health,Applied Sciences
4,17.0,4.0,internalmedicine,1440.0,1.0,4.0,79.0,13.0,45.0,142.0,124.0,14.0,0.774648,Medicine and Health,Applied Sciences
5,45.0,5.0,"biochemistry, genetics and molecular biology",5764.0,38.0,26.0,447.0,143.0,300.0,954.0,747.0,181.0,0.593291,Biology,Natural Sciences
6,68.0,6.0,life sciences,13676.0,134.0,42.0,6869.0,334.0,952.0,8331.0,7821.0,468.0,0.882607,Medicine and Health,Applied Sciences
7,119.0,7.0,medical and health sciences,18877.0,440.0,66.0,1211.0,546.0,1734.0,3997.0,2945.0,986.0,0.490118,Medicine and Health,Applied Sciences
8,45.0,8.0,general,11048.0,4145.0,197.0,944.0,2879.0,1321.0,9486.0,2265.0,7024.0,-0.501687,general,general
9,13.0,9.0,publichealth,663.0,23.0,1.0,24.0,132.0,55.0,235.0,79.0,155.0,-0.323404,Medicine and Health,Applied Sciences


In [27]:
edges

Unnamed: 0,Source,Source_name,Target,Target_name,Type,Weight
0,0.0,cardiology,1.0,medicine,Undirected,10.0
1,0.0,cardiology,2.0,health sciences,Undirected,10.0
2,0.0,cardiology,3.0,cardiovascular medicine and haematology,Undirected,9.0
3,1.0,medicine,2.0,health sciences,Undirected,296.0
4,1.0,medicine,3.0,cardiovascular medicine and haematology,Undirected,9.0
5,2.0,health sciences,3.0,cardiovascular medicine and haematology,Undirected,9.0
6,1.0,medicine,5.0,"biochemistry, genetics and molecular biology",Undirected,46.0
7,1.0,medicine,6.0,life sciences,Undirected,51.0
8,1.0,medicine,1.0,medicine,Undirected,80.0
9,1.0,medicine,7.0,medical and health sciences,Undirected,151.0


In [28]:
(set(nodes.subject_class.values))

{'Archaeology',
 'Astrophysics',
 'Biology',
 'Chemistry',
 'Computer science',
 'Economics',
 'Education',
 'Engineering',
 'Environmental sciences',
 'Mathematics',
 'Medicine and Health',
 'Nutritional science',
 'Physics',
 'Political science',
 'Psychology',
 'Social Sciences',
 'Sociology',
 'Statistics',
 'general'}

In [29]:
(set(nodes.subject_superclass.values))

{'Applied Sciences',
 'Formal Sciences',
 'Natural Sciences',
 'Social Sciences',
 'general'}

In [13]:
nodes.to_csv("node_list.csv", index=False)
edges.to_csv("edge_list.csv", index=False)