# Import dependencies

In [1]:
import pandas as pd
import os
import glob

# Sort raw Scopus data

* Consolidate different years into one csv file
* Remove redundant columns
* Add large data, e.g. abstracts, to a separate csv file

__Add abstracts to separate csv__

In [2]:
# Move into the 'scopus' sub-directory of the current working directory;
# the CSV files are globbed from there. os.path.join supplies the
# platform's path separator, so this is not tied to Windows' backslash.
os.chdir(os.path.join(os.getcwd(), 'scopus'))

In [3]:
def get_csvs():
    """Return the names of all ``*.csv`` files in the current working directory.

    ``glob.glob`` yields its matches in arbitrary, filesystem-dependent
    order, so the names are sorted to keep the row order of anything built
    from this list reproducible from run to run.

    Returns
    -------
    list of str
        Matching file names in ascending order; empty if there are none.
    """
    return sorted(glob.glob('*.csv'))
    

def create_abstract_df(csv_files):
    """Build one table of abstracts from several CSV exports.

    Only the ``eid``, ``doi`` and ``description`` columns are read from each
    file (``description`` presumably holds the abstract text -- confirm
    against the export format). The per-file frames are stacked row-wise
    and given a fresh 0..n-1 index.

    Parameters
    ----------
    csv_files : iterable of str
        Paths of the CSV files to read.

    Returns
    -------
    pandas.DataFrame
        The selected columns of every input file, concatenated row-wise.

    Raises
    ------
    ValueError
        If ``csv_files`` is empty.
    """
    wanted = ['eid', 'doi', 'description']

    frames = [pd.read_csv(path, usecols=wanted) for path in csv_files]

    if not frames:
        # pd.concat would fail here with an opaque "No objects to
        # concatenate"; raise the same exception type with a message that
        # points at the likely cause instead.
        raise ValueError(
            'No CSV files to read: csv_files is empty '
            '(is the working directory correct?)'
        )

    return pd.concat(frames, ignore_index=True)


# Collect the CSV file names from the current working directory and build
# the abstracts table from them. csv_files is reused later when the
# paper/affiliation table is built.
csv_files = get_csvs()
abstract_df = create_abstract_df(csv_files)

In [5]:
# Step up to the parent directory, then into its 'final_data' sub-directory.
# os.path.join supplies the platform's path separator, so this is not tied
# to Windows' backslash.
os.chdir('..')
os.chdir(os.path.join(os.getcwd(), 'final_data'))
# NOTE: to_csv keeps its default index=True, so the row index is written as
# the first (unnamed) column of abstracts.csv.
abstract_df.to_csv('abstracts.csv')

__Remove redundant columns__

Keep: eid, doi, title, afid (multiple IDs separated by semicolons), coverDate, source_id, publicationName, citedby_count.

Could this method artificially increase the number of records? If there are two authors from the same institution working on the same paper, this will show up twice in the records. Should I therefore remove duplicates?

The above is not true! AFIDs are listed only once if there are multiple authors from the same institution.

In [6]:
# Return to the parent directory and re-enter 'scopus', where the CSV files
# named in csv_files live. os.path.join supplies the platform's path
# separator, so this is not tied to Windows' backslash.
os.chdir('..')
os.chdir(os.path.join(os.getcwd(), 'scopus'))

The problem with the below methodology is that I think I'm exploding non-UK affiliations. Also, there are some papers that were thrown out due to e.g. inconsistent lengths of afid and list of names. 

Is it actually expanding all affiliations? Going from papers to paper-afid only doubles the number of entries.

In [7]:
def remove_columns(csv_files):
    """Load the retained columns and expand each paper to one row per afid.

    For every file only ``eid``, ``doi``, ``title``, ``afid``, ``coverDate``,
    ``source_id``, ``publicationName`` and ``citedby_count`` are read. The
    semicolon-separated ``afid`` field is split and exploded, so a paper
    listing k afids becomes k rows that differ only in ``afid``. Rows with a
    missing ``afid`` are kept as a single row with a missing value.

    Parameters
    ----------
    csv_files : iterable of str
        Paths of the CSV files to read.

    Returns
    -------
    pandas.DataFrame
        All files concatenated row-wise with a fresh 0..n-1 index; ``afid``
        holds one ID (as text) per row.

    Raises
    ------
    ValueError
        If ``csv_files`` is empty.
    """
    columns = ['eid', 'doi', 'title', 'afid', 'coverDate', 'source_id',
               'publicationName', 'citedby_count']

    frames = []
    for path in csv_files:
        # Force afid to text. Left to type inference, a file in which every
        # paper has a single numeric afid is parsed as a number column and
        # .str.split raises AttributeError; a mixed-type column would
        # silently turn the numeric entries into NaN.
        frame = pd.read_csv(path, usecols=columns, dtype={'afid': str})
        frame['afid'] = frame['afid'].str.split(';')
        frames.append(frame.explode('afid').reset_index(drop=True))

    if not frames:
        # pd.concat would fail here with an opaque "No objects to
        # concatenate"; raise the same exception type with a clearer message.
        raise ValueError(
            'No CSV files to read: csv_files is empty '
            '(is the working directory correct?)'
        )

    return pd.concat(frames, ignore_index=True)
        
        
# Build the paper/affiliation table (one row per paper-afid pair) from the
# CSV files collected earlier.
df = remove_columns(csv_files)

In [8]:
# Step up to the parent directory, then into its 'final_data' sub-directory.
# os.path.join supplies the platform's path separator, so this is not tied
# to Windows' backslash.
os.chdir('..')
os.chdir(os.path.join(os.getcwd(), 'final_data'))

In [9]:
# Save the paper/affiliation table to the current working directory.
# NOTE: to_csv keeps its default index=True, so the row index is written as
# the first (unnamed) column of papers.csv.
df.to_csv('papers.csv')

# Sort affiliations

Not needed