In [None]:
#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
# PRT820: THE INFLUENCE OF POST-PUBLICATION CORRESPONDENCE ON RESEARCH PAPERS                #
# STUDENT: ANNE TA - S359453                                                                 #
# Code Objective: Filter data based on OA & PPC publish year for the average citation before #
# and after PPC                                                                              # 
#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#

In [2]:
#-------------------------------------------------------------
# DEFINE BASIC FUNCTIONS FOR READING DATA FROM FILE
#-------------------------------------------------------------
import pandas as pd
#=======================================
# Define a function to look up a variable (e.g. a file path) stored in the variable file
#=======================================
def get_var(var_name):
    """Look up a configured value by name in ``variable/variable.txt``.

    The file is read as one ``name,value`` pair per line. Only the first
    comma on a line separates the name from the value, so a value may itself
    contain commas (for example a file path with a comma in it). Lines
    without any comma are ignored. Surrounding whitespace is stripped from
    both the name and the value. If a name appears more than once, the last
    occurrence wins.

    Parameters
    ----------
    var_name : str
        Name of the variable to look up.

    Returns
    -------
    str
        The value stored for ``var_name``.

    Raises
    ------
    FileNotFoundError
        If the variable file does not exist.
    KeyError
        If ``var_name`` is not defined in the variable file.
    """
    variable_filename = "variable/variable.txt"

    # Map each variable name to its value
    variables = {}

    with open(variable_filename, 'r') as file:
        for line in file:
            # Split on the FIRST comma only so commas inside the value survive
            name, separator, value = line.strip().partition(',')
            if not separator:
                # Blank line or a line without a "name,value" structure
                continue
            variables[name.strip()] = value.strip()

    if var_name not in variables:
        # Same exception type as a plain dict lookup, but with a usable message
        raise KeyError(
            f"Variable '{var_name}' is not defined in {variable_filename}"
        )

    return variables[var_name]

#================================================================
# Define a function to read data from a CSV file into a DataFrame
#================================================================
def read_csv(filename, ec='ISO-8859-1'):
    """Load a CSV file into a pandas DataFrame.

    Parameters
    ----------
    filename : str
        Path of the CSV file to read.
    ec : str, optional
        Text encoding passed to ``pd.read_csv`` (default ``'ISO-8859-1'``).

    Returns
    -------
    pandas.DataFrame or None
        The loaded data. If reading fails for any reason, a message is
        printed and ``None`` is returned, so callers should check the result
        before using it.
    """
    try:
        return pd.read_csv(filename, encoding=ec)
    except FileNotFoundError:
        # The path does not point to an existing file
        print("File not found. Please check the file path.")
    except Exception as e:
        # Any other failure while reading or parsing the file
        print("An error occurred:", e)
    # Reached only when one of the handlers above ran
    return None

In [3]:
#-------------------------------------------------------------
# MERGE CITATION OF ORIGINAL DATA WITH FINAL DATA
#-------------------------------------------------------------

# Get file path from variable
ppc_article_citation_filepath = get_var('ppc_article_citation_filepath')  # not used in this cell
ppc_article_citation_yearfilter_filepath = get_var('ppc_article_citation_yearfilter_filepath')
ppc_article_filepath = get_var('raw_data_filepath')
article_citation_filepath = get_var('article_citation_filepath')

#==========================================================================================================
# STEP 1: Run basic function to read data from csv file
# citation_df: citation data includes original article (OA) DOI, citation year and citation count
# findata_df: ppc data both PPC DOI and OA DOI
#==========================================================================================================
# Read original articles' citation data from csv file
citation_df = read_csv(article_citation_filepath)
# read_csv prints a message and returns None when loading fails; stop here with a
# clear error instead of failing later with an AttributeError on None
if citation_df is None:
    raise RuntimeError(f"Could not load citation data from: {article_citation_filepath}")
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line to the output
citation_df.info()
# Filter out records where Citation count is 0
citation_df = citation_df[citation_df['CitationCount'] != 0]

# Read ppc dataset from csv file
findata_df = read_csv(ppc_article_filepath)
if findata_df is None:
    raise RuntimeError(f"Could not load PPC data from: {ppc_article_filepath}")
findata_df = findata_df[['Journal','PPC_DOI','Year', 'DOI_OA', 'Year_OA']]

#==========================================================================================================
# STEP 2: Merge citation data back to ppc data
# Both frames have a 'Year' column: the ppc one keeps its name, the citation one becomes 'Year_CIT'
#==========================================================================================================
merged_citation_df = pd.merge(findata_df, citation_df, on='DOI_OA', how='left', suffixes=('', '_CIT'))
print("before dropping duplicate: ", len(merged_citation_df))
merged_citation_df = merged_citation_df.drop_duplicates()
print("after dropping duplicate: ", len(merged_citation_df))


#==========================================================================================================
# STEP 3: Define time span to calculate Average Citation of an article before and after PPC
# This step also generates a new feature 'Article Age At PPC' is the duration between OA and PPC
#==========================================================================================================
merged_citation_df["Article_Age_At_PPC"] = merged_citation_df["Year"] - merged_citation_df["Year_OA"]
# GAP_Year = number of calendar years from Year_OA to the PPC year ('Year'), both inclusive
merged_citation_df["GAP_Year"] = merged_citation_df["Article_Age_At_PPC"] +1
# Offset from Year_OA to the last year of a window that is 2 * GAP_Year years long
count_year = merged_citation_df["GAP_Year"]*2-1

# Despite its name, this column holds the LAST year of the citation window:
# Year_OA + 2*GAP_Year - 1 == Year + Article_Age_At_PPC + 1,
# i.e. the window extends as many years after the PPC year as there are from Year_OA up to it
merged_citation_df['PPC_Publication_Year'] = merged_citation_df['Year_OA'] + count_year
# NOTE(review): this looks like a leftover debug dump of the intermediate table into the
# working directory; kept to preserve the cell's side effects - confirm and remove if unused
merged_citation_df.to_csv('test.csv', index=False)
# Define the range of years to keep (from Year_OA to the window end computed above)
merged_citation_df['Lower_Year_Limit'] = merged_citation_df['Year_OA']
merged_citation_df['Upper_Year_Limit'] = merged_citation_df['PPC_Publication_Year']

# info() prints directly (see STEP 1)
merged_citation_df.info()
# Group by original article (DOI_OA) and calculate the combined year range
ppc_grouped_df = merged_citation_df.groupby('DOI_OA').agg({
    'Lower_Year_Limit': 'min',  # Minimum Lower Year Limit for the article
    'Upper_Year_Limit': 'max'   # Maximum Upper Year Limit based on all PPCs for the article
}).reset_index()

# Merge the combined year limits back to the original citation data
# (the per-article limits get the '_Combined' suffix)
merged_citation_df = pd.merge(merged_citation_df, ppc_grouped_df, on='DOI_OA', suffixes=('', '_Combined'))

#==========================================================================================================
# STEP 4: Filter the records to keep only those within the time span defined in step 3
# Rows whose Year_CIT is missing fail both comparisons and are dropped here as well
#==========================================================================================================
filtered_ppc_data = merged_citation_df[
    (merged_citation_df['Year_CIT'] >= merged_citation_df['Lower_Year_Limit_Combined']) &
    (merged_citation_df['Year_CIT'] <= merged_citation_df['Upper_Year_Limit_Combined'])
]

# Keep only the necessary columns
filtered_ppc_data = filtered_ppc_data[['Journal','PPC_DOI', 'Year', 'DOI_OA', 'Year_OA', 'Year_CIT', 'CitationCount', 'Article_Age_At_PPC']]

#==========================================================================================================
# STEP 5: Store the citation data to csv file
#==========================================================================================================
filtered_ppc_data.to_csv(ppc_article_citation_yearfilter_filepath, index=False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13884 entries, 0 to 13883
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Year           13884 non-null  int64 
 1   CitationCount  13884 non-null  int64 
 2   DOI_OA         13884 non-null  object
dtypes: int64(2), object(1)
memory usage: 325.5+ KB
None
before dropping duplicate:  5125
after dropping duplicate:  5125
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5125 entries, 0 to 5124
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Journal               5125 non-null   object
 1   PPC_DOI               5125 non-null   object
 2   Year                  5125 non-null   int64 
 3   DOI_OA                5125 non-null   object
 4   Year_OA               5125 non-null   int64 
 5   Year_CIT              5125 non-null   int64 
 6   CitationCount         5125 non-null   int64

In [4]:
# Count the distinct PPC DOIs remaining in the filtered data
# (this is the number of unique PPCs, not the number of rows)
filtered_ppc_data['PPC_DOI'].nunique()

300

In [5]:
# Display the final filtered data (one row per PPC / original article / citation year)
# to check its content
filtered_ppc_data

Unnamed: 0,Journal,PPC_DOI,Year,DOI_OA,Year_OA,Year_CIT,CitationCount,Article_Age_At_PPC
0,PNAS,10.1073/pnas.1606615113,2016,10.1073/pnas.1519019113,2016,2016,22,0
1,PNAS,10.1073/pnas.1606615113,2016,10.1073/pnas.1519019113,2016,2017,38,0
9,PNAS,10.1073/pnas.1614961114,2017,10.1073/pnas.1602413113,2016,2016,45,1
10,PNAS,10.1073/pnas.1614961114,2017,10.1073/pnas.1602413113,2016,2017,430,1
11,PNAS,10.1073/pnas.1614961114,2017,10.1073/pnas.1602413113,2016,2018,436,1
...,...,...,...,...,...,...,...,...
5074,Nature,10.1038/s41586-022-04627-y,2014,10.1038/nature13429,2014,2014,2,0
5075,Nature,10.1038/s41586-022-04627-y,2014,10.1038/nature13429,2014,2015,18,0
5085,Nature,10.1038/296870a0,1982,10.1038/294125a0,1981,1982,1,1
5086,Nature,10.1038/296870a0,1982,10.1038/294125a0,1981,1983,4,1
