In [None]:
#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
# PRT820: THE INFLUENCE OF POST-PUBLICATION CORRESPONDENCE ON RESEARCH PAPERS                #
# STUDENT: ANNE TA - S359453                                                                 #
# Code Objective: Develop Influence Indicator for each PPC based on average citation count   # 
# before and after PPC                                                                       #
#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#

In [3]:
#-------------------------------------------------------------
# DEFINE BASIC FUNCTIONS FOR READING DATA FROM FILES
#-------------------------------------------------------------
import pandas as pd
#=======================================
# Look up a named variable (e.g. a file path) from the variables file
#=======================================
def get_var(var_name):
    """Return the value stored under ``var_name`` in ``variable/variable.txt``.

    The file is read relative to the current working directory and is parsed
    as one ``name,value`` pair per line. Only the first comma separates the
    name from the value, so values that themselves contain commas are kept
    intact. Lines without a comma are ignored, surrounding whitespace is
    stripped from both name and value, and when a name appears more than once
    the last occurrence wins.

    Parameters:
        var_name: Name of the variable to look up.

    Returns:
        The value associated with ``var_name`` as a string.

    Raises:
        KeyError: If ``var_name`` is not defined in the file.
        OSError: If the variables file cannot be opened.
    """
    variable_filename = "variable/variable.txt"

    variables = {}
    with open(variable_filename, 'r') as file:
        for line in file:
            # Split on the first comma only; the remainder is the value.
            name, separator, value = line.strip().partition(',')
            if not separator:
                # Blank line or a line without a "name,value" shape.
                continue
            variables[name.strip()] = value.strip()

    try:
        return variables[var_name]
    except KeyError:
        raise KeyError(
            "variable {!r} is not defined in {}".format(var_name, variable_filename)
        ) from None

#================================================================
# Define a function to read data from a CSV file into a DataFrame
#================================================================
def read_csv(filename, ec='ISO-8859-1'):
    """Load a CSV file into a pandas DataFrame.

    Parameters:
        filename: Path of the CSV file to read.
        ec: Text encoding passed to ``pd.read_csv`` (defaults to ISO-8859-1).

    Returns:
        The loaded DataFrame, or ``None`` when reading fails. Failures are
        reported on standard output instead of being raised, so callers must
        check for ``None`` themselves.
    """
    loaded = None
    try:
        loaded = pd.read_csv(filename, encoding=ec)
    except FileNotFoundError:
        # The path does not exist
        print("File not found. Please check the file path.")
    except Exception as err:
        # Anything else that went wrong while reading or parsing the file
        print("An error occurred:", err)
    return loaded

In [9]:
#-------------------------------------------------------------
# FEATURE ENGINEERING
# - Average Citation before PPC
# - Average Citation after PPC
#-------------------------------------------------------------

# Resolve the input/output file paths from the variables file
ppc_article_citation_filepath = get_var('ppc_article_citation_yearfilter_filepath')
ppc_article_citation_count_yearfilter_filepath = get_var('ppc_article_citation_count_yearfilter_filepath')

# Columns identifying one (original article, PPC) pair. Shared by the group-by
# aggregation and the before/after merge so that both always use the same keys.
PAIR_KEY_COLUMNS = ['Journal', 'DOI_OA', 'PPC_DOI', 'Year', 'Year_OA', 'Year_Range', 'Article_Age_At_PPC']


def _average_citation_per_pair(citation_df, average_column):
    """Sum CitationCount per (article, PPC) pair and divide it by Year_Range.

    Parameters:
        citation_df: Citation rows holding the PAIR_KEY_COLUMNS and 'CitationCount'.
        average_column: Name of the column that receives the per-year average.

    Returns:
        A DataFrame with one row per pair containing the key columns, the
        summed 'CitationCount' and ``average_column``.
    """
    summed = citation_df.groupby(PAIR_KEY_COLUMNS).agg({
        'CitationCount': 'sum'
    }).reset_index()
    summed[average_column] = summed['CitationCount'] / summed['Year_Range']
    return summed


#==========================================================================================================
# STEP 1: Read ppc data which is already filtered with time span
#==========================================================================================================
raw_df = read_csv(ppc_article_citation_filepath)

# read_csv reports failures by printing and returning None; stop here with a
# clear message instead of failing later with an AttributeError on None.
if raw_df is None:
    raise RuntimeError("Could not load citation data from {!r}".format(ppc_article_citation_filepath))

# Work on a copy so the raw data stays untouched
data_df = raw_df.copy()

# Keep only the columns needed for feature engineering and drop exact duplicate rows
data_df = data_df[['Journal','PPC_DOI', 'DOI_OA', 'Year_OA', 'Year', 'Year_CIT', 'CitationCount', 'Article_Age_At_PPC']].drop_duplicates()

#==========================================================================================================
# STEP 2: Filter the citation data before PPC (from Year_OA to Year of PPC)
#==========================================================================================================
# Number of calendar years from the article's year (Year_OA) up to and including the PPC year (Year)
data_df['Year_Range'] = (data_df['Year'] - data_df['Year_OA'] + 1)
# NOTE(review): test.csv looks like a debugging snapshot - confirm whether it is still needed
data_df.to_csv("test.csv",index=False)

# Citations made up to and including the PPC year
df_before_ppc = data_df[data_df['Year_CIT'] <= data_df['Year']]
# NOTE(review): test1.csv looks like a debugging snapshot - confirm whether it is still needed
df_before_ppc.to_csv("test1.csv",index=False)

#==========================================================================================================
# STEP 3: Calculate the average citation count before PPC for each OA and PPC pair
#==========================================================================================================
average_citation_before = _average_citation_per_pair(df_before_ppc, 'Average_Citation_Before')

#==========================================================================================================
# STEP 4: Calculate the citation data after PPC (from the year after PPC to max(Year_CIT))
#==========================================================================================================
# Citations made strictly after the PPC year
df_after_ppc = data_df[data_df['Year_CIT'] > data_df['Year']]

# NOTE(review): the after-PPC sum is divided by the same Year_Range as the
# before-PPC sum (Year - Year_OA + 1). This is only a per-year average if the
# upstream year filter keeps a post-PPC window of equal length - confirm.
average_citation_after = _average_citation_per_pair(df_after_ppc, 'Average_Citation_After')

#==========================================================================================================
# STEP 5: Merge the results for before and after PPC
#==========================================================================================================
# Outer merge so pairs that only have citations before (or only after) the PPC are kept
average_citations = pd.merge(average_citation_before,
                             average_citation_after,
                             on=PAIR_KEY_COLUMNS,
                             how='outer', suffixes=('_beforePPC', '_afterPPC'))

# Select the final data for average citations before and after PPC.
# The explicit copy makes the later in-place column assignment unambiguous
# (no SettingWithCopyWarning on a column slice).
average_citations = average_citations[['Journal', 'PPC_DOI', 'Year', 'DOI_OA', 'Year_OA',
       'Article_Age_At_PPC', 'Average_Citation_Before', 'Average_Citation_After']].copy()

#==========================================================================================================
# STEP 6: Handle the case missing data then develop new feature Influence Indicator
#==========================================================================================================
# A pair missing from one side of the outer merge has no citations in that period: treat as 0
average_citations = average_citations.fillna(0)

# Influence is 1 when the average after the PPC is strictly higher than before, otherwise 0
average_citations['Influence'] = (
    average_citations['Average_Citation_After'] > average_citations['Average_Citation_Before']
).astype('int64')

#==========================================================================================================
# STEP 7: Merge data with abstract classification
#==========================================================================================================
# Read PPC with abstract classification
ppc_abstract_topic_filepath = get_var('ppc_abstract_topic_filepath')
raw_clf_abstract_df = read_csv(ppc_abstract_topic_filepath)
if raw_clf_abstract_df is None:
    raise RuntimeError("Could not load abstract classification data from {!r}".format(ppc_abstract_topic_filepath))

# Rename the columns by position.
# NOTE(review): this assumes the file has exactly these eight columns in this order - confirm
raw_clf_abstract_df.columns = ['DOI', 'PPC_Year', 'Abstract', 'Agree', 'Clarification', 'Question', 'Recommendation', 'Disagree']

# Attach the classification columns to every (article, PPC) pair via the PPC's DOI
analysingdata_df = pd.merge(average_citations, raw_clf_abstract_df, left_on='PPC_DOI', right_on='DOI', how='left', suffixes=('', '_COM'))
analysingdata_df = analysingdata_df[['Journal', 'PPC_DOI', 'Year', 'DOI_OA', 'Year_OA', 'Article_Age_At_PPC',
       'Average_Citation_Before', 'Average_Citation_After', 'Agree', 'Clarification', 'Question',
       'Recommendation', 'Disagree', 'Influence']]

#==========================================================================================================
# STEP 8: Store the citation data to csv file
#==========================================================================================================
ppc_analysing_data_yearfilter_filepath = get_var('ppc_analysing_data_yearfilter_filepath')
analysingdata_df.to_csv(ppc_analysing_data_yearfilter_filepath, index=False)