In [None]:
#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
# PRT820: THE INFLUENCE OF POST-PUBLICATION CORRESPONDENCE ON RESEARCH PAPERS                #
# STUDENT: ANNE TA - S359453                                                                 #
# Code Objective: Using Zero-Shot Classification to categorize PPC topics                    #
#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#

In [3]:
#-------------------------------------------------------------
# DEFINE BASIC FUNCTIONS FOR READING DATA FROM FILES
#-------------------------------------------------------------
import pandas as pd
#=======================================
# Look up a named variable (e.g. a file path) in the variable file
#=======================================
def get_var(var_name, variable_filename="variable/variable.txt"):
    """Look up a named setting in a ``name, value`` text file.

    Each useful line of the file has the form ``name, value``.  Only the first
    comma separates the name from the value, so values that themselves contain
    commas (for example file paths) are kept intact.  Lines without a comma
    (blank lines, stray text) are ignored.  If a name appears more than once,
    the last occurrence wins.

    Parameters
    ----------
    var_name : str
        Name of the variable to look up.
    variable_filename : str, optional
        Path of the settings file.  Defaults to ``"variable/variable.txt"``.

    Returns
    -------
    str
        The value with surrounding whitespace removed.

    Raises
    ------
    FileNotFoundError
        If the settings file does not exist.
    KeyError
        If ``var_name`` is not defined in the file.
    """
    variables = {}

    # Stream the file line by line; the context manager closes it for us
    with open(variable_filename, 'r') as file:
        for line in file:
            # partition() splits on the FIRST comma only, so commas inside the
            # value do not cause the line to be discarded
            name, separator, value = line.strip().partition(',')
            if not separator:
                continue
            variables[name.strip()] = value.strip()

    if var_name not in variables:
        # Still a KeyError (as before), but now it says where we looked
        raise KeyError(f"Variable '{var_name}' is not defined in {variable_filename}")
    return variables[var_name]

#================================================================
# Define a function to read data from a CSV file into a DataFrame
#================================================================
def read_csv(filename, ec='ISO-8859-1'):
    """Load a CSV file into a pandas DataFrame.

    Parameters
    ----------
    filename : str
        Path of the CSV file to read.
    ec : str, optional
        Text encoding passed to ``pd.read_csv``.  Defaults to ``'ISO-8859-1'``.

    Returns
    -------
    pandas.DataFrame or None
        The loaded data, or ``None`` when reading failed.  Failures are not
        raised; a message is printed instead, so callers must check for ``None``.
    """
    loaded = None
    try:
        loaded = pd.read_csv(filename, encoding=ec)
    except FileNotFoundError:
        # The path does not point at an existing file
        print("File not found. Please check the file path.")
    except Exception as err:
        # Anything else (parse errors, empty file, bad encoding, ...)
        print("An error occurred:", err)
    return loaded

In [None]:
from transformers import pipeline
import pandas as pd
import re

# Initialize the zero-shot classification pipeline with the facebook/bart-large-mnli model.
# The resulting `classifier` object is created once here and reused by the classification cell below.
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")



In [6]:
# Look up the location of the raw PPC data in the variable file
raw_data_filepath = get_var('raw_data_filepath')

# Load the raw PPC records from the CSV file into a DataFrame
ppc_article_df = read_csv(raw_data_filepath)

# read_csv() signals a failed load by returning None, so stop here with a
# clear message instead of failing on the next line with an AttributeError.
if ppc_article_df is None:
    raise RuntimeError(f"Could not load raw data from '{raw_data_filepath}'")

# Keep only the columns needed for classification and give them the names
# used downstream. Selecting with a list returns a new DataFrame, so the raw
# frame is left untouched; the explicit mapping avoids relying on column order.
ppc_df = ppc_article_df[['PPC_DOI', 'Year', 'Abstract']].rename(
    columns={'PPC_DOI': 'DOI', 'Year': 'PPC_Year'}
)
print("Count records: ", len(ppc_df))

# Drop rows that are exact duplicates across all three columns
ppc_df = ppc_df.drop_duplicates()
print("After removing duplication: ", len(ppc_df))


Count records:  300
After removing duplication:  300


In [None]:
import concurrent.futures

# Pair up the DOI, year and abstract of every record for classification.
# Note: zip() yields a one-shot iterator, so it can only be consumed once.
classification_columns = ('DOI', 'PPC_Year', 'Abstract')
ids_and_sequences = zip(*(ppc_df[column].tolist() for column in classification_columns))

# Candidate labels offered to the zero-shot classifier
candidate_labels = [
    'agree',
    'disagree',
    'clarification',
    'question',
    'recommendation',
]

# Function to exclude text starting with "©"
def exclude_copyright(text):
    """Remove the copyright notice from ``text``.

    Everything from the first "©" character to the end of the string is
    dropped.  ``re.DOTALL`` lets the match run across line breaks, so a notice
    that spans several lines is removed completely rather than left in place.
    Text without a "©" is returned unchanged.

    Parameters
    ----------
    text : str
        The text to clean.

    Returns
    -------
    str
        ``text`` up to (not including) the first "©".
    """
    return re.sub(r'©.*$', '', text, flags=re.DOTALL)

# Function to classify a single record
def classify_sequence(doi, title, sequence_to_classify):
    """Score one abstract against every candidate label.

    Parameters
    ----------
    doi : object
        Identifier of the record; copied to the ``'DOI'`` key of the result.
    title : object
        Despite the name, callers pass the record's year here; it is copied
        to the ``'PPC_Year'`` key of the result.
    sequence_to_classify : str
        The abstract text.  The copyright notice is removed before scoring.

    Returns
    -------
    dict
        One entry per label returned by the classifier (label -> score), plus
        ``'DOI'``, ``'PPC_Year'`` and ``'Abstract'`` (the cleaned text).
    """
    cleaned_text = exclude_copyright(sequence_to_classify)
    prediction = classifier(cleaned_text, candidate_labels)

    # Label scores first, then the identifying fields of the record
    record = {
        label: score
        for label, score in zip(prediction['labels'], prediction['scores'])
    }
    record.update({'DOI': doi, 'PPC_Year': title, 'Abstract': cleaned_text})
    return record

# Classify sequences using multithreading
import os

results = []
with concurrent.futures.ThreadPoolExecutor() as executor:
    # Submit every record up front and keep the DOI next to its future so a
    # failure can be reported against the right record.
    submitted = [
        (doi, executor.submit(classify_sequence, doi, title, seq))
        for doi, title, seq in ids_and_sequences
    ]
    # Collect in submission order (not completion order) so the rows of the
    # output file follow the input order and are identical between runs.
    for doi, future in submitted:
        try:
            results.append(future.result())
        except Exception as e:
            # Report and skip records that fail so one bad abstract does not
            # abort the whole run.
            print(f"Failed to classify sequence for {doi}: {e}")

# Create a DataFrame from the results
final_df = pd.DataFrame(results)

# Reorder columns to match the desired structure
final_df = final_df[['DOI', 'PPC_Year', 'Abstract', 'agree', 'clarification', 'question', 'recommendation', 'disagree']]

# Store the final results to a CSV file; create the output folder first so
# the export does not fail on a fresh checkout.
os.makedirs("result", exist_ok=True)
final_df.to_csv("result/ppc_abstract_topic.csv", index=False)