# Data Preprocessing

In [1]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import numpy as np
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import scipy.sparse
import joblib
from collections import Counter
import re
import string

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /home/brina/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/brina/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/brina/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

## Parameters

In [2]:
# select as needed

dataset = "train"
# dataset = "test"

## 1. Load dataset

In [3]:
df = pd.read_json("scicite/" + dataset + ".jsonl", lines=True)

display(
    df.head(),
    df.shape
)

Unnamed: 0,source,citeEnd,sectionName,citeStart,string,label,label_confidence,citingPaperId,citedPaperId,isKeyCitation,id,unique_id,excerpt_index,label2,label2_confidence
0,explicit,175.0,Introduction,168.0,"However, how frataxin interacts with the Fe-S ...",background,1.0,1872080baa7d30ec8fb87be9a65358cd3a7fb649,894be9b4ea46a5c422e81ef3c241072d4c73fdc0,True,1872080baa7d30ec8fb87be9a65358cd3a7fb649>894be...,1872080baa7d30ec8fb87be9a65358cd3a7fb649>894be...,11,,
1,explicit,36.0,Novel Quantitative Trait Loci for Seminal Root...,16.0,"In the study by Hickey et al. (2012), spikes w...",background,1.0,ce1d09a4a3a8d7fd3405b9328f65f00c952cf64b,b6642e19efb8db5623b3cc4eef1c5822a6151107,True,ce1d09a4a3a8d7fd3405b9328f65f00c952cf64b>b6642...,ce1d09a4a3a8d7fd3405b9328f65f00c952cf64b>b6642...,2,,
2,explicit,228.0,Introduction,225.0,"The drug also reduces catecholamine secretion,...",background,1.0,9cdf605beb1aa1078f235c4332b3024daa8b31dc,4e6a17fb8d7a3cada601d942e22eb5da6d01adbd,False,9cdf605beb1aa1078f235c4332b3024daa8b31dc>4e6a1...,9cdf605beb1aa1078f235c4332b3024daa8b31dc>4e6a1...,0,,
3,explicit,110.0,Discussion,46.0,By clustering with lowly aggressive close kin ...,background,1.0,d9f3207db0c79a3b154f3875c9760cc6b056904b,2cc6ff899bf17666ad35893524a4d61624555ed7,False,d9f3207db0c79a3b154f3875c9760cc6b056904b>2cc6f...,d9f3207db0c79a3b154f3875c9760cc6b056904b>2cc6f...,3,,
4,explicit,239.0,Discussion,234.0,Ophthalmic symptoms are rare manifestations of...,background,1.0,88b86556857f4374842d2af2e359576806239175,a5bb0ff1a026944d2a47a155462959af2b8505a8,False,88b86556857f4374842d2af2e359576806239175>a5bb0...,88b86556857f4374842d2af2e359576806239175>a5bb0...,2,,


(8243, 15)

## 2. Data Cleaning

##### Final Columns
* source: text -> categorical
* citeEnd: float
* citeStart: float
* sectionName: text -> categorical
* string: text -> embedding (vectorized)
* iskeyCitation: bool -> categorical
* excerpt index: int
* label: text -> categorical

#### 2.1 Drop unwanted columns

In [4]:
df = df.drop(columns=['label_confidence','citingPaperId','citedPaperId','id','unique_id','label2','label2_confidence'])

display(
    df.columns,
    df.shape
)

Index(['source', 'citeEnd', 'sectionName', 'citeStart', 'string', 'label',
       'isKeyCitation', 'excerpt_index'],
      dtype='object')

(8243, 8)

#### 2.2 Drop selected null rows

In [5]:
for column in df.columns:
    print(f"{column} null value:",sum(df[column].isnull()))

source null value: 2
citeEnd null value: 2
sectionName null value: 19
citeStart null value: 2
string null value: 0
label null value: 0
isKeyCitation null value: 0
excerpt_index null value: 0


In [6]:
df = df.dropna(subset=['source', 'citeEnd', 'citeStart'])
display(
    f"Number of null values in 'source': {sum(df['source'].isnull())}",
    f"Number of null values in 'citeEnd': {sum(df['citeEnd'].isnull())}",
    f"Number of null values in 'citeStart': {sum(df['citeStart'].isnull())}"
)

"Number of null values in 'source': 0"

"Number of null values in 'citeEnd': 0"

"Number of null values in 'citeStart': 0"

#### 2.3 One-hot encoding for 'source' and 'isKeyCitation' columns

In [7]:
# Initialize OneHotEncoder
one_hot_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

# Fit and transform the 'source' column
one_hot_encoded_source = one_hot_encoder.fit_transform(df[['source']])
one_hot_columns_source = one_hot_encoder.get_feature_names_out(['source'])
df_one_hot_source = pd.DataFrame(one_hot_encoded_source, columns=one_hot_columns_source, index=df.index)

# Fit and transform the 'isKeyCitation' column
one_hot_encoded_key_citation = one_hot_encoder.fit_transform(df[['isKeyCitation']])
one_hot_columns_key_citation = one_hot_encoder.get_feature_names_out(['isKeyCitation'])
df_one_hot_key_citation = pd.DataFrame(one_hot_encoded_key_citation, columns=one_hot_columns_key_citation, index=df.index)

# Concatenate the one-hot encoded DataFrames with the original DataFrame
df = pd.concat([df, df_one_hot_source, df_one_hot_key_citation], axis=1)

# Drop the original categorical columns
df.drop(columns=['source', 'isKeyCitation'], inplace=True)

# Show one-hot encoded DataFrame and the new shape
print("Source Mapping:", one_hot_columns_source)
print("Key Citation Mapping:", one_hot_columns_key_citation)
df.shape

Source Mapping: ['source_acronym' 'source_acronymParen' 'source_andPhrase'
 'source_etAlPhrase' 'source_explicit' 'source_properNoun']
Key Citation Mapping: ['isKeyCitation_False' 'isKeyCitation_True']


(8241, 14)

#### 2.4 Clean 'sectionName' column

- replace NaN and empty strings with "Unknown"

In [8]:
# before replacement
num_nan = df['sectionName'].isna().sum()
num_empty = (df['sectionName'] == '').sum()
print(f"NaN values before replacement: {num_nan}")
print(f"Empty strings before replacement: {num_empty}")

# replacement
df['sectionName'] = df['sectionName'].replace('', np.nan).fillna("Unknown")

# after replacement
num_nan = df['sectionName'].isna().sum()
num_empty = (df['sectionName'] == '').sum()
print(f"\nNaN values after replacement: {num_nan}")
print(f"Empty strings after replacement: {num_empty}")

NaN values before replacement: 19
Empty strings before replacement: 587

NaN values after replacement: 0
Empty strings after replacement: 0


- convert string to lowercase

In [9]:
print(f"Num unique strings before lowercase: {len(df['sectionName'].unique())}")
df.loc[:, 'sectionName'] = df['sectionName'].apply(str.lower)
print(f"Num unique strings after lowercase: {len(df['sectionName'].unique())}")

Num unique strings before lowercase: 1147
Num unique strings after lowercase: 1074


- remove numbers and punctuation

In [10]:
print(f"Num unique strings before removing: {len(df['sectionName'].unique())}")
df['sectionName'] = df['sectionName'].str.replace(r'[\d{}]+'.format(string.punctuation), '', regex=True).str.strip()
print(f"Num unique strings after removing: {len(df['sectionName'].unique())}")

Num unique strings before removing: 1074
Num unique strings after removing: 807


- lemmatization

In [11]:
# Initialize the lemmatizer
lemmatizer = WordNetLemmatizer()

def lemmatize_text(text):
    words = nltk.word_tokenize(text)  # Tokenize the text into words
    lemmatized_words = [lemmatizer.lemmatize(word) for word in words]  # Lemmatize each word
    return " ".join(lemmatized_words)

print(f"Num unique strings before lemmatization: {len(df['sectionName'].unique())}")
df['sectionName'] = df['sectionName'].apply(lemmatize_text)
print(f"Num unique strings after lemmatization: {len(df['sectionName'].unique())}")

Num unique strings before lemmatization: 807
Num unique strings after lemmatization: 794


- mapping to pre-defined categories

In [12]:
# to be mapped to 'background'
background_keywords = ['background', 'introduction', 'related work', 'relatedwork', 'overview', \
            'literature review', 'summary', 'state of the art', 'prior work']

pattern = re.compile('|'.join(background_keywords), re.IGNORECASE)

print(f"Num unique strings before mapping: {len(df['sectionName'].unique())}")
df['sectionName'] = df['sectionName'].apply(lambda text: 'background' if pattern.search(text) else text)
print(f"Num unique strings after mapping: {len(df['sectionName'].unique())}")

Num unique strings before mapping: 794
Num unique strings after mapping: 765


In [13]:
# to be mapped to 'method'
method_keywords = ['methodology', 'method', 'approach', 'design', \
            'proposed', 'experiment', 'implementation', 'procedure', \
            'description', 'technique', 'technical', 'formulation', \
            'solution', 'collection', 'describ', 'measure', 'model', \
            'empirical', 'study', 'setup', 'setting', 'system', 'detail']

pattern = re.compile('|'.join(method_keywords), re.IGNORECASE)

print(f"Num unique strings before mapping: {len(df['sectionName'].unique())}")
df['sectionName'] = df['sectionName'].apply(lambda text: 'method' if pattern.search(text) else text)
print(f"Num unique strings after mapping: {len(df['sectionName'].unique())}")

Num unique strings before mapping: 765
Num unique strings after mapping: 625


In [14]:
# to be mapped to 'result'
result_keywords = ['result', 'discussion', 'evaluation', 'limitation', 'implication', 'comparison', \
            'contribution', 'analysis', 'test', 'compare']

pattern = re.compile('|'.join(result_keywords), re.IGNORECASE)

print(f"Num unique strings before mapping: {len(df['sectionName'].unique())}")
df['sectionName'] = df['sectionName'].apply(lambda text: 'result' if pattern.search(text) else text)
print(f"Num unique strings after mapping: {len(df['sectionName'].unique())}")

Num unique strings before mapping: 625
Num unique strings after mapping: 557


In [15]:
# to be mapped to 'conclusion'
conclusion_keywords = ['conclusion', 'concluding', 'future']

pattern = re.compile('|'.join(conclusion_keywords), re.IGNORECASE)

print(f"Num unique strings before mapping: {len(df['sectionName'].unique())}")
df['sectionName'] = df['sectionName'].apply(lambda text: 'conclusion' if pattern.search(text) else text)
print(f"Num unique strings after mapping: {len(df['sectionName'].unique())}")

Num unique strings before mapping: 557
Num unique strings after mapping: 551


In [16]:
# List of valid section names
valid_sections = {'background', 'result', 'conclusion', 'method', 'unknown'}

# Replace any other section name with 'unknown'
print(f"Num unique strings before mapping: {len(df['sectionName'].unique())}")
df['sectionName'] = df['sectionName'].apply(lambda text: text if text in valid_sections else 'unknown')
print(f"Num unique strings after mapping: {len(df['sectionName'].unique())}")

Num unique strings before mapping: 551
Num unique strings after mapping: 5


In [17]:
# Initialize OneHotEncoder
one_hot_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

# Fit and transform the 'sectionName' column (apply one-hot encoding)
section_encoded = one_hot_encoder.fit_transform(df[['sectionName']])
one_hot_columns = one_hot_encoder.get_feature_names_out(['sectionName'])
df_one_hot = pd.DataFrame(section_encoded, columns=one_hot_columns, index=df.index)

# Concatenate the original DataFrame with the one-hot encoded columns
df = pd.concat([df, df_one_hot], axis=1)

# Drop the original 'sectionName' column
df.drop(columns=['sectionName'], inplace=True)

df.head()

Unnamed: 0,citeEnd,citeStart,string,label,excerpt_index,source_acronym,source_acronymParen,source_andPhrase,source_etAlPhrase,source_explicit,source_properNoun,isKeyCitation_False,isKeyCitation_True,sectionName_background,sectionName_conclusion,sectionName_method,sectionName_result,sectionName_unknown
0,175.0,168.0,"However, how frataxin interacts with the Fe-S ...",background,11,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
1,36.0,16.0,"In the study by Hickey et al. (2012), spikes w...",background,2,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
2,228.0,225.0,"The drug also reduces catecholamine secretion,...",background,0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
3,110.0,46.0,By clustering with lowly aggressive close kin ...,background,3,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
4,239.0,234.0,Ophthalmic symptoms are rare manifestations of...,background,2,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


#### 2.5 Clean 'string' column
- Remove citations (e.g., [12,22]).
- Convert text to lowercase.
- Remove punctuation.
- Remove numbers.
- Remove stopwords.
- Lemmatize words using NLTK.
- Replace multiple spaces with a single space.

In [18]:
# Initialize Lemmatizer
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# Function to clean text
def clean_text(text):
    if pd.isna(text):  # Handle NaN values
        return ""
    
    # Remove various citation formats
    text = re.sub(r"\[[\d,\s–-]+\]", "", text)  # Remove citations like [1–5], [21, 23, 33, 64, 100, 101, 107–118]
    text = re.sub(r"\([\d,\s–-]+\)", "", text)  # Remove citations like (96-98), (7)
    text = re.sub(r"\b[A-Z][a-z]+ et al\.\s*\(\d{4}\)", "", text)  # Doe et al. (2021)
    text = re.sub(r"\(\w+,\s*\d{4}(?:;\s*\w+,\s*\d{4})*\)", "", text)  # (Smith, 2020; Johnson, 2019)
    text = re.sub(r"\b(ICML|NeurIPS|CVPR|JMLR|arXiv|vol\.\s*\d+)\b", "", text)  # Journal/conference names

    text = text.lower()  # Convert to lowercase
    text = re.sub(r"[^\w\s]", "", text)  # Remove punctuation
    text = re.sub(r"\d+", "", text)  # Remove numbers
    
    tokens = word_tokenize(text)  # Tokenize text
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]  # Lemmatization & remove stopwords
    
    return " ".join(tokens)  # Reconstruct sentence

# Apply text cleaning function
df['string'] = df['string'].apply(clean_text)

# Display results
df.head()

Unnamed: 0,citeEnd,citeStart,string,label,excerpt_index,source_acronym,source_acronymParen,source_andPhrase,source_etAlPhrase,source_explicit,source_properNoun,isKeyCitation_False,isKeyCitation_True,sectionName_background,sectionName_conclusion,sectionName_method,sectionName_result,sectionName_unknown
0,175.0,168.0,however frataxin interacts fe cluster biosynth...,background,11,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
1,36.0,16.0,study hickey et al spike sampled field point p...,background,2,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
2,228.0,225.0,drug also reduces catecholamine secretion ther...,background,0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
3,110.0,46.0,clustering lowly aggressive close kin king ab ...,background,3,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
4,239.0,234.0,ophthalmic symptom rare manifestation intracra...,background,2,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


- TF-IDF Vectorization of 'string' column

In [19]:
# Initialize the vectorizer
vectorizer = TfidfVectorizer(max_features=5000)  # Limit to top 5000 features

# Fit and transform the 'string' column
X_tfidf = vectorizer.fit_transform(df["string"])

# Convert the sparse matrix to a DataFrame
tfidf_df = pd.DataFrame(X_tfidf.toarray(), columns=vectorizer.get_feature_names_out(), index=df.index)

# Rename columns of the TF-IDF DataFrame to avoid overlap (add '_tfidf' suffix)
tfidf_df.columns = [col + '_tfidf' for col in tfidf_df.columns]

# Drop the original 'string' column
df.drop(columns=['string'], inplace=True)

# Concatenate the original DataFrame with the TF-IDF DataFrame
df = pd.concat([df, tfidf_df], axis=1)

# Display the updated DataFrame
df.head()

Unnamed: 0,citeEnd,citeStart,label,excerpt_index,source_acronym,source_acronymParen,source_andPhrase,source_etAlPhrase,source_explicit,source_properNoun,...,zheng_tfidf,zhou_tfidf,zhu_tfidf,zinc_tfidf,zn_tfidf,zone_tfidf,äì_tfidf,ðþ_tfidf,βarr_tfidf,μm_tfidf
0,175.0,168.0,background,11,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,36.0,16.0,background,2,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,228.0,225.0,background,0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,110.0,46.0,background,3,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,239.0,234.0,background,2,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### 2.6 Label encoding for 'label' column

In [20]:
# Initialize Label Encoder
label_encoder = LabelEncoder()

# Encode labels directly in the 'label' column
df["label"] = label_encoder.fit_transform(df["label"])
joblib.dump(label_encoder, 'scicite_preprocessed/label_encoder.pkl')

# Show label mappings
label_mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
print("Labels:", label_mapping)

Labels: {'background': 0, 'method': 1, 'result': 2}


## Save processed dataset

In [21]:
df.to_csv("scicite_preprocessed/" + dataset + ".csv", index=False)
# scipy.sparse.save_npz("scicite_preprocessed/" + dataset + "_tfidf_matrix.npz", X_tfidf)

# TODO: ML pipeline

- [x] Clean data
- [ ] Upsampling
- [x] Word vectorization
- [ ] Word embedding
- [ ] PCA 
- [ ] top-k features selection
- [ ] Training the classfication model