In [7]:
import re
import pandas as pd

In [8]:
# read data
df = pd.read_csv('df_combined.csv')
#pd.set_option('display.max_colwidth', None)
df['title'].head()

0    Contrastive autoencoder for anomaly detection ...
1    Detection of abnormality in wireless capsule e...
2      Single image dehazing using a new color channel
3    Estimating Tukey depth using incremental quant...
4    Advanced Passive Operating System Fingerprinti...
Name: title, dtype: object

In [9]:
df = df.fillna('missing')
df.head()

Unnamed: 0,title,authors,year_published,number,volume,journal,type,content,doi,file
0,Contrastive autoencoder for anomaly detection ...,Hao Zhou and Ke Yu and Xuan Zhang and Guanlin ...,2022,missing,610,Information Sciences,article,Contrastive autoencoder for anomaly detection ...,https://doi.org/10.1016/j.ins.2022.07.179,doc16
1,Detection of abnormality in wireless capsule e...,Samir Jain and Ayan Seal and Aparajita Ojha an...,2020,missing,127,Computers in Biology and Medicine,article,Computers in Biology and Medicine 127 (2020) 1...,https://doi.org/10.1016/j.compbiomed.2020.104094,doc17
2,Single image dehazing using a new color channel,Geet Sahu and Ayan Seal and Ondrej Krejcar and...,2021,missing,74,Journal of Visual Communication and Image Repr...,article,J. Vis. Commun. Image R. 74 (2021) 103008\nAva...,https://doi.org/10.1016/j.jvcir.2020.103008,doc15
3,Estimating Tukey depth using incremental quant...,Hugo L. Hammer and Anis Yazidi and Håvard Rue,2022,missing,122,Pattern Recognition,article,Pattern Recognition 122 (2022) 108339 \nConten...,https://doi.org/10.1016/j.patcog.2021.108339,doc29
4,Advanced Passive Operating System Fingerprinti...,"Hagos, Desta Haileselassie and Løland, Martin ...",2020,missing,missing,missing,inproceedings,Advanced Passive Operating System Fingerprinti...,10.1109/ICCCN49398.2020.9209694,doc28


In [10]:
df['content'].str.contains(r'(?i)\babstract\b', regex=True)


0     False
1     False
2     False
3     False
4      True
5      True
6      True
7      True
8     False
9      True
10     True
11    False
12    False
13    False
14    False
15     True
16     True
17    False
18    False
19    False
20    False
21    False
22     True
23     True
24    False
25    False
26     True
27    False
28     True
29    False
30     True
31     True
32     True
33     True
34    False
35     True
36    False
Name: content, dtype: bool

In [11]:
# Function to extract the abstract
def extract_abstract(text):
    match = re.search(r'(?i)\babstract\b(.*?)(?=\b(introduction|keywords|1\.|I\.)\b|$)', text, re.DOTALL)
    return match.group(1).strip() if match else None

# Keep only documents that have an abstract
df = df[df['content'].str.contains(r'(?i)\babstract\b', regex=True)].copy()

# Extract the abstract
df['abstract'] = df['content'].apply(extract_abstract)

# Remove the abstract from the content column
df['content'] = df.apply(lambda row: row['content'].replace(row['abstract'], '', 1) if row['abstract'] else row['content'], axis=1)
df

Unnamed: 0,title,authors,year_published,number,volume,journal,type,content,doi,file,abstract
4,Advanced Passive Operating System Fingerprinti...,"Hagos, Desta Haileselassie and Løland, Martin ...",2020,missing,missing,missing,inproceedings,Advanced Passive Operating System Fingerprinti...,10.1109/ICCCN49398.2020.9209694,doc28,"—Securing and managing large, complex enterpri..."
5,Artificial intelligence in the fertility clini...,"Riegler, M A and Stensen, M H and Witczak, O a...",2021,9.0,36,Human Reproduction,article,.................................................,10.1093/humrep/deab168,doc14,": In recent years, the amount of data produced..."
6,EvoDynamic: A Framework for the Evolution of G...,"Pontes-Filho, Sidney\nand Lind, Pedro\nand Yaz...",2020,missing,missing,missing,inproceedings,EvoDynamic: a framework for the evolution of\n...,missing,doc10,. Dynamical systems possess a computational ca...
7,A general representation of dynamical systems ...,Sidney Pontes{-}Filho and\nAnis Yazidi and\nJi...,2019,missing,abs/1907.01856,CoRR,article,A general representation of dynamical systems ...,missing,doc11,—Dynamical systems are capable of performing c...
9,Exploring Multilingual Word Embedding Alignmen...,"Aaby, Pernille\nand Biermann, Daniel\nand Yazi...",2023,missing,missing,missing,inproceedings,"\n \n \n \nAccepted manuscript \nAaby, P., Bi...",missing,doc12,". Contextual language models, such as transfor..."
10,A Deep Learning-Based Tool for Automatic Brain...,"Pontes-Filho, Sidney\nand Dahl, Annelene Gulde...",2022,missing,missing,missing,inproceedings,A deep learning based tool for automatic brain...,missing,doc9,—Removing skull artifacts from functional magn...
15,Genetic Algorithms For Tightening Security,"Palumbo, Fabrizio and Buji, Adam and Yazidi, A...",2022,missing,missing,missing,inproceedings,Genetic Algorithms For Tightening Security\n1s...,10.23919/WMNC56391.2022.9954297,doc7,—Proper conﬁguration of operating systems and ...
16,How (not to) Run an AI Project in Investigativ...,"M. Fridman, R. Krøvel and F. Palumbo",2023,0.0,0,Journalism Practice,article,Journalism Practice\nISSN: (Print) (Online) Jo...,10.1080/17512786.2023.2253797,doc3,Data journalists are increasingly reliant on a...
22,A New Adaptive Mixture Distance-Based Improved...,"Sharma, Krishna Kumar and Seal, Ayan and Yazid...",2022,missing,71,IEEE Transactions on Instrumentation and Measu...,article,IEEE TRANSACTIONS ON INSTRUMENTATION AND MEASU...,10.1109/TIM.2022.3216366,doc22,—With the rapid development of sensors and mec...
23,A sensitivity analysis of cellular automata an...,"Tom Eivind Glover, Ruben Jahren, Francesco Mar...",2025,1.0,40,"International Journal of Parallel, Emergent an...",article,"International Journal of Parallel, Emergent an...",10.1080/17445760.2024.2396334,doc34,Elementary Cellular Automata (ECA) are well-st...


In [16]:
def clean_text(text):
    '''Cleans content column'''

    # remove journal homepage URLs
    text = re.sub(r'https?://\S+', '', text) 

    # remove ISSN numbers
    text = re.sub(r'ISSN:\s*\(.*?\)', '', text)

    # remove copyright/license info
    text = re.sub(r'©\s*\d{4}.*?license.*', '', text, flags=re.IGNORECASE)

    # remove unwanted "Downloaded from" lines or publication data
    text = re.sub(r'Downloaded from .+? on \w+ \d{2}, \d{4}', '', text)

    # remove any reference to URLs or links in text that aren’t essential
    text = re.sub(r'\bwww\.\S+', '', text) 

    # remove unwanted citations and references (vurder å fjerne dette)
    text = re.sub(r'REFERENCES AND NOTES.*', '', text, flags=re.DOTALL)  # remove references and notes section
    text = re.sub(r'\(\d+\)', '', text)  # remove inline citations 
    text = re.sub(r'\*\*P.*?Wilcoxon.*?\n', '', text)  # remove statistical results

    # normalize spaces and remove unwanted newlines
    text = re.sub(r'\s+', ' ', text).strip()

    return text

In [18]:
# apply cleaning
df["cleaned_content"] = df["content"].apply(clean_text)
df["cleaned_abstract"] = df["abstract"].apply(clean_text)

In [19]:
#pd.set_option('display.max_colwidth', None)
df[['cleaned_content']].head()

Unnamed: 0,cleaned_content
4,Advanced Passive Operating System Fingerprinti...
5,.................................................
6,EvoDynamic: a framework for the evolution of g...
7,A general representation of dynamical systems ...
9,"Accepted manuscript Aaby, P., Biermann, D., Ya..."


In [20]:
# save cleaned df to csv file
df.to_csv('df_cleaned.csv', index=False)