In [1]:
import pandas as pd
import re

In [11]:
# read data
df = pd.read_csv('df_combined.csv')
pd.set_option('display.max_colwidth', None)
df['title'].head(30)

0                                                                                                  Contrastive autoencoder for anomaly detection in multivariate time series
1                                                                                       Detection of abnormality in wireless capsule endoscopy images using fractal features
2                                                                                                                            Single image dehazing using a new color channel
3                                                                                                               Estimating Tukey depth using incremental quantile estimators
4                                                                                  Advanced Passive Operating System Fingerprinting Using Machine Learning and Deep Learning
5                                                                                        Artificial intelligence in the fertility clini

In [3]:
df[['content']].head(8)

Unnamed: 0,content
0,Contrastive autoencoder for anomaly detection ...
1,Computers in Biology and Medicine 127 (2020) 1...
2,J. Vis. Commun. Image R. 74 (2021) 103008\nAva...
3,Pattern Recognition 122 (2022) 108339 \nConten...
4,Advanced Passive Operating System Fingerprinti...
5,.................................................
6,EvoDynamic: a framework for the evolution of\n...
7,A general representation of dynamical systems ...


In [4]:
df.fillna('missing')
df.head()

Unnamed: 0,title,authors,year_published,number,volume,journal,type,content,doi,file
0,Contrastive autoencoder for anomaly detection ...,Hao Zhou and Ke Yu and Xuan Zhang and Guanlin ...,2022,,610.0,Information Sciences,article,Contrastive autoencoder for anomaly detection ...,https://doi.org/10.1016/j.ins.2022.07.179,doc16
1,Detection of abnormality in wireless capsule e...,Samir Jain and Ayan Seal and Aparajita Ojha an...,2020,,127.0,Computers in Biology and Medicine,article,Computers in Biology and Medicine 127 (2020) 1...,https://doi.org/10.1016/j.compbiomed.2020.104094,doc17
2,Single image dehazing using a new color channel,Geet Sahu and Ayan Seal and Ondrej Krejcar and...,2021,,74.0,Journal of Visual Communication and Image Repr...,article,J. Vis. Commun. Image R. 74 (2021) 103008\nAva...,https://doi.org/10.1016/j.jvcir.2020.103008,doc15
3,Estimating Tukey depth using incremental quant...,Hugo L. Hammer and Anis Yazidi and Håvard Rue,2022,,122.0,Pattern Recognition,article,Pattern Recognition 122 (2022) 108339 \nConten...,https://doi.org/10.1016/j.patcog.2021.108339,doc29
4,Advanced Passive Operating System Fingerprinti...,"Hagos, Desta Haileselassie and Løland, Martin ...",2020,,,,inproceedings,Advanced Passive Operating System Fingerprinti...,10.1109/ICCCN49398.2020.9209694,doc28


In [5]:
def clean_text(text):
    '''Cleans content column'''

    # remove journal homepage URLs
    text = re.sub(r'https?://\S+', '', text)

    # remove ISSN numbers
    text = re.sub(r'ISSN:\s*\(.*?\)', '', text)

    # remove copyright/license info (e.g., CC BY-NC-ND, © info)
    text = re.sub(r'©\s*\d{4}.*?license.*', '', text, flags=re.IGNORECASE)

    # fix common OCR errors
    #text = text.replace("ﬆ", "st").replace("deﬁning", "defining").replace("journaliﬆs", "journalists")
    #text = text.replace("con- tained", "contained").replace("vol- ume", "volume")  # More examples
    
    # remove unwanted "Downloaded from" lines or publication data
    text = re.sub(r'Downloaded from .+? on \w+ \d{2}, \d{4}', '', text)

    # normalize spaces and remove unwanted newlines
    text = re.sub(r'\s+', ' ', text).strip()

    return text

In [6]:
# apply cleaning
df["cleaned_content"] = df["content"].apply(clean_text)

In [7]:
#pd.set_option('display.max_colwidth', None)
df[['cleaned_content']].head(8)

Unnamed: 0,cleaned_content
0,Contrastive autoencoder for anomaly detection ...
1,Computers in Biology and Medicine 127 (2020) 1...
2,J. Vis. Commun. Image R. 74 (2021) 103008 Avai...
3,Pattern Recognition 122 (2022) 108339 Contents...
4,Advanced Passive Operating System Fingerprinti...
5,.................................................
6,EvoDynamic: a framework for the evolution of g...
7,A general representation of dynamical systems ...


In [8]:
# save cleaned df to csv file
df.to_csv('df_cleaned.csv', index=False)