In [6]:
import pandas as pd
import re

In [None]:
# read data
df = pd.read_csv('df_combined.csv')
df

Unnamed: 0,title,authors,year_published,number,volume,journal,type,content,doi
0,Introduction: Understanding Roots and Betweenn...,"Roy Krøvel, Fabrizio Palumbo and Kristin Skare...",2023,7.0,24.0,Journalism Studies,article,Journalism Studies\nISSN: (Print) (Online) Jou...,10.1080/1461670X.2023.2206494
1,Optimized protocol for conditioned place avoid...,Fabrizio Palumbo and Bram Serneels and Emre Yaksi,2021,2.0,2.0,STAR Protocols,article,Protocol\nOptimized protocol for conditioned p...,https://doi.org/10.1016/j.xpro.2021.100465
2,COST-WINNERS: COST reduction WIth Neural NEtwo...,Sven Myrdahl Opalic and Fabrizio Palumbo and M...,2023,,72.0,Journal of Energy Storage,article,Journal of Energy Storage 72 (2023) 108202\nAv...,https://doi.org/10.1016/j.est.2023.108202
3,Genetic Algorithms For Tightening Security,"Palumbo, Fabrizio and Buji, Adam and Yazidi, A...",2022,,,,inproceedings,Genetic Algorithms For Tightening Security\n1s...,10.23919/WMNC56391.2022.9954297
4,How (not to) Run an AI Project in Investigativ...,"M. Fridman, R. Krøvel and F. Palumbo",2023,0.0,0.0,Journalism Practice,article,Journalism Practice\nISSN: (Print) (Online) Jo...,10.1080/17512786.2023.2253797
5,Information processing in the vertebrate habenula,Stephanie Fore and Fabrizio Palumbo and Robbre...,2018,,78.0,Seminars in Cell & Developmental Biology,article,Seminars\n in\n Cell\n &\n Developmental\n Bio...,https://doi.org/10.1016/j.semcdb.2017.08.019
6,Functional properties of habenular neurons are...,Stephanie Fore and Francisca Acuña-Hinrichsen...,2020,36.0,6.0,Science Advances,article,"Fore et al., Sci. Adv. 2020; 6 : eaaz3173 ...",10.1126/sciadv.aaz3173


In [4]:
# replace missing values
df.fillna('missing')

Unnamed: 0,title,authors,year_published,number,volume,journal,type,content,doi
0,Introduction: Understanding Roots and Betweenn...,"Roy Krøvel, Fabrizio Palumbo and Kristin Skare...",2023,7.0,24.0,Journalism Studies,article,Journalism Studies\nISSN: (Print) (Online) Jou...,10.1080/1461670X.2023.2206494
1,Optimized protocol for conditioned place avoid...,Fabrizio Palumbo and Bram Serneels and Emre Yaksi,2021,2.0,2.0,STAR Protocols,article,Protocol\nOptimized protocol for conditioned p...,https://doi.org/10.1016/j.xpro.2021.100465
2,COST-WINNERS: COST reduction WIth Neural NEtwo...,Sven Myrdahl Opalic and Fabrizio Palumbo and M...,2023,missing,72.0,Journal of Energy Storage,article,Journal of Energy Storage 72 (2023) 108202\nAv...,https://doi.org/10.1016/j.est.2023.108202
3,Genetic Algorithms For Tightening Security,"Palumbo, Fabrizio and Buji, Adam and Yazidi, A...",2022,missing,missing,missing,inproceedings,Genetic Algorithms For Tightening Security\n1s...,10.23919/WMNC56391.2022.9954297
4,How (not to) Run an AI Project in Investigativ...,"M. Fridman, R. Krøvel and F. Palumbo",2023,0.0,0.0,Journalism Practice,article,Journalism Practice\nISSN: (Print) (Online) Jo...,10.1080/17512786.2023.2253797
5,Information processing in the vertebrate habenula,Stephanie Fore and Fabrizio Palumbo and Robbre...,2018,missing,78.0,Seminars in Cell & Developmental Biology,article,Seminars\n in\n Cell\n &\n Developmental\n Bio...,https://doi.org/10.1016/j.semcdb.2017.08.019
6,Functional properties of habenular neurons are...,Stephanie Fore and Francisca Acuña-Hinrichsen...,2020,36.0,6.0,Science Advances,article,"Fore et al., Sci. Adv. 2020; 6 : eaaz3173 ...",10.1126/sciadv.aaz3173


In [None]:
df[['content']].head(8)

In [7]:
def clean_text(text):
    '''Cleans content column'''

    # remove journal homepage URLs
    text = re.sub(r'https?://\S+', '', text)

    # remove ISSN numbers
    text = re.sub(r'ISSN:\s*\(.*?\)', '', text)

    # remove copyright/license info (e.g., CC BY-NC-ND, © info)
    text = re.sub(r'©\s*\d{4}.*?license.*', '', text, flags=re.IGNORECASE)

    # fix common OCR errors
    text = text.replace("ﬆ", "st").replace("deﬁning", "defining").replace("journaliﬆs", "journalists")
    text = text.replace("con- tained", "contained").replace("vol- ume", "volume")  # More examples
    
    # remove unwanted "Downloaded from" lines or publication data
    text = re.sub(r'Downloaded from .+? on \w+ \d{2}, \d{4}', '', text)

    # normalize spaces and remove unwanted newlines
    text = re.sub(r'\s+', ' ', text).strip()

    return text

# Apply cleaning function to the content column
df["cleaned_content"] = df["content"].apply(clean_text)


In [8]:
#pd.set_option('display.max_colwidth', None)
df[['cleaned_content']].head(8)

Unnamed: 0,cleaned_content
0,Journalism Studies (Online) Journal homepage: ...
1,Protocol Optimized protocol for conditioned pl...
2,Journal of Energy Storage 72 (2023) 108202 Ava...
3,Genetic Algorithms For Tightening Security 1st...
4,Journalism Practice (Online) Journal homepage:...
5,Seminars in Cell & Developmental Biology 78 (2...
6,"Fore et al., Sci. Adv. 2020; 6 : eaaz3173 4 Se..."


In [9]:
df

Unnamed: 0,title,authors,year_published,number,volume,journal,type,content,doi,cleaned_content
0,Introduction: Understanding Roots and Betweenn...,"Roy Krøvel, Fabrizio Palumbo and Kristin Skare...",2023,7.0,24.0,Journalism Studies,article,Journalism Studies\nISSN: (Print) (Online) Jou...,10.1080/1461670X.2023.2206494,Journalism Studies (Online) Journal homepage: ...
1,Optimized protocol for conditioned place avoid...,Fabrizio Palumbo and Bram Serneels and Emre Yaksi,2021,2.0,2.0,STAR Protocols,article,Protocol\nOptimized protocol for conditioned p...,https://doi.org/10.1016/j.xpro.2021.100465,Protocol Optimized protocol for conditioned pl...
2,COST-WINNERS: COST reduction WIth Neural NEtwo...,Sven Myrdahl Opalic and Fabrizio Palumbo and M...,2023,,72.0,Journal of Energy Storage,article,Journal of Energy Storage 72 (2023) 108202\nAv...,https://doi.org/10.1016/j.est.2023.108202,Journal of Energy Storage 72 (2023) 108202 Ava...
3,Genetic Algorithms For Tightening Security,"Palumbo, Fabrizio and Buji, Adam and Yazidi, A...",2022,,,,inproceedings,Genetic Algorithms For Tightening Security\n1s...,10.23919/WMNC56391.2022.9954297,Genetic Algorithms For Tightening Security 1st...
4,How (not to) Run an AI Project in Investigativ...,"M. Fridman, R. Krøvel and F. Palumbo",2023,0.0,0.0,Journalism Practice,article,Journalism Practice\nISSN: (Print) (Online) Jo...,10.1080/17512786.2023.2253797,Journalism Practice (Online) Journal homepage:...
5,Information processing in the vertebrate habenula,Stephanie Fore and Fabrizio Palumbo and Robbre...,2018,,78.0,Seminars in Cell & Developmental Biology,article,Seminars\n in\n Cell\n &\n Developmental\n Bio...,https://doi.org/10.1016/j.semcdb.2017.08.019,Seminars in Cell & Developmental Biology 78 (2...
6,Functional properties of habenular neurons are...,Stephanie Fore and Francisca Acuña-Hinrichsen...,2020,36.0,6.0,Science Advances,article,"Fore et al., Sci. Adv. 2020; 6 : eaaz3173 ...",10.1126/sciadv.aaz3173,"Fore et al., Sci. Adv. 2020; 6 : eaaz3173 4 Se..."
