In [None]:
import re
import pandas as pd

In [None]:
# Load the combined corpus from disk and preview the first few titles
df = pd.read_csv('df_combined.csv')
df['title'].head(n=5)

In [None]:
# Replace every missing value in the frame with the placeholder string 'missing',
# then preview the first rows
df = df.fillna(value='missing')
df.head(n=5)

In [None]:
# Boolean mask: True for each document whose content contains the standalone
# word "abstract" (the inline (?i) flag makes the match case-insensitive)
df['content'].str.contains(pat=r'(?i)\babstract\b', regex=True)


In [None]:
# Pattern used by extract_abstract. Flags are scoped per group so that only the
# section *words* are matched case-insensitively; the "1." / "I." section numbers
# stay case-sensitive (otherwise "i.e." would be taken for a roman-numeral heading).
_ABSTRACT_PATTERN = re.compile(
    r'(?i:\babstract\b)'                           # the "Abstract" heading itself
    r'(.*?)'                                       # group 1: abstract body (lazy)
    r'(?='                                         # stop in front of the next section:
    r'(?:\b(?:1|I)\.\s*)?(?i:\bintroduction\b)'    #   "Introduction", optionally numbered
    r'|(?i:\bkeywords\b)'                          #   "Keywords"
    r'|(?m:^[ \t]*(?:1|I)\.\s*(?=[A-Z]))'          #   a line opening with "1." / "I." + capital
    r'|$'                                          #   or the end of the text
    r')',
    re.DOTALL,                                     # abstracts may span several lines
)


# Function to extract the abstract
def extract_abstract(text):
    '''Return the abstract section of a document, or None if there is none.

    The abstract is everything after the first standalone word "abstract"
    (any letter case) up to, but not including, the first of:

    * the word "introduction" (with an optional "1." / "I." in front of it),
    * the word "keywords",
    * a line that starts with "1." or "I." followed by a capital letter,
    * the end of the text.

    Decimal numbers such as "1.5" and abbreviations such as "i.e." inside the
    abstract are NOT treated as section numbers.

    Parameters:
        text: full document text. Anything that is not a str yields None.

    Returns:
        The abstract with surrounding whitespace stripped (possibly an empty
        string when the heading is immediately followed by the next section),
        or None when the word "abstract" does not occur.
    '''
    if not isinstance(text, str):
        return None
    match = _ABSTRACT_PATTERN.search(text)
    return match.group(1).strip() if match else None

# Keep only documents that have an abstract.
# na=False makes a missing content value count as "no abstract" instead of
# producing a NaN entry in the mask (which cannot be used for boolean indexing).
# NOTE: this cell rebinds df and rewrites df['content'], so re-running it on its
# own operates on already-processed data; re-run from the loading cell instead.
has_abstract = df['content'].str.contains(r'(?i)\babstract\b', regex=True, na=False)
df = df[has_abstract].copy()

# Extract the abstract
df['abstract'] = df['content'].apply(extract_abstract)

# Remove the abstract from the content column: only the first occurrence of the
# extracted text is dropped, and rows with an empty/None abstract are left as is.
# A plain comprehension avoids the slow row-wise df.apply(axis=1) and also works
# when the filtered frame is empty.
df['content'] = [
    content.replace(abstract, '', 1) if abstract else content
    for content, abstract in zip(df['content'], df['abstract'])
]
df.head()

In [None]:
def clean_text(text):
    '''Strip publisher boilerplate from a text and collapse its whitespace.

    Removes, in this order: "Downloaded from ... on <Month> <day>, <year>"
    stamps, http(s) URLs, "ISSN: (...)" markers, copyright/license lines,
    "www." links, everything from "REFERENCES AND NOTES" onwards, all-digit
    parenthetical citations such as "(12)", and "**P ... Wilcoxon ..." lines.
    Finally every run of whitespace becomes a single space and the result is
    stripped.

    Parameters:
        text: the text to clean. Values that are not str (e.g. None or NaN)
            are returned unchanged.

    Returns:
        The cleaned text as a single line.
    '''
    # nothing to clean in a missing cell; hand it back untouched
    if not isinstance(text, str):
        return text

    # remove unwanted "Downloaded from" lines or publication data.
    # This has to run BEFORE the URL removal below: once the URL in
    # "Downloaded from https://... on March 05, 2023" is gone, nothing is left
    # between "from" and "on" for the pattern to match. Days may be 1 or 2 digits.
    text = re.sub(r'Downloaded from .+? on \w+ \d{1,2}, \d{4}', '', text)

    # remove journal homepage URLs
    text = re.sub(r'https?://\S+', '', text)

    # remove ISSN numbers
    text = re.sub(r'ISSN:\s*\(.*?\)', '', text)

    # remove copyright/license info (from the © sign to the end of that line)
    text = re.sub(r'©\s*\d{4}.*?license.*', '', text, flags=re.IGNORECASE)

    # remove any reference to URLs or links in text that aren't essential
    text = re.sub(r'\bwww\.\S+', '', text)

    # remove unwanted citations and references (consider removing this step)
    text = re.sub(r'REFERENCES AND NOTES.*', '', text, flags=re.DOTALL)  # drop the references and notes section and everything after it
    text = re.sub(r'\(\d+\)', '', text)  # remove inline citations; NOTE: also hits any other all-digit parenthetical, e.g. "(2020)"
    text = re.sub(r'\*\*P.*?Wilcoxon.*?\n', '', text)  # remove statistical results

    # normalize spaces and remove unwanted newlines
    text = re.sub(r'\s+', ' ', text).strip()

    return text

In [None]:
# Run clean_text over the body and the abstract, storing each result in a new column
for raw_column, cleaned_column in (("content", "cleaned_content"), ("abstract", "cleaned_abstract")):
    df[cleaned_column] = df[raw_column].apply(clean_text)

In [None]:
# Preview the first rows of the cleaned content (as a one-column DataFrame)
df.loc[:, ['cleaned_content']].head()

In [None]:
# Persist the cleaned frame as CSV, leaving the index out of the file
df.to_csv(path_or_buf='df_cleaned.csv', index=False)