In [None]:
import pandas as pd
import re

In [None]:
# Load the combined dataset from disk and preview its first rows.
df = pd.read_csv(filepath_or_buffer='df_combined.csv')
df.head(n=5)

In [None]:
# Preview the raw text column (first 8 rows) before any cleaning is applied.
df.loc[:, ['content']].head(n=8)

In [None]:
# DataFrame.fillna returns a new frame instead of modifying `df` in place, so the
# result has to be assigned back; otherwise the NaN cells stay in `df`.
df = df.fillna('missing')
df.head()

In [None]:
# Unicode ligature characters (U+FB00-U+FB06), treated here as OCR artifacts,
# mapped to their plain-letter spellings.
_LIGATURE_TABLE = str.maketrans({
    "\ufb00": "ff",
    "\ufb01": "fi",
    "\ufb02": "fl",
    "\ufb03": "ffi",
    "\ufb04": "ffl",
    "\ufb05": "st",
    "\ufb06": "st",
})


def clean_text(text):
    '''Cleans one value of the content column.

    Steps, in order: strip http(s) URLs, "ISSN: (...)" markers, copyright/license
    lines, ligature characters and two known split-hyphenation artifacts,
    "Downloaded from ... on <Month> <day>, <year>" stamps, and finally collapse
    every whitespace run to a single space.

    Parameters
    ----------
    text : str
        Raw text. Non-string values (e.g. None or a float NaN) are returned
        unchanged instead of raising inside ``re.sub``.

    Returns
    -------
    str
        The cleaned, whitespace-normalised text (or the input itself when it is
        not a string).
    '''
    # Missing cells are not strings; pass them through so the caller can decide
    # how to treat them.
    if not isinstance(text, str):
        return text

    # remove journal homepage URLs
    text = re.sub(r'https?://\S+', '', text)

    # remove ISSN markers of the form "ISSN: (...)"
    text = re.sub(r'ISSN:\s*\(.*?\)', '', text)

    # remove copyright/license info (e.g., CC BY-NC-ND, © info).
    # '.' does not match newlines here, so the match runs from "©" + year to the
    # end of the line on which "license" appears.
    text = re.sub(r'©\s*\d{4}.*?license.*', '', text, flags=re.IGNORECASE)

    # fix common OCR errors: expand every ligature character in one pass rather
    # than patching individual words
    text = text.translate(_LIGATURE_TABLE)
    # re-join the known words that were split by end-of-line hyphenation
    text = text.replace("con- tained", "contained").replace("vol- ume", "volume")

    # remove unwanted "Downloaded from" stamps; the day may have one or two digits
    text = re.sub(r'Downloaded from .+? on \w+ \d{1,2}, \d{4}', '', text)

    # normalize spaces and remove unwanted newlines
    text = re.sub(r'\s+', ' ', text).strip()

    return text

In [None]:
# Run clean_text over every entry of the raw column and store the result
# alongside it in a new column.
df["cleaned_content"] = df["content"].map(clean_text)

In [None]:
# To see the full text of each cell instead of a truncated preview, enable:
#pd.set_option('display.max_colwidth', None)
df.loc[:, ['cleaned_content']].head(n=8)

In [None]:
# Persist the frame, including the cleaned column, to CSV without the row index.
df.to_csv(path_or_buf='df_cleaned.csv', index=False)