## <center> Data Preprocessing </center>

In [1]:
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from tqdm import tqdm

In [2]:
# Execute the two helper notebooks before anything else in this notebook runs.
# NOTE(review): DataIngestion_MySQL and TextPreprocessor (used in the next cell)
# are not imported anywhere in this notebook, so they are presumably defined by
# these helper notebooks — confirm against their contents.
%run 00_lib_preprocessing.ipynb
%run 00_lib_sqlwriter.ipynb

### Class Library

### Load data

In [3]:
# Build the two helper objects used throughout the notebook. Neither class is
# imported in the import cell, so both names must already exist in the kernel.
data_ingestion = DataIngestion_MySQL()
text_preprocessor = TextPreprocessor()

# load_data() returns six frames; unpack them in the order they are returned.
(
    volumes_df,
    archives_df,
    articles_df,
    contents_df,
    authors_df,
    authors_articles_df,
) = data_ingestion.load_data()

Table 'volumes' is ready.
Table 'archives' is ready.
Table 'articles' is ready.
Table 'contents' is ready.
Table 'authors' is ready.
Table 'authors_articles' is ready.
[INFO] Loading data files...


In [4]:
# Show the first row of each loaded frame, in the same order as before:
# archives, articles, contents, authors, authors_articles.
display(
    archives_df.head(1),
    articles_df.head(1),
    contents_df.head(1),
    authors_df.head(1),
    authors_articles_df.head(1),
)

Unnamed: 0,archive_url,volume_number,archive_title,archive_title_clean,archive_publication_date,editor,import_date,status
0,https://firstmonday.org/ojs/index.php/fm/issue...,"Volume 4, Number 1 - 4 January 1999",,,1999-01-04,,2025-03-12 18:27:25,COMPLETED


Unnamed: 0,article_url,article_title,article_title_clean,doi,article_publication_date,author,author_clean,keyword,abstract,abstract_clean,archive_url,content_url,import_date,status,lang,abstract_clean_en
0,https://firstmonday.org/ojs/index.php/fm/artic...,The Lives and Death of Moore's Law,the lives and death of moore's law,https://doi.org/10.5210/fm.v7i11.1000,2002-11-04,Ilkka Tuomi,,,Moore's Law has been an important benchmark f...,,https://firstmonday.org/ojs/index.php/fm/issue...,https://firstmonday.org/ojs/index.php/fm/artic...,2025-03-13 15:28:07,COMPLETED,en,moore's law has been an important benchmark f...


Unnamed: 0,content_url,iframe_url,content,content_clean,lang,content_clean_en
0,https://firstmonday.org/ojs/index.php/fm/artic...,https://firstmonday.org/ojs/index.php/fm/artic...,The lives and death of Moore's Law\nMoore’s La...,the lives and death of moore's law moore s law...,en,


### Preprocess Archives

In [5]:
# Run the archive-title cleaning step provided by the ingestion helper.
archives_df = data_ingestion.clean_archive_titles(archives_df)

# Keep only the archives whose raw title is not the empty string, then preview one.
has_title = archives_df['archive_title'].ne('')
filtered_archives_df = archives_df.loc[has_title]
display(filtered_archives_df.head(1))

Unnamed: 0,archive_url,volume_number,archive_title,archive_title_clean,archive_publication_date,editor,import_date,status
108,https://firstmonday.org/ojs/index.php/fm/issue...,"Volume 12, Number 6 — 4 June 2007",Special Issue: Designing Cyberinfrastructure,special issue: designing cyberinfrastructure,2007-06-04,,2025-03-12 18:32:40,COMPLETED


### Preprocess Articles

In [6]:
# Apply the four article cleaning steps one after another; each step receives
# the frame returned by the previous one.
articles_df = (
    articles_df
    .pipe(data_ingestion.clean_articles)
    .pipe(data_ingestion.clean_article_abstracts)
    .pipe(data_ingestion.clean_article_titles)
    .pipe(data_ingestion.clean_article_authors)
)
articles_df.head(1)

[INFO] Cleaning articles...


Unnamed: 0,article_url,article_title,article_title_clean,doi,article_publication_date,author,author_clean,keyword,abstract,abstract_clean,archive_url,content_url,import_date,status,lang,abstract_clean_en
0,https://firstmonday.org/ojs/index.php/fm/artic...,The Lives and Death of Moore's Law,the lives and death of moore's law,https://doi.org/10.5210/fm.v7i11.1000,2002-11-04,Ilkka Tuomi,Ilkka Tuomi,,Moore's Law has been an important benchmark f...,moore's law has been an important benchmark f...,https://firstmonday.org/ojs/index.php/fm/issue...,https://firstmonday.org/ojs/index.php/fm/artic...,2025-03-13 15:28:07,COMPLETED,en,moore's law has been an important benchmark f...


In [7]:
# Enable `progress_apply` on pandas objects and label the progress bar.
tqdm.pandas(desc="Preprocessing texts")

# Pass every cleaned abstract through the text preprocessor and write the result
# back to the same column. The original note describes this step as removing
# special characters; the actual rules live in TextPreprocessor.preprocess_text.
abstracts_clean = articles_df['abstract_clean']
articles_df['abstract_clean'] = abstracts_clean.progress_apply(text_preprocessor.preprocess_text)

Preprocessing texts: 100%|██████████| 2677/2677 [00:00<00:00, 15267.30it/s]


In [8]:
# Keep only rows whose 'abstract_clean' is present (not NaN) and not blank
# after stripping whitespace, then renumber the index from 0.
abstracts = articles_df['abstract_clean']
has_abstract = abstracts.notna() & (abstracts.str.strip() != '')
articles_df = articles_df[has_abstract].reset_index(drop=True)

# Detect the language of each remaining abstract.
# NOTE(review): wrapping the result in pd.Series makes `apply` build a DataFrame
# that is then assigned to a single column; if detect_language returns a plain
# string the wrapper is likely unnecessary — confirm before removing it.
articles_df['lang'] = articles_df['abstract_clean'].apply(
    lambda text: pd.Series(text_preprocessor.detect_language(text))
)

In [9]:
def _translate_abstract(row):
    """Translate one row's cleaned abstract, passing its detected language first."""
    return text_preprocessor.translate_to_english(row['lang'], row['abstract_clean'])


# Row-wise apply: each row supplies both the language and the text to translate.
articles_df['abstract_clean_en'] = articles_df.apply(_translate_abstract, axis=1)

In [10]:
# Hand the processed articles frame to the ingestion helper's writer so the
# database is updated, then preview the first row of what was sent.
article_writer = data_ingestion.mysql_writer
article_writer.update_article_clean(articles_df)
display(articles_df.head(1))

Articles preprocessing is completed


Unnamed: 0,article_url,article_title,article_title_clean,doi,article_publication_date,author,author_clean,keyword,abstract,abstract_clean,archive_url,content_url,import_date,status,lang,abstract_clean_en
0,https://firstmonday.org/ojs/index.php/fm/artic...,The Lives and Death of Moore's Law,the lives and death of moore's law,https://doi.org/10.5210/fm.v7i11.1000,2002-11-04,Ilkka Tuomi,Ilkka Tuomi,,Moore's Law has been an important benchmark f...,moore's law has been an important benchmark f...,https://firstmonday.org/ojs/index.php/fm/issue...,https://firstmonday.org/ojs/index.php/fm/artic...,2025-03-13 15:28:07,COMPLETED,en,moore's law has been an important benchmark f...


### Preprocess Contents

In [11]:
# Run the content cleaning step provided by the ingestion helper.
contents_df = data_ingestion.clean_article_contents(contents_df)

# Enable `progress_apply` with a labelled bar, then pass every cleaned content
# text through the text preprocessor and write it back to the same column.
tqdm.pandas(desc="Preprocessing texts")
content_texts = contents_df['content_clean']
contents_df['content_clean'] = content_texts.progress_apply(text_preprocessor.preprocess_text)

Preprocessing texts: 100%|██████████| 2674/2674 [00:13<00:00, 196.33it/s]


In [12]:
# Update language value: attach each article's detected language to its content row.
#
# The narrowed views get their own names instead of overwriting `articles_df`
# and `contents_df`. Overwriting them dropped every other column, so re-running
# any earlier article cell afterwards failed with a KeyError on e.g. 'abstract_clean'.

# Language lookup keyed by content_url. Duplicate keys on the right side of a
# left merge would multiply content rows, so keep one language per content_url.
article_lang_df = (
    articles_df[['content_url', 'lang']]
    .drop_duplicates(subset='content_url', keep='first')
)

# Content columns to carry forward. The contents table's own 'lang' column is
# deliberately left out so the value merged in from the articles is the only one.
content_columns = ['content_url', 'iframe_url', 'content', 'content_clean', 'content_clean_en']

# Left merge keeps every content row; contents with no matching article (for
# example articles dropped earlier for having an empty abstract) get '' as
# their language via fillna, which also blanks any other missing values.
articles_contents_df = (
    contents_df[content_columns]
    .merge(article_lang_df, on='content_url', how='left')
    .fillna('')
)

# With unique lookup keys, a left merge cannot change the number of content rows.
assert len(articles_contents_df) == len(contents_df), "merge changed the number of content rows"

articles_contents_df.head(1)

Unnamed: 0,content_url,iframe_url,content,content_clean,content_clean_en,lang
0,https://firstmonday.org/ojs/index.php/fm/artic...,https://firstmonday.org/ojs/index.php/fm/artic...,The lives and death of Moore's Law\nMoore’s La...,the lives and death of moore's law moore s law...,,en


In [13]:
# Hand the merged content/language frame to the ingestion helper's writer so
# the database is updated.
content_writer = data_ingestion.mysql_writer
content_writer.update_content_clean(articles_contents_df)

Contents preprocessing is completed.
