## <center> Data Preprocessing </center>

In [1]:
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from tqdm import tqdm

In [2]:
%run 00_lib_preprocessing.ipynb
%run 00_lib_sqlwriter.ipynb

### Class Library

### Load data

In [3]:
data_ingestion = DataIngestion_MySQL()
text_preprocessor = TextPreprocessor()

volumes_df, archives_df, articles_df, contents_df, authors_df, authors_articles_df = data_ingestion.load_data()

Table 'volumes' is ready.
Table 'archives' is ready.
Table 'articles' is ready.
Table 'contents' is ready.
Table 'authors' is ready.
Table 'authors_articles' is ready.
[INFO] Loading data files...


In [4]:
display(volumes_df.head(1))
display(archives_df.head(1))
display(articles_df.head(1))
display(contents_df.head(1))
display(authors_df.head(1))

Unnamed: 0,archive_url,volume_number,archive_title,archive_title_clean,archive_publication_date,editor,import_date,status
0,https://firstmonday.org/ojs/index.php/fm/issue...,"Volume 4, Number 1 - 4 January 1999",,,1999-01-04,,2025-03-12 18:27:25,COMPLETED


Unnamed: 0,article_url,article_title,article_title_clean,doi,article_publication_date,author,author_clean,keyword,abstract,abstract_clean,archive_url,content_url,import_date,status,lang,abstract_clean_en
0,https://firstmonday.org/ojs/index.php/fm/artic...,The Lives and Death of Moore's Law,the lives and death of moore's law,https://doi.org/10.5210/fm.v7i11.1000,2002-11-04,Ilkka Tuomi,Ilkka Tuomi,,Moore's Law has been an important benchmark f...,moore's law has been an important benchmark f...,https://firstmonday.org/ojs/index.php/fm/issue...,https://firstmonday.org/ojs/index.php/fm/artic...,2025-03-13 15:28:07,COMPLETED,en,moore's law has been an important benchmark f...


Unnamed: 0,content_url,iframe_url,content,content_clean,lang,content_clean_en
0,https://firstmonday.org/ojs/index.php/fm/artic...,https://firstmonday.org/ojs/index.php/fm/artic...,The lives and death of Moore's Law\nMoore’s La...,the lives and death of moore's law moore s law...,en,


### Extract Volumes

In [5]:
import re

# Functions to extract special issue and volume name

def extract_volume_name(text):
    # Extracting volume name
    # Sample "Volume 1, Number 1 - 6 May 1996"

    # Extract volume name
    text_split = text.split('Number')
    if len(text_split) > 1:
        volume_name = text_split[0].strip().replace(',', '').replace('.', '')
        # print(f"Volume name: {volume_name}")

    # Extract year information
    year_split = text.split(';')
    # volume_year = re.search(r'\d{4}', year_split)
    if len(year_split)>1:
        volume_year =  year_split[-1]
        # print(f"Year: {volume_year}")
    return volume_name + ', ' + volume_year

def extract_special_issue(text):
    # Extracting special issue
    # Sample "Special Issue #1: Music and the Internet — 4 July 2005"
    text_split = text.split(':')
    if len(text_split) > 1:
        issue_name = text_split[0].strip()
        # print(f"Special Issue: {issue_name}")
    
    # Extract year information
    year_split = text.split(';')
    # issue_year = re.search(r'\d{4}', year_split)
    if len(year_split)>1:
        issue_year =  year_split[-1]
        # print(f"Year: {issue_year}")
    return issue_name + ', ' + issue_year

def split_volume(text):
    parts = text.split(',')
    volume_name = parts[0].strip() if len(parts) > 0 else ""
    volume_year = parts[1].strip() if len(parts) > 1 else ""
    return volume_name, volume_year

# Get unique special issue or volume names
archives_df['unique_volume_number'] = archives_df['volume_number'] + ';' + archives_df['archive_publication_date'].astype(str).str[:4]
archives_df['unique_volume_number'] = archives_df['unique_volume_number'].apply(
    lambda x: extract_volume_name(x) if "Volume" in x else extract_special_issue(x)
)
volume_numbers = archives_df['unique_volume_number']
unique_volume_number = list(set(volume_numbers))

# Create a new DataFrame from the volumes list
volumes_df = pd.DataFrame(unique_volume_number, columns=['volume_number'])
volumes_df['volume_name'] = volumes_df['volume_number'].apply(lambda x: split_volume(x)[0])
volumes_df['volume_year'] = volumes_df['volume_number'].apply(lambda x: split_volume(x)[1])
print("Number of volumes:", len(volumes_df))
display(volumes_df.head(2))

# Write to database
data_ingestion.mysql_writer.insert_volumes(volumes_df)

# Export to Excel
# volumes_df.to_excel('data/volumes.xlsx', index=False)
# print("Volumes has been exported to 'volumes.xlsx'.")

Number of volumes: 39


Unnamed: 0,volume_number,volume_name,volume_year
0,"Volume 19, 2014",Volume 19,2014
1,"Special Issue #8, 2007",Special Issue #8,2007


Inserted volume_number: Volume 19, 2014
Inserted volume_number: Special Issue #8, 2007
Inserted volume_number: Volume 25, 2020
Inserted volume_number: Volume 22, 2017
Inserted volume_number: Volume 14, 2009
Inserted volume_number: Volume 18, 2013
Inserted volume_number: Volume 29, 2024
Inserted volume_number: Volume 23, 2018
Inserted volume_number: Special Issue #7, 2006
Inserted volume_number: Volume 11, 2006
Inserted volume_number: Volume 7, 2002
Inserted volume_number: Special Issue #5, 2006
Inserted volume_number: Special Issue #1, 2005
Inserted volume_number: Volume 24, 2019
Inserted volume_number: Volume 10, 2005
Inserted volume_number: Volume 8, 2003
Inserted volume_number: Volume 6, 2001
Inserted volume_number: Special Issue #6, 2006
Inserted volume_number: Volume 5, 2000
Inserted volume_number: Special Issue #3, 2005
Inserted volume_number: Volume 17, 2012
Inserted volume_number: Special Issue #4, 2006
Inserted volume_number: Volume 20, 2015
Inserted volume_number: Volume 1, 1

### Extract Authors

In [6]:
# Explode Authors

author_article_df = pd.DataFrame()
author_article_df = articles_df[['author','article_url']].copy()
author_article_df.rename(columns={'author': 'author_name'}, inplace=True)
# author_article_df.tail()

# Split authors into lists
author_article_df['author_name'] = author_article_df['author_name'].str.split(', ')

# Explode the authors column into separate rows
author_article_exploded_df = author_article_df.explode('author_name').reset_index(drop=True)

# Clean author
author_article_exploded_df['author_name'] = (author_article_exploded_df['author_name'].str.replace(r'\.', '', regex=True)
                                             .str.replace(r'\s+', ' ', regex=True)  # This replaces multiple spaces with one
                                             .str.strip()  # Remove leading/trailing spaces
                                             )
print(author_article_exploded_df)

# Write to database
data_ingestion.mysql_writer.create_table_authors_articles()
data_ingestion.mysql_writer.insert_authors_articles(author_article_exploded_df)

              author_name                                        article_url
0             Ilkka Tuomi  https://firstmonday.org/ojs/index.php/fm/artic...
1            Pawel Popiel  https://firstmonday.org/ojs/index.php/fm/artic...
2        Mareile Kaufmann  https://firstmonday.org/ojs/index.php/fm/artic...
3            Maura Conway  https://firstmonday.org/ojs/index.php/fm/artic...
4              Paul Bocij  https://firstmonday.org/ojs/index.php/fm/artic...
...                   ...                                                ...
4974      Anthony Ralston  https://firstmonday.org/ojs/index.php/fm/artic...
4975  Guillermo Hernandez  https://firstmonday.org/ojs/index.php/fm/artic...
4976           Miles Dyck  https://firstmonday.org/ojs/index.php/fm/artic...
4977    M Derek MacKenzie  https://firstmonday.org/ojs/index.php/fm/artic...
4978     Sylvie A Quideau  https://firstmonday.org/ojs/index.php/fm/artic...

[4979 rows x 2 columns]
Table 'authors_articles' is ready.
Inserted author_

### Extract Unique Authors

In [7]:
import unicodedata

# Create author_df with unique author names only
author_df = pd.DataFrame(
    author_article_exploded_df['author_name']
    .dropna()  # Drop NaNs
    .loc[author_article_exploded_df['author_name'].str.strip() != '']  # Drop empty strings
    .astype(str)  # Convert any floats (NaNs) to 'nan' string
    .apply(lambda x: unicodedata.normalize('NFKD', x).encode('ASCII', 'ignore').decode('utf-8') if x != 'nan' else '')
    .str.replace(r'\s+', ' ', regex=True)  # Replace multiple spaces with one
    .str.strip() # Remove leading/trailing spaces
    .str.title() # To proper case    
    .unique(),
    columns=['author_name']
)

author_df = author_df.sort_values(by='author_name')
print(author_df)

             author_name
3597            A Dedeke
1853           A Mahmood
3196  A Michael Froomkin
1180      Aaron Delwiche
2447      Aaron K Martin
...                  ...
504          Zeyno Ustun
500        Zilia Estrada
3392         Ziqian Song
2052   Zizi Papacharissi
2749         Zubair Nabi

[3743 rows x 1 columns]


In [8]:
# Write to database
data_ingestion.mysql_writer.insert_authors(author_df)

Inserted author name: A Dedeke
Inserted author name: A Mahmood
Inserted author name: A Michael Froomkin
Inserted author name: Aaron Delwiche
Inserted author name: Aaron K Martin
Inserted author name: Aaron Krowne
Inserted author name: Aaron Strauss
Inserted author name: Aaron Trammell
Inserted author name: Abbe E Forman
Inserted author name: Abbie Monaco
Inserted author name: Abby Smith
Inserted author name: Abdullateef Mohammed
Inserted author name: Abeba Birhane
Inserted author name: Abebe Rorissa
Inserted author name: Abigail Curlew
Inserted author name: Abigail Moreshead
Inserted author name: Abigail Sellen
Inserted author name: Abigail Yao
Inserted author name: Adam Rifkin
Inserted author name: Adam Senft
Inserted author name: Adeline Tay
Inserted author name: Adem Yilmaz
Inserted author name: Adeola Abdulateef Elega
Inserted author name: Adina Gitomer
Inserted author name: Aditi Biswas
Inserted author name: Aditya Johri
Inserted author name: Adriaan Van Der Weel
Inserted author n