# 02 - Data Cleaning

This notebook cleans the raw metadata based on EDA findings.

## Cleaning Steps
- Handle missing values
- Remove duplicates
- Standardize date formats
- Validate and normalize text fields
- Filter out unusable records

In [1]:
# Import required libraries
import sys
from pathlib import Path
import re

import pandas as pd
import numpy as np

# Make the parent of the current working directory importable, so the
# project-local `src` package resolves when imported below
project_root = Path.cwd().parent
root_entry = str(project_root)
if root_entry not in sys.path:
    sys.path.insert(0, root_entry)

from src.config import get_settings, ensure_directories

In [2]:
# Resolve the project settings, then read the raw metadata CSV they point to.
# NOTE(review): `ensure_directories` is project-local; presumably it creates the
# configured data directories — confirm in src/config.
settings = get_settings()
ensure_directories(settings)

data_path = settings.raw_data_dir / settings.raw_metadata_file
print(f"Loading raw data from: {data_path}")

df = pd.read_csv(filepath_or_buffer=data_path)
print(
    f"Loaded {len(df):,} records",
    f"Columns: {list(df.columns)}",
    sep="\n",
)

Loading raw data from: c:\Users\alifn\Code\topic-modeling-utama\data\raw\raw_metadata.csv
Loaded 12,647 records
Columns: ['identifier', 'title', 'abstract', 'authors', 'date', 'subjects', 'publisher', 'types', 'language', 'source']


In [3]:
# Remember the starting row count; the cleaning summary reports against it
initial_count = df.shape[0]
print(f"\nInitial record count: {initial_count:,}")


Initial record count: 12,647


## 1. Remove Duplicates

In [4]:
# Pass 1: drop rows that repeat an earlier row in every column
before = len(df)
exact_repeat = df.duplicated()
df = df[~exact_repeat]
after = len(df)
print(f"Removed {before - after:,} exact duplicates")

# Pass 2: drop rows whose identifier was already seen (first occurrence wins)
before = len(df)
identifier_repeat = df.duplicated(subset=['identifier'], keep='first')
df = df[~identifier_repeat]
after = len(df)
print(f"Removed {before - after:,} duplicate identifiers")

Removed 0 exact duplicates
Removed 0 duplicate identifiers


In [5]:
# Remove records whose title or abstract repeats an earlier record (first
# occurrence wins). This content-based de-duplication is ACTIVE, not optional.
#
# pandas treats missing values as equal to each other when detecting
# duplicates, so a plain drop_duplicates() would silently discard all but one
# record with a missing title/abstract and report them as "duplicates".
# Missing values are therefore excluded from the duplicate mask here and left
# for the dedicated missing-value steps to count and remove.

before = len(df)
repeated_title = df['title'].notna() & df.duplicated(subset=['title'], keep='first')
df = df[~repeated_title]
after = len(df)
print(f"Removed {before - after:,} duplicate titles")

before = len(df)
repeated_abstract = df['abstract'].notna() & df.duplicated(subset=['abstract'], keep='first')
df = df[~repeated_abstract]
after = len(df)
print(f"Removed {before - after:,} duplicate abstracts")

Removed 125 duplicate titles
Removed 434 duplicate abstracts


## 2. Handle Missing Values

In [6]:
# Report, per column, how many values are missing (columns with none are skipped)
print("Current missing values:")
for col, missing in df.isna().sum().items():
    if missing <= 0:
        continue
    print(f"  {col}: {missing:,} ({missing/len(df)*100:.1f}%)")

Current missing values:
  abstract: 1 (0.0%)
  authors: 7 (0.1%)
  subjects: 1,213 (10.0%)
  publisher: 89 (0.7%)
  types: 147 (1.2%)
  language: 2,335 (19.3%)
  source: 12,088 (100.0%)


In [7]:
# The title is an essential field: keep only rows where it is present and
# not blank after trimming whitespace
before = len(df)
has_title = df['title'].notna() & df['title'].str.strip().ne('')
df = df[has_title]
after = len(df)
print(f"Removed {before - after:,} records without title")

Removed 0 records without title


In [8]:
# Topic modeling needs abstract text: keep only rows where the abstract is
# present and not blank after trimming whitespace
before = len(df)
has_abstract = df['abstract'].notna() & df['abstract'].str.strip().ne('')
df = df[has_abstract]
after = len(df)
print(f"Removed {before - after:,} records without abstract")

Removed 1 records without abstract


In [9]:
# Non-essential fields keep their rows; missing entries get the placeholder
# 'Tidak Diketahui' (Indonesian for "Unknown")
for field in ('authors', 'subjects', 'publisher', 'types', 'language', 'source'):
    df[field] = df[field].fillna('Tidak Diketahui')

print("Filled missing values for non-essential fields")

Filled missing values for non-essential fields


## 3. Standardize Date Format

In [10]:
# Parse and standardize dates
def parse_date(date_str):
    """Parse a raw date value into a timestamp and its year.

    ``pd.to_datetime`` is tried first. If it cannot parse the value, the
    function falls back to the first standalone four-digit year (19xx/20xx)
    found in the text and uses 1 January of that year.

    Args:
        date_str: Raw date value; may be a string, a number, or missing.

    Returns:
        tuple: ``(timestamp, year)`` with a ``pd.Timestamp`` and an ``int``
        on success, or ``(None, None)`` when the value is missing, blank, or
        contains no recoverable date/year.
    """
    if pd.isna(date_str):
        return None, None

    date_str = str(date_str).strip()
    # Test for emptiness AFTER stripping so whitespace-only values count as missing
    if not date_str:
        return None, None

    # Try to parse as datetime. Only parsing failures are caught; a bare
    # `except:` would also swallow KeyboardInterrupt and genuine bugs.
    try:
        parsed = pd.to_datetime(date_str)
    except (ValueError, TypeError, OverflowError):
        parsed = None

    # to_datetime returns NaT (not an error) for strings such as "NaT";
    # treat that as "not parsed" instead of returning (NaT, nan)
    if parsed is not None and not pd.isna(parsed):
        return parsed, int(parsed.year)

    # Fallback: extract a year using regex. The lookarounds stop the pattern
    # from matching inside a longer digit run (e.g. "120199" is not 2019).
    year_match = re.search(r'(?<!\d)(?:19|20)\d{2}(?!\d)', date_str)
    if year_match:
        year = int(year_match.group())
        return pd.Timestamp(year=year, month=1, day=1), year

    return None, None

# Run the parser over every raw date; each result is a (timestamp, year) pair
parsed_dates = df['date'].apply(parse_date)
df['parsed_date'] = [timestamp for timestamp, _ in parsed_dates]
df['year'] = [year for _, year in parsed_dates]

# Share of rows for which a year could be recovered
valid_dates = df['year'].notna().sum()
print(f"Successfully parsed dates: {valid_dates:,} ({valid_dates/len(df)*100:.1f}%)")

Successfully parsed dates: 12,087 (100.0%)


In [11]:
# Records per year in chronological order; show only the ten latest years
print("\nYear distribution:")
year_counts = df['year'].value_counts()
year_counts = year_counts.sort_index()
print(year_counts.tail(n=10))


Year distribution:
year
2016    1041
2017     932
2018     845
2019     729
2020     795
2021    1515
2022     880
2023     327
2024     564
2025     385
Name: count, dtype: int64


## 4. Text Normalization

In [12]:
def normalize_text(text):
    """Normalize a free-text field to clean, single-spaced text.

    Whitespace runs (including newlines and tabs) become a single space,
    remaining control characters are deleted, and the result is trimmed.

    Args:
        text: Raw field value; may be a string, another scalar, or missing.

    Returns:
        str: The normalized text, or ``''`` when the input is missing or empty.
    """
    if pd.isna(text) or text == '':
        return ''

    text = str(text)

    # Turn every whitespace run into one space FIRST, so newlines/tabs act as
    # word separators rather than being deleted as control characters below
    text = re.sub(r'\s+', ' ', text)

    # Remove the remaining (non-whitespace) control characters
    text = re.sub(r'[\x00-\x1f\x7f-\x9f]', '', text)

    # Deleting a control character can leave two spaces side by side or a
    # space at either end ("a \x00 b" -> "a  b"), so collapse and trim last
    text = re.sub(r' {2,}', ' ', text).strip()

    return text

# Clean up each free-text column in turn
for field in ('title', 'abstract', 'authors', 'subjects'):
    df[field] = df[field].apply(normalize_text)

print("Applied text normalization to title, abstract, authors, subjects")

Applied text normalization to title, abstract, authors, subjects


## 5. Filter Records by Abstract Length

In [13]:
# Count whitespace-separated tokens in each abstract and summarize the spread
df = df.assign(abstract_word_count=df['abstract'].str.split().str.len())

print(
    "Abstract word count statistics:",
    df['abstract_word_count'].describe(),
    sep="\n",
)

Abstract word count statistics:
count    12087.000000
mean       215.682386
std         77.607181
min          1.000000
25%        160.000000
50%        201.000000
75%        261.000000
max       1700.000000
Name: abstract_word_count, dtype: float64


In [14]:
# Abstracts below this word count are dropped as too short
MIN_ABSTRACT_WORDS = 20

before = len(df)
long_enough = df['abstract_word_count'].ge(MIN_ABSTRACT_WORDS)
df = df.loc[long_enough]
after = len(df)

print(f"Removed {before - after:,} records with abstract < {MIN_ABSTRACT_WORDS} words")

Removed 10 records with abstract < 20 words


## 6. Final Cleanup

In [15]:
# Filtering left gaps in the index; renumber rows from zero
df = df.reset_index(drop=True)

# Columns to carry forward, in output order (helper columns such as
# parsed_date and abstract_word_count are intentionally not listed)
final_columns = [
    'identifier', 'title', 'abstract', 'authors', 'date',
    'year', 'subjects', 'publisher', 'types', 'language',
]

# Skip any listed column the frame does not actually have
final_columns = list(filter(lambda name: name in df.columns, final_columns))
df_clean = df.loc[:, final_columns].copy()

print(f"Final columns: {list(df_clean.columns)}")

Final columns: ['identifier', 'title', 'abstract', 'authors', 'date', 'year', 'subjects', 'publisher', 'types', 'language']


In [16]:
# Compare the surviving record count with the count recorded before cleaning
divider = "=" * 60
print("\n" + divider)
print("CLEANING SUMMARY")
print(divider)
print(f"Initial records:  {initial_count:,}")
print(f"Final records:    {len(df_clean):,}")
print(f"Removed:          {initial_count - len(df_clean):,} ({(initial_count - len(df_clean))/initial_count*100:.1f}%)")
print(divider)


CLEANING SUMMARY
Initial records:  12,647
Final records:    12,077
Removed:          570 (4.5%)


## 7. Save Cleaned Data

In [17]:
# Write the cleaned frame as UTF-8 CSV (without the index) to the path built
# from the project settings
output_path = settings.processed_data_dir / settings.clean_metadata_file

df_clean.to_csv(output_path, index=False, encoding='utf-8')

# The status icons had been stored as mojibake (UTF-8 bytes decoded as
# cp1252); the intended characters are restored here
print(f"\n✅ Saved cleaned data to: {output_path}")
print(f"📊 Total records: {len(df_clean):,}")


✅ Saved cleaned data to: c:\Users\alifn\Code\topic-modeling-utama\data\processed\clean_metadata.csv
📊 Total records: 12,077


In [18]:
# Show the first five cleaned rows as the cell's rich output
df_clean.head(n=5)

Unnamed: 0,identifier,title,abstract,authors,date,year,subjects,publisher,types,language
0,oai:repository.widyatama.ac.id:123456789/14397,PENGARUH BUDAYA KESELAMATAN DAN KESEHATAN KERJ...,Tujuan penelitian ini adalah untuk mengetahui ...,"Falyana, Diki Hendra",2022-01-05T05:10:37Z,2022,budaya keselamatan dan kesehatan (K3); prosedu...,Program Studi Manajemen S1 Universitas Widyatama,Thesis,other
1,oai:repository.widyatama.ac.id:123456789/859,Pengaruh Kompensasi terhadap Motivasi Kerja Ka...,"Skripsi ini disusun oleh Andri Tanjung, NRP 02...","Tanjung, Andri",2009-03-11T02:35:44Z,2009,Pengaruh Kompensasi terhadap Motivasi Kerja Ka...,Universitas Widyatama,Thesis,other
2,oai:repository.widyatama.ac.id:123456789/5337,PERANAN SISTEM INFORMASI AKUNTANSI DALAM MENUN...,Setiap organisasi didirikan untuk mencapai tuj...,"Setiawan, David",2015-06-17T06:18:20Z,2015,Sistem Informasi Akuntansi; Pengendalian Inter...,Universitas Widyatama,Thesis,other
3,oai:repository.widyatama.ac.id:123456789/107890,PENGARUH USIA DAN MASA KERJA TERHADAP PRODUKTI...,Penelitian ini bertujuan untuk mengetahui peng...,"Fauzia, Galih Eza",2024-04-25T03:35:03Z,2024,Tidak Diketahui,Tidak Diketahui,Thesis,other
4,oai:repository.widyatama.ac.id:123456789/8665,PENGARUH SISTEM PENGENDALIAN INTERNAL PEMERINT...,Penelitian ini bertujuan untuk mengetahui peng...,"Aruan, Hicca Maria Gandi Putri",2017-10-18T23:53:16Z,2017,Sistem Pengendalian Internal Pemerintah; Kuali...,Universitas Widyatama,Thesis,other


In [19]:
# Headline figures for the cleaned set: size, year span, mean abstract length
print("\nCleaned data statistics:", "-" * 40, sep="\n")
print(f"Records: {len(df_clean):,}")
print(f"Date range: {df_clean['year'].min():.0f} - {df_clean['year'].max():.0f}")
print(f"Avg abstract words: {df_clean['abstract'].str.split().str.len().mean():.0f}")


Cleaned data statistics:
----------------------------------------
Records: 12,077
Date range: 2007 - 2025
Avg abstract words: 216


In [20]:
# Point the reader at the follow-up notebooks. The pointer icon had been stored
# as mojibake (UTF-8 bytes decoded as cp1252) and is restored; the strings have
# no placeholders, so plain literals replace the f-strings.
print("\n👉 Next: Run 02b_eda_clean_data.ipynb to analyze the cleaned data")
print("   Then: Run 03_preprocessing.ipynb for text preprocessing")


👉 Next: Run 02b_eda_clean_data.ipynb to analyze the cleaned data
   Then: Run 03_preprocessing.ipynb for text preprocessing
