In [12]:
import pandas as pd
import numpy as np
import csv
import glob
import codecs

In [14]:
def pre_process_txt_file(file_path):
    """Split an article text file into journal, title, date and content.

    The file is decoded as UTF-8 (undecodable bytes are dropped), all double
    quotes are removed and blank lines are discarded. The first three
    remaining lines are taken as the journal, title and date; everything
    after them is the content.

    Parameters
    ----------
    file_path : str
        Path of the text file to read.

    Returns
    -------
    list or None
        ``[journal, title, date, content]``, or ``None`` when the file has
        fewer than three non-blank lines. ``content`` is an empty string
        when nothing follows the date line.
    """
    # Built-in open() runs in universal-newline mode, so '\r\n' and '\r' line
    # endings arrive as '\n' (codecs.open reads in binary mode and would leave
    # a stray '\r' on every field). 'utf-8-sig' also drops a leading BOM that
    # would otherwise end up glued to the journal name.
    with open(file_path, 'r', encoding='utf-8-sig', errors='ignore') as f:
        text = f.read()

    # Remove double quotes and surrounding whitespace
    text = text.replace('"', '').strip()

    # Keep only lines that contain something other than whitespace
    lines = [line for line in text.split('\n') if line.strip() != '']

    # A usable article needs at least journal, title and date
    if len(lines) < 3:
        return None

    journal, title, date = lines[:3]
    content = '\n'.join(lines[3:])

    # Return the pre-processed data
    return [journal, title, date, content]

In [15]:
def process_txt_files(directory, output_file):
    """Convert every ``.txt`` file in a directory into one CSV file.

    Each file is passed to ``pre_process_txt_file``; the four values it
    returns become one row under the header
    ``journal, title, date, content``.

    Parameters
    ----------
    directory : str
        Directory that is searched (non-recursively) for ``*.txt`` files.
    output_file : str
        Path of the CSV file to create; an existing file is overwritten.

    Notes
    -----
    Files for which ``pre_process_txt_file`` returns ``None`` are skipped
    and reported on stdout instead of aborting the export.
    """
    # glob returns files in arbitrary order; sort so the CSV is reproducible
    txt_files = sorted(glob.glob(directory + '/*.txt'))

    # Write UTF-8 explicitly: the platform default encoding may be unable to
    # represent the article text, and pandas reads CSV files as UTF-8.
    with open(output_file, 'w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)

        # Write the header row
        writer.writerow(['journal', 'title', 'date', 'content'])

        # Loop through each .txt file and process it
        for txt_file in txt_files:
            sections = pre_process_txt_file(txt_file)

            # None means the file could not be split into sections;
            # csv.writer.writerow(None) would raise, so skip it.
            if sections is None:
                print(f'Skipping {txt_file}: could not extract journal, title and date')
                continue

            # Write the sections as a row in the CSV file
            writer.writerow(sections)

In [16]:
def preprocess_date_column(input_file, output_file):
    """Keep only the CSV rows whose 'date' value parses as a datetime.

    Reads ``input_file``, converts its 'date' column with
    ``pd.to_datetime(..., errors='coerce')``, removes the rows whose date
    became missing, and writes the remaining rows to ``output_file``
    without the index. The row count before and after is printed.

    Parameters
    ----------
    input_file : str
        Path of the CSV file to read; it must contain a 'date' column.
    output_file : str
        Path of the CSV file to write.
    """
    raw = pd.read_csv(input_file)
    print(f'Number of rows before dropping: {len(raw)}')

    # Unparseable dates become NaT under errors='coerce'; those rows are
    # then removed together with rows whose date was already missing.
    cleaned = (
        raw
        .assign(date=pd.to_datetime(raw['date'], errors='coerce'))
        .dropna(subset=['date'])
    )
    print(f'Number of rows after dropping: {len(cleaned)}')

    cleaned.to_csv(output_file, index=False)

In [17]:
# Build input.csv from every .txt file found directly under data/articles
process_txt_files('data/articles', 'input.csv')

In [18]:
# Drop the rows of input.csv whose date cannot be parsed; save the rest as input_cleaned.csv
preprocess_date_column('input.csv', 'input_cleaned.csv')

Number of rows before dropping: 845
Number of rows after dropping: 680
