In [77]:
import pandas as pd
import numpy as np
import csv
import glob
import codecs

In [78]:
def pre_process_txt_file(file_path):
    """Parse one article text file into ``[journal, title, date, content]``.

    The file is decoded as UTF-8 (undecodable bytes are dropped). Every double
    quote and every comma is removed, and blank lines are discarded. The first
    three remaining lines are taken as journal, title and date; all following
    lines are concatenated, with no separator, to form the content.

    Args:
        file_path: Path of the text file to read.

    Returns:
        list | None: ``[journal, title, date, content]``, or ``None`` when the
        file has fewer than three non-blank lines.
    """
    # Built-in open() in text mode translates '\r\n' and '\r' to '\n'.
    # codecs.open() always reads in binary mode and does no such translation,
    # which left a stray '\r' on every field of files with Windows line endings.
    with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
        text = f.read()

    # Remove double quotes and commas, then surrounding whitespace
    text = text.replace('"', '').replace(',', '').strip()

    # Keep only the non-blank lines
    lines = [line for line in text.split('\n') if line.strip() != '']
    if len(lines) < 3:
        return None

    # Split into journal, title, date, and content
    journal, title, date = lines[:3]
    # The lines came from splitting on '\n', so the joined content cannot
    # contain a newline and needs no further clean-up.
    content = ''.join(lines[3:])

    # Return the pre-processed data
    return [journal, title, date, content]

In [79]:
def process_txt_files(directory, output_file):
    """Convert every ``.txt`` file in ``directory`` into one row of a CSV file.

    Each file is parsed with ``pre_process_txt_file``. Files it rejects
    (it returns ``None``) are reported and skipped instead of being passed to
    ``csv.writer.writerow``, which raises on a non-iterable row.

    Args:
        directory: Directory searched (non-recursively) for ``*.txt`` files.
        output_file: Path of the CSV file to create or overwrite. It is
            written as UTF-8 with the header ``journal,title,date,content``.
    """
    # glob returns paths in arbitrary order; sort so the row order is reproducible
    txt_files = sorted(glob.glob(directory + '/*.txt'))

    # Write UTF-8 explicitly so the result does not depend on the platform's
    # default encoding (pandas.read_csv reads UTF-8 by default).
    with open(output_file, 'w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)

        # Write the header row
        writer.writerow(['journal', 'title', 'date', 'content'])

        # Loop through each .txt file and process it
        for txt_file in txt_files:
            sections = pre_process_txt_file(txt_file)

            # Skip files that could not be split into the four sections
            if sections is None:
                print(f'Skipping {txt_file}: fewer than three non-blank lines')
                continue

            # Write the sections as a row in the CSV file
            writer.writerow(sections)

In [80]:
def preprocess_date_column(input_file, output_file):
    """Parse the ``date`` column of a CSV and discard rows with an invalid date.

    Reads ``input_file``, converts its ``date`` column to datetime (values that
    cannot be parsed become ``NaT``), removes the rows whose date is ``NaT``,
    prints the row count before and after, and writes the remaining rows to
    ``output_file`` without the index.

    Args:
        input_file: Path of the CSV file to read; it must have a ``date`` column.
        output_file: Path of the CSV file to write.

    Returns:
        pandas.DataFrame: The frame that was written, with its original row
        labels retained.
    """
    articles = pd.read_csv(input_file)

    rows_before = len(articles)
    print(f'Number of rows before dropping: {rows_before}')

    # errors='coerce' turns unparseable values into NaT instead of raising
    parsed_dates = pd.to_datetime(articles['date'], errors='coerce')
    articles['date'] = parsed_dates

    # Discard every row whose date failed to parse
    articles.dropna(subset=['date'], inplace=True)

    rows_after = len(articles)
    print(f'Number of rows after dropping: {rows_after}')

    articles.to_csv(output_file, index=False)
    return articles

In [81]:
# Build results/input.csv from the .txt files found in data/articles
process_txt_files('data/articles', 'results/input.csv')

In [82]:
# Parse the dates in results/input.csv, drop rows whose date cannot be parsed,
# save the result to results/input_cleaned.csv and keep the frame in `df`
df = preprocess_date_column('results/input.csv', 'results/input_cleaned.csv')

Number of rows before dropping: 843
Number of rows after dropping: 678


In [83]:
# Preview the first rows of the cleaned frame
df.head()

Unnamed: 0,journal,title,date,content
1,The Tulip,THE PLACE OF THE BEND TISKELE OF THE ROCKS GAS...,2000-08-16,ABILA Kronos - a explosion at surrounding the ...
2,Modern Rubicon,ON SCENE BLOG,2014-01-20,MODERNIZATION 1947 - from the news conference ...
3,International News,Police Hold News Conference on GAStech Kidnapp...,2014-01-21,ABILA Kronos - The Abila police held a press c...
4,Tethys News,To break off itself: The emergency to GAStech ...,2014-01-20,Modernization 1:40 PM: There are puttinges in...
5,Worldwise,Annual ends of gathering of POK in the riot still,2013-06-22,ABILA Kronos - the members of POK held their a...


In [84]:
# Length, in characters, of each row's content
df['content'].str.len()

1      1226
2       667
3       526
4       266
5      1189
       ... 
837     294
838     208
839     298
840    1092
842     927
Name: content, Length: 678, dtype: int64

In [85]:
# Sanity check: print '!' for every CSV row that has a field containing an
# embedded newline. Splitting the raw text on '\n' and then searching each
# piece for '\n' can never match, so the file is parsed with csv.reader, which
# keeps newlines that sit inside quoted fields (the file must be opened with
# newline='' for that to work).
with open('results/input_cleaned.csv', 'r', newline='', encoding='utf-8') as f:
    for row in csv.reader(f):
        if any('\n' in field for field in row):
            print('!')