In [1]:
import glob
from striprtf.striprtf import rtf_to_text
import re
import pandas as pd
import datetime
import numpy as np
import os

In [2]:
# Collect the path of every RTF document in the extraction folder.
# glob does not guarantee any particular ordering (it depends on the file
# system), so the result is sorted to keep the row order of everything built
# from this list reproducible across machines and runs.
file_name = sorted(glob.glob('data/extracted_docs/*.rtf'))

In [3]:
# Display the collected paths to check that the glob matched the expected documents.
file_name

["data/extracted_docs\\'A momentous decision'_ What the resolution of Detroit's literacy case could mean for America.rtf",
 "data/extracted_docs\\'A nice uplift for the community'; Fort Chipewyan school receives provincial funding for modernizati.rtf",
 "data/extracted_docs\\'A societal crisis'_ Does this UMKC program have the solution to teacher retention_.rtf",
 "data/extracted_docs\\'Abbott Elementary' School veteran shares what's real, what's not in Philly-set show.rtf",
 "data/extracted_docs\\'FALL INTO TEACHING' EVENT CELEBRATES BOOK RELEASE, PROMOTES SELF-CARE.rtf",
 "data/extracted_docs\\'I'm looking to re-energize and build'_ Clarksdale schools new leader accepts 'the unique set of cha.rtf",
 "data/extracted_docs\\'I'm not going to waste my time on prayers'.rtf",
 "data/extracted_docs\\'I'm not going to waste my time on prayers'_ how secular teachers navigate working in religious scho(2).rtf",
 "data/extracted_docs\\'I'm not going to waste my time on prayers'_ how secular teac

In [4]:
def read_rtf_to_text(file_name):
    '''
    Load a single RTF document from disk and strip its markup.

    :param file_name: path of the RTF file to read
    :return: the document's plain text, as produced by ``rtf_to_text``
        called with ``errors="ignore"``
    '''
    # Opened in text mode without an explicit encoding, so the platform's
    # default encoding is used to decode the file.
    with open(file_name, 'r') as handle:
        raw_rtf = handle.read()
    # Only the in-memory string is needed from here on; the file is already closed.
    return rtf_to_text(raw_rtf, errors="ignore")

In [5]:
# Convert every collected RTF file to plain text, keeping the same order as
# file_name so texts and paths stay aligned by position.
articles_list = [read_rtf_to_text(path) for path in file_name]

In [6]:
# Construct a dataframe
# Derive a title from each path: drop the directory and the ".rtf" extension,
# then remove underscores (in the file names they appear where the headline
# has characters such as ':' or '?').
# os.path is used instead of stripping a hard-coded 'data/extracted_docs\\'
# prefix, which only matched paths produced on Windows.
article_names = [
    os.path.splitext(os.path.basename(s))[0].replace('_', '')
    for s in file_name
]
articles_df = pd.DataFrame({"text": articles_list, "title": article_names})

# Containers for the load date and the news outlet extracted from each article.
date = []
outlet_column = []

In [7]:
# Extract each article's load date: the text that follows the "Load-Date:"
# marker, with the "End of Document" footer and surrounding whitespace removed.
# Articles without the marker yield an empty string, because str.partition
# returns '' for the tail when the separator is not found.
# The list is rebuilt here rather than appended to a list created in another
# cell, so re-running this cell cannot produce a list longer than the frame.
date = [
    article.partition("Load-Date:")[2].replace("End of Document", "").strip()
    for article in articles_df["text"]
]
articles_df["date"] = date

In [8]:
# Take the sixth line (index 5) of each article's text as the outlet name.
# NOTE(review): this assumes the converted export always puts the outlet on
# that line — confirm against the raw documents. An article with fewer than
# six lines raises IndexError, so this must run while the text column still
# contains its newlines.
# The list is rebuilt here rather than appended to a list created in another
# cell, so re-running this cell cannot produce a list longer than the frame.
outlet_column = [article.split('\n', 6)[5] for article in articles_df["text"]]
articles_df["outlet"] = outlet_column

In [9]:
# Remove line feeds, carriage returns and non-breaking spaces from every
# article, then overwrite the text column with the cleaned strings.
# Each character is deleted outright (not replaced by a space).
cleaned_texts = []
for raw_text in articles_df["text"]:
    stripped = raw_text
    for unwanted in ('\n', '\r', '\xa0'):
        stripped = stripped.replace(unwanted, '')
    cleaned_texts.append(stripped)
articles_df["text"] = cleaned_texts

In [10]:
def mdy_to_ymd(d):
    '''
    Reformat a date written as "<full month name> <day>, <year>"
    (for example "May 14, 2020") into a YYYY-MM-DD string.

    :param d: date string matching the format '%B %d, %Y'
    :return: the same date formatted as 'YYYY-MM-DD'
    :raises ValueError: if d does not match the expected format
    '''
    parsed = datetime.datetime.strptime(d, '%B %d, %Y')
    return parsed.strftime('%Y-%m-%d')

In [11]:
# Parse the textual load dates into datetime.date objects.
# Each non-empty string is first normalised to ISO 8601 (YYYY-MM-DD) by
# mdy_to_ymd and then turned into a date with date.fromisoformat.
# Rows whose date string is empty get NaN instead, so the column mixes
# date objects with a float NaN.
converted_dates = [
    datetime.date.fromisoformat(mdy_to_ymd(raw_date)) if raw_date != '' else np.nan
    for raw_date in articles_df["date"]
]
articles_df["datetime_obj"] = converted_dates

In [12]:
# Check which dtype pandas assigned to the parsed-date column.
articles_df["datetime_obj"].dtypes

dtype('O')

In [13]:
# Flag each article relative to the cut-off date below:
#   0   -> dated strictly before the cut-off
#   1   -> dated on or after the cut-off
#   NaN -> no load date was extracted (empty date string)
# NOTE(review): 2020-03-11 is presumably the chosen start of the COVID period — confirm.
covid_date = datetime.date(2020, 3, 11)
after_covid = []
for raw_date, parsed_date in zip(articles_df["date"], articles_df["datetime_obj"]):
    if raw_date == '':
        # Undated article: leave the flag missing rather than guessing.
        after_covid.append(np.nan)
        continue
    after_covid.append(0 if parsed_date < covid_date else 1)
articles_df["covid"] = after_covid

In [14]:
# Count articles per covid flag value; value_counts leaves out NaN rows by default,
# so undated articles are not included in these totals.
articles_df.covid.value_counts()

1.0    922
0.0    425
Name: covid, dtype: int64

In [15]:
# Display the assembled DataFrame for a visual check of all derived columns.
articles_df

Unnamed: 0,text,title,date,outlet,datetime_obj,covid
0,'A momentous decision': What the resolution of...,'A momentous decision' What the resolution of ...,"May 14, 2020",Newstex Blogs,2020-05-14,1.0
1,'A nice uplift for the community'; Fort Chipew...,'A nice uplift for the community'; Fort Chipew...,"November 7, 2019",Fort McMurray Today,2019-11-07,0.0
2,'A societal crisis': Does this UMKC program ha...,'A societal crisis' Does this UMKC program hav...,"December 7, 2021",Newstex Blogs,2021-12-07,1.0
3,'Abbott Elementary'; School veteran shares wha...,'Abbott Elementary' School veteran shares what...,"December 7, 2021",The Philadelphia Inquirer,2021-12-07,1.0
4,'FALL INTO TEACHING' EVENT CELEBRATES BOOK REL...,'FALL INTO TEACHING' EVENT CELEBRATES BOOK REL...,"September 6, 2019",US Fed News,2019-09-06,0.0
...,...,...,...,...,...,...
1498,Youngkin outlines steps to address teacher sho...,Youngkin outlines steps to address teacher sho...,"September 1, 2022",The Breeze: James Madison University,2022-09-01,1.0
1499,: 'It's been a while since we've had this many...,'It's been a while since we've had this many ...,"August 8, 2021",Newstex Blogs,2021-08-08,1.0
1500,": WARREN, CHILD CARE PROVIDER, SMALL BUSINESS ...","WARREN, CHILD CARE PROVIDER, SMALL BUSINESS O...","April 28, 2022",States News Service,2022-04-28,1.0
1501,"""Mixed"" teaching: the new normality of higher ...",Mixed teaching the new normality of higher edu...,"March 11, 2021",CE Noticias Financieras English,2021-03-11,1.0


In [16]:
# Persist the assembled table as CSV without the index column.
# The output directory is created first (a no-op when it already exists) so
# the write cannot fail because 'data' is missing.
os.makedirs('data', exist_ok=True)
articles_df.to_csv('data/articles_df.csv', index=False)