### The purpose of this notebook is to: process the text of the title, abstract, and body text columns; capture the word counts of the abstract and body text in new columns; identify which publications mention Africans/African Americans; export the African/African American focused dataset as csv 'cord_africa'; and export the non-African/African American focused dataset as csv 'cord_other'.

In [None]:
import pandas as pd 
import re

In [None]:
#load the CORD-19 dataframe csv from the Google Drive data folder
cord = pd.read_csv(
    filepath_or_buffer='/content/drive/My Drive/nss_data_science/covid_query/data/cord19_df.csv'
)

In [None]:
#keep only the columns this notebook works with
cord = cord.loc[:, [
    'title',
    'abstract',
    'body_text',
    'authors',
    'journal',
    'publish_year',
    'doi',
]]

The CORD-19 dataset from Kaggle has 47,110 rows of data.

In [None]:
#drop every row whose title is missing
cord = cord.dropna(subset=['title'])

42,192 rows remain after removing the rows with missing titles.

In [None]:
#renumber the rows from 0 after the filtering above
#drop=False (the default) keeps the previous index as a new 'index' column
#NOTE(review): if that column is not needed downstream, consider drop=True
cord = cord.reset_index(drop=False)

In [None]:
#label publications that have no journal as "Other"
cord["journal"] = cord["journal"].where(cord["journal"].notna(), "Other")

In [None]:
#change title, abstract, body_text, journal, and publish_year to string
#abstract and body_text may contain missing values: fill them with '' BEFORE
#casting, because astype(str) would otherwise turn NaN into the literal text
#'nan' (which then counts as a one-word abstract/body in the word counts)
cord['title'] = cord['title'].astype(str)
cord['abstract'] = cord['abstract'].fillna('').astype(str)
cord['body_text'] = cord['body_text'].fillna('').astype(str)
cord['journal'] = cord['journal'].astype(str)
cord['publish_year'] = cord['publish_year'].astype(str)

In [None]:
#body text: remove basic punctuation (, . ! ?) and convert to lowercase
#the pattern is a raw string because '\.' inside a normal string literal is an
#invalid escape sequence (a warning today, an error in future Python versions)
cord['body_text_processed'] = (
    cord['body_text']
    .str.replace(r'[,\.!?]', '', regex=True)
    .str.lower()
)

#title and abstract: convert to lowercase only (punctuation is kept)
cord['title_processed'] = cord['title'].str.lower()
cord['abstract_processed'] = cord['abstract'].str.lower()

In [None]:
#make sure no missing values remain in abstract_processed before the keyword
#search (str.contains would return NaN instead of True/False for them)
cord['abstract_processed'] = cord['abstract_processed'].where(
    cord['abstract_processed'].notna(), ''
)

In [None]:
#word counts: split each text on whitespace and count the resulting tokens
#(an empty or whitespace-only text yields 0)
cord['abstract_word_count'] = cord['abstract'].str.split().str.len()
cord['body_word_count'] = cord['body_text'].str.split().str.len()

In [None]:
#keywords that mark a publication as African/African American related
aa_filter = [
    'black people',
    'african american people',
    'african americans',
    'african american',
    'african',
    'africa',
]
#one regex alternation: a lowercased abstract matches if it contains any keyword
pattern = '|'.join(aa_filter)

#flag each abstract, stored as the strings 'True'/'False' for the relabelling step
cord['demographic'] = cord['abstract_processed'].str.contains(pattern).astype(str)

In [None]:
#turn the 'True'/'False' strings in the "demographic" column into readable labels
cord['demographic'] = cord['demographic'].replace({
    "True": "African/African American",
    "False": "Non African/African American",
})

In [None]:
#split the data into two DataFrames by demographic label
cord_africa = cord.loc[cord['demographic'].eq('African/African American')]
cord_other = cord.loc[cord['demographic'].eq('Non African/African American')]

In [None]:
#write both subsets to the Google Drive data folder
#to_csv defaults to index=True, so the DataFrame index is written as the first column
cord_africa.to_csv(
    path_or_buf='/content/drive/My Drive/nss_data_science/covid_query/data/cord_africa.csv'
)
cord_other.to_csv(
    path_or_buf='/content/drive/My Drive/nss_data_science/covid_query/data/cord_other.csv'
)