### The purpose of this notebook is to explore the word count, publication year, common journals, and common words of the publications in the dataset.

In [None]:
from collections import Counter

import matplotlib.pyplot as plt
import nltk
import pandas as pd
import plotly.express as px
import seaborn as sns
from matplotlib import style
from nltk.corpus import stopwords
from PIL import Image

style.use("ggplot")
sns.set_style('whitegrid')
%matplotlib inline
nltk.download('stopwords')
nltk.download('wordnet')

In [None]:
# Load the African/African American publication subset.
# (The original read `cord_africa.pd.read_csv`, which references the name
# being assigned and fails with NameError on a fresh kernel.)
cord_africa = pd.read_csv('/content/drive/My Drive/nss_data_science/covid_query/data/cord_africa.csv')

In [None]:
# Load the Non-African/African American publication subset.
# (The original read `cord_other.pd.read_csv`, which references the name
# being assigned and fails with NameError on a fresh kernel.)
cord_other = pd.read_csv('/content/drive/My Drive/nss_data_science/covid_query/data/cord_other.csv')

#### Comparison of abstract word count, body word count, and publication year between African/African American and Non-African/African American publications

In [None]:
#abstract word count by demographic
# NOTE(review): `cord` was used here without ever being defined in this
# notebook. It is assumed to be the union of the two loaded subsets, each of
# which is presumed to carry a `demographic` column -- confirm against the CSVs.
cord = pd.concat([cord_africa, cord_other], ignore_index=True)

# Draw outliers as green diamonds.
green_diamond = dict(markerfacecolor='g', marker='D')
ax = cord.boxplot(column=['abstract_word_count'], by='demographic', flierprops=green_diamond)
ax.set_title('Abstract Word Count by Publication Demographic', fontsize=12)
# Blank out the figure-level title so only the axes title above is shown.
plt.suptitle("")
ax.set_xlabel('Demographic', fontsize=10)
# The trailing semicolon suppresses the text repr of the returned label object.
ax.set_ylabel('Word Count', fontsize='medium');

In [None]:
# Body-text word count, one box per demographic group.
# Outliers are drawn as green diamonds.
green_diamond = {'markerfacecolor': 'g', 'marker': 'D'}
ax = cord.boxplot(
    column=['body_word_count'],
    by='demographic',
    flierprops=green_diamond,
)
# Blank out the figure-level title so only the axes title below is shown.
plt.suptitle("")
ax.set_title('Body Text Word Count by Publication Demographic', fontsize=12)
ax.set_xlabel('Demographic', fontsize=10)
# The trailing semicolon suppresses the text repr of the returned label object.
ax.set_ylabel('Word Count', fontsize='medium');

In [None]:
#publication year by demographic
# Count titles per (year, demographic) and pivot demographics into columns so
# each demographic becomes its own line.
yearly_counts = cord.groupby(['publish_year', 'demographic'])['title'].count().unstack()

# The figure size must be given when the plot is created; calling
# plt.figure(figsize=...) afterwards only opens a second, empty figure.
ax = yearly_counts.plot(figsize=(20, 10))
# Explicitly hide the grid (the `b=` keyword of grid() is no longer accepted
# by recent matplotlib releases).
ax.grid(False)
ax.set_xlim(right=2020)
ax.set_xlabel('Publication Year')
ax.set_ylabel('Total Publications')
ax.set_title('Yearly Publications by Demographic')
ax.spines['top'].set_visible(False)
# The trailing semicolon suppresses the cell's text output.
ax.spines['right'].set_visible(False);

#### Common Words in African/African American publications

In [None]:
def word_bar_graph_function(df, column, title):
    """Plot a horizontal bar chart of the 20 most frequent non-stop-words.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text column to analyse.
    column : str
        Name of the column in ``df`` whose values are split on whitespace.
        Non-string entries (e.g. NaN) are skipped.
    title : str
        Title placed on the chart.

    Words are lower-cased before counting, and English stop words from NLTK
    are excluded. If fewer than 20 distinct words remain, only those are
    drawn. The chart is drawn on the current matplotlib figure and shown.
    """
    # adapted from https://www.kaggle.com/benhamner/most-common-forum-topic-words
    # Read from the `df` argument (the original ignored it and used a global).
    word_counts = Counter(
        word.lower()
        for text in df[column]
        if isinstance(text, str)
        for word in text.split()
    )

    # Build the stop-word set once instead of re-reading the list per word.
    english_stopwords = set(stopwords.words("english"))
    top_words = [
        (word, count)
        for word, count in word_counts.most_common()
        if word not in english_stopwords
    ][:20]
    # Reverse so the most frequent word ends up at the top of the chart.
    top_words.reverse()

    labels = [word for word, _ in top_words]
    frequencies = [count for _, count in top_words]
    positions = range(len(top_words))

    plt.barh(positions, frequencies)
    # Ticks share the bar positions so each label lines up with its bar.
    plt.yticks(positions, labels)
    plt.xlabel ('Word Frequency in Publication Titles')
    plt.ylabel ('Words')
    plt.title(title)
    plt.show()

In [None]:
# Open a 10x6 figure, then draw the top title words for the
# African/African American subset onto it.
plt.figure(figsize=(10, 6))
word_bar_graph_function(
    cord_africa,
    column='title',
    title='Top 20 Words in Titles of African/African American Coronavirus-related Publications',
)

#### Common Journals in African/African American publications

In [None]:
# Number of publications per journal, as a two-column frame
# (journal_name, count). Naming the columns explicitly avoids relying on the
# name pandas gives the value_counts() result, which differs between pandas
# versions and broke the original `value_counts_df['journal']` lookup.
africa_journal_counts = (
    cord_africa['journal']
    .value_counts()
    .rename_axis('journal_name')
    .reset_index(name='count')
)

# value_counts() is already sorted from most to least frequent, so the first
# 20 rows are the 20 most common journals.
fig = px.bar(africa_journal_counts.head(20),
             x="count",
             y="journal_name",
             title='Most Common Journals of African/African American Coronavirus-related Publications',
             orientation='h',
             # The bar length is the number of publications in each journal.
             labels={'count':'Number of Publications','journal_name':'Journal Name'})

fig.show()

#### Common Words in Non-African/African American publications

In [None]:
# NOTE(review): this cell re-declares a function name that is already defined
# earlier in the notebook; consider keeping a single definition.
def word_bar_graph_function(df, column, title):
    """Plot a horizontal bar chart of the 20 most frequent non-stop-words.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text column to analyse.
    column : str
        Name of the column in ``df`` whose values are split on whitespace.
        Non-string entries (e.g. NaN) are skipped.
    title : str
        Title placed on the chart.

    Words are lower-cased before counting, and English stop words from NLTK
    are excluded. If fewer than 20 distinct words remain, only those are
    drawn. The chart is drawn on the current matplotlib figure and shown.
    """
    # adapted from https://www.kaggle.com/benhamner/most-common-forum-topic-words
    # Read from the `df` argument (the original ignored it and used a global).
    word_counts = Counter(
        word.lower()
        for text in df[column]
        if isinstance(text, str)
        for word in text.split()
    )

    # Build the stop-word set once instead of re-reading the list per word.
    english_stopwords = set(stopwords.words("english"))
    top_words = [
        (word, count)
        for word, count in word_counts.most_common()
        if word not in english_stopwords
    ][:20]
    # Reverse so the most frequent word ends up at the top of the chart.
    top_words.reverse()

    labels = [word for word, _ in top_words]
    frequencies = [count for _, count in top_words]
    positions = range(len(top_words))

    plt.barh(positions, frequencies)
    # Ticks share the bar positions so each label lines up with its bar.
    plt.yticks(positions, labels)
    plt.xlabel ('Word Frequency in Publication Titles')
    plt.ylabel ('Words')
    plt.title(title)
    plt.show()

In [None]:
# Open a 10x6 figure, then draw the top title words for the
# Non-African/African American subset onto it.
plt.figure(figsize=(10, 6))
word_bar_graph_function(
    cord_other,
    column='title',
    title='Top 20 Words in Titles of Non-African/African American Coronavirus-related Publications',
)

#### Common Journals in Non-African/African American publications

In [None]:
# Number of publications per journal, as a two-column frame
# (journal_name, count). Naming the columns explicitly avoids relying on the
# name pandas gives the value_counts() result, which differs between pandas
# versions and broke the original `value_counts_df['journal']` lookup.
other_journal_counts = (
    cord_other['journal']
    .value_counts()
    .rename_axis('journal_name')
    .reset_index(name='count')
)

# value_counts() is already sorted from most to least frequent, so the first
# 20 rows are the 20 most common journals.
fig = px.bar(other_journal_counts.head(20),
             x="count",
             y="journal_name",
             title='Most Common Journals in Non-African/African American Coronavirus-related Publications',
             orientation='h',
             # The bar length is the number of publications in each journal.
             labels={'count':'Number of Publications','journal_name':'Journal Name'})

fig.show()