# Exploratory Data Analysis (EDA)

**Installing and importing required libraries**

In [None]:
df_final=pd.read_csv('FINAL_CORD_DATA.csv')

Inspection of final data

In [None]:
#Inspect data
def inspect_data(data):
    return pd.DataFrame(
        {
            "Data Type": data.dtypes,
            "No of levels": data.apply(lambda x:len(x.unique()), axis=0),
            "levels": data.apply(lambda x:x.unique(), axis=0),
            "qtd_null": data.apply(lambda x: x.isnull().sum(), axis=0),
            "pct_null": data.apply(lambda x: x.isnull().sum() * 100 / len(x))
        }
    )

In [8]:
inspect_data(df_final)

In [None]:
df_final['publish_time'] = pd.to_datetime(df_final['publish_time'])

In [None]:
df_final['publish_year'] =  df_final['publish_time'].dt.year
df_final['publish_month'] =  df_final['publish_time'].dt.month
df_final['publish_day'] =  df_final['publish_time'].dt.day


In [None]:
df_final['publish_year'].nunique()

51

In [None]:
df_final['publish_year'].min()

1970

In [None]:
df_final['publish_year'].max()

2021

In [3]:
# No.of papers published in recent years
sns.countplot(x="publish_year", data=df_final[df_final['publish_year'].isin([2018, 2019,2020,2021, 2022])])

In [2]:
#Month wise distribution of papers published
sns.countplot(x="publish_month", data=df_final)

In [None]:
df_final['publish_month'] = df_final['publish_month'].apply(lambda x : calendar.month_abbr[x])

In [4]:
sns.countplot(x="publish_month", data=df_final)

In [7]:
#Checking % of papers published in recent times vs in earlier years
df_final['in_covid_time'] = np.where(df_final['publish_year'].isin([2019,2020,2021]), 1, 0)
sns.countplot(df_final['in_covid_time'])

**Looking at world clouds**

In [None]:
!pip3 install wordcloud

In [None]:
from wordcloud import WordCloud, STOPWORDS 
import re
import string

In [None]:
def plot_word_cloud(wordcloud):
    plt.figure(figsize=(10, 10))
    plt.imshow(wordcloud) 
    plt.axis("off");

In [None]:
def remove_special_charac_and_punctuations(text) :
  # define the pattern to keep
  pattern = r'[^a-zA-z0-9\'\s]' 
  txt = re.sub(pattern, '', text)
  txt = ''.join([c for c in txt if c not in string.punctuation])
  return  txt

In [None]:
def concat_all_sentences(sents):
   all_tokens = ''
   for text in sents:
        tx = str(text).lower().strip()
        tx = " ".join(tx.split())
        all_tokens += tx + " "
   return all_tokens  

In [None]:
cleaned_titles = df_final.loc[df_final.title.notnull(), 'title'].apply(remove_special_charac_and_punctuations)

In [None]:
cleaned_abstracts = df_final.loc[df_final.abstract.notnull(), 'abstract'].apply(remove_special_charac_and_punctuations)

In [None]:
title_tokens = concat_all_sentences(cleaned_titles)

In [None]:
abstract_tokens = concat_all_sentences(cleaned_abstracts)

In [9]:
title_tokens

In [10]:
abstract_tokens

In [1]:
#Looking at word cloud of title tokens
title_wordcloud = WordCloud(width = 500, height = 500,random_state=10).generate(title_tokens)
plot_word_cloud(title_wordcloud)

## Topic modelling

In [None]:
import gensim, spacy, logging, warnings
import gensim.corpora as corpora
from gensim.utils import lemmatize, simple_preprocess
from gensim.models import CoherenceModel

In [11]:
cleaned_abstracts

In [None]:
abstracts = cleaned_abstracts.values.tolist()

In [None]:
#simple_preprocess of genism removes special charcters,numbers and converts into lower case
gensim.utils.simple_preprocess("12 happy today @# '' '\n'", deacc=True) 

['happy', 'today']

In [None]:
def token_builder(sentences) :
   for sent in sentences:     
        sent = gensim.utils.simple_preprocess(str(sent), deacc=True) 
        yield(sent)  



In [None]:
#Building abstract tokens using token_builder
abstract_tokens = list(token_builder(abstracts))

In [None]:
# Build the bigram and trigram models
bigram = gensim.models.Phrases(abstract_tokens, min_count=3, threshold=50) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[abstract_tokens], threshold=50)  
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)



In [None]:
import nltk as nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use', 'not', 'would', 'say', 'could', '_', 'be', 'know', 'good', 'go', 'get', 'do', 'done', 'try', 'many', 'some', 'nice', 'thank', 'think', 'see', 'rather', 'easy', 'easily', 'lot', 'lack', 'make', 'want', 'seem', 'run', 'need', 'even', 'right', 'line', 'even', 'also', 'may', 'take', 'come'])

In [None]:
def process_docs(texts, stop_words=stop_words, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    texts = [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]
    texts = [bigram_mod[doc] for doc in texts]
    texts = [trigram_mod[bigram_mod[doc]] for doc in texts]
    texts_out = []
    nlp = spacy.load('en', disable=['parser', 'ner'])
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    # remove stopwords once more after lemmatization   
    texts_out = [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts_out] 
    return texts_out


In [None]:
abstracts_data = process_docs(abstract_tokens)

In [None]:
# Create Dictionary
id2word = corpora.Dictionary(abstracts_data)

In [None]:
print(id2word)

Dictionary(57043 unique tokens: ['aim', 'allow', 'automatic', 'calculate', 'cautious']...)


In [None]:
print(id2word.token2id)



In [None]:
# Create Corpus: Term Document Frequency
corpus = [id2word.doc2bow(text) for text in abstracts_data]

In [None]:
# Build LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=2, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=10,
                                           passes=10,
                                           alpha='symmetric',
                                           iterations=100,
                                           per_word_topics=True)

In [13]:
print(lda_model.print_topics())

In [None]:
import matplotlib.pyplot as plt

In [6]:
import matplotlib.colors as mcolors

cols = [color for name, color in mcolors.TABLEAU_COLORS.items()]  # more colors: 'mcolors.XKCD_COLORS'

cloud = WordCloud(stopwords=stop_words,
                  background_color='white',
                  width=2500,
                  height=1800,
                  max_words=10,
                  colormap='tab10',
                  color_func=lambda *args, **kwargs: cols[i],
                  prefer_horizontal=1.0)

topics = lda_model.show_topics(formatted=False)

for i in range(0,2):
    topic_words = dict(topics[i][1])
    cloud.generate_from_frequencies(topic_words, max_font_size=300)
    plt.gca().imshow(cloud)
    plt.gca().set_title('Topic ' + str(i), fontdict=dict(size=16))
    plt.gca().axis('off')
    plt.show()

In [None]:
topics = lda_model.show_topics(formatted=False)

In [14]:
topics