## Importing Packages
### Dependency
#### 1. sklearn, pandas and spacy required
#### 2. Relies on paper data

In [1]:
from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import spacy

## Reading Data

In [2]:
# Load the pickled DataFrame from the current directory; the cells below read
# and rewrite its `text` column.
# NOTE(review): unpickling can execute arbitrary code, so df.pkl should only
# ever come from a trusted source.
df = pd.read_pickle("./df.pkl")

## Pre-processing

In [3]:
# Load the English pipeline by its full package name: the "en" shortcut link
# was removed in spaCy v3, where spacy.load("en") fails. The package name also
# works on spaCy v2 when the model is installed.
# The parser and NER components are disabled because this notebook only reads
# each token's lemma and part-of-speech tag.
nlp = spacy.load("en_core_web_sm", disable=['parser', 'ner'])
# Seed handed to the NMF model so the factorisation is repeatable.
random_state = 0

In [4]:
# keep noun words
def only_nouns(texts):
    """Reduce each text to the lemmas of its nouns.

    Every text is streamed through the module-level spaCy pipeline ``nlp``.
    From each resulting document only the tokens whose ``pos_`` equals
    ``'NOUN'`` are kept; they are replaced by their ``lemma_`` and joined
    with single spaces.

    Args:
        texts: iterable of strings to process.

    Returns:
        A list with one space-joined string per document yielded by
        ``nlp.pipe(texts)``.
    """
    return [
        " ".join(tok.lemma_ for tok in parsed if tok.pos_ == 'NOUN')
        for parsed in nlp.pipe(texts)
    ]


# Strip every document down to its noun lemmas, then store the result back in
# the same `text` column that the vectorizer reads later.
noun_texts = only_nouns(df['text'])
df['text'] = noun_texts

# Preview the first rows of the processed frame.
df.head()

Unnamed: 0,text
0,coronavirus outbreak point source exposure eve...
1,outbreak coronavirus people death cure covid-1...
2,background endemic week coronavirus novel outb...
3,airline network role importation disease infor...
4,beginning emergence outbreak coronavirus need ...


## Model Building

In [5]:
# number of topics to extract
n_topics = 3  # based on cluster size

# TF-IDF representation of the noun-only texts: English stop words removed,
# vocabulary capped at 5000 terms, and terms dropped when they occur in more
# than 95% of the documents or in fewer than 2 documents.
vec = TfidfVectorizer(
    stop_words="english",
    max_features=5000,
    min_df=2,
    max_df=0.95,
)
features = vec.fit_transform(df['text'])

# Factorise the TF-IDF matrix into `n_topics` components; the fixed seed keeps
# the result repeatable between runs.
cls = NMF(random_state=random_state, n_components=n_topics)
cls.fit(features)

NMF(alpha=0.0, beta_loss='frobenius', init=None, l1_ratio=0.0, max_iter=200,
    n_components=3, random_state=0, shuffle=False, solver='cd', tol=0.0001,
    verbose=0)

## Topic Identification

In [6]:
# list of unique words found by the vectorizer
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# use its replacement when available and fall back on older releases.
if hasattr(vec, "get_feature_names_out"):
    feature_names = vec.get_feature_names_out()
else:
    feature_names = vec.get_feature_names()

# number of most influencing words to display per topic
n_top_words = 15
print("Topic Details")
for i, topic_vec in enumerate(cls.components_):
    # Indices of the highest-weighted terms for this topic, strongest first.
    top_ids = topic_vec.argsort()[::-1][:n_top_words]
    print(i+1, end=' ')
    for fid in top_ids:
        print(feature_names[fid], end=' ')
    print()

Topic Details
1 study patient trial author manuscript research guideline statement datum participant material approval symptom consent body 
2 case number model epidemic transmission outbreak measure country city study datum time day reproduction control 
3 protein cell virus receptor drug coronavirus sequence infection host vaccine structure epitope spike human coronaviruse 


## 3 Identifiable topics
### 1: Trial studies: It covers the studies and trials currently under way to curb the pandemic.
### 2: Effect of coronavirus: It summarizes how the virus is affecting various countries and humanity as a whole.
### 3: Coronavirus Structure: It focuses on the virus composition, modus operandi, etc.