In [161]:
import pandas as pd
import glob
import json
import numpy as np 
import spacy
import re
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans
from nltk.tokenize import sent_tokenize, word_tokenize

In [162]:
# random functions
def get_random():
    files = glob.glob(r'CORD-19-research-challenge/**/pdf_json/*.json', recursive=True)
    random_files = np.random.choice(files, int(len(files)*.1))
    return random_files

In [164]:
# Write a file reader a file and tokenize the data.

def reader ():
    File = get_random()
    dataframe = {"title": [],"text_abstract":[], "text_body": []}
    dataframe = pd.DataFrame.from_dict(dataframe)
    for i, file in enumerate (File):
        tuples = {"title": None, "text_abstract": None,"text_body": None}
        with open(file) as json_data:
            
            data = json.load(json_data)
        
            tuples['title']=data['metadata']['title']
               
            abstract_text= []
            body_text = []
        
            for a in data['abstract']:
                abstract_text.append(a['text'])
                
       
            for b in data['body_text']:
                body_text.append(b['text'])
            

            body = "\n ".join(body_text)
            abstract = "\n". join(abstract_text)
            tuples["text_abstract"] = abstract
            tuples['text_body']=body 
            dataframe = dataframe.append(tuples, ignore_index=True)

    return dataframe

In [165]:
df = reader()

In [166]:
stop_words = nltk.corpus.stopwords.words('english')
stop_words
stop_words[:20]
len(stop_words)

179

In [167]:
def normalize(txt):
    #txt = reader()
    txt = re.sub(r'[^a-zA-Z0-9\s]', ' ', str(txt), re.I|re.A)
    txt = re.sub("(^|\W)\d+($|\W)", " ", txt)   #remove whitespace and numbers
    txt = txt.replace('title', '')       #remove 'title'
    txt = re.sub('[!#?%*&$)@^(.,-=+:";]', '', txt)       #remove punctuation
    txt = re.sub(r"\b[a-zA-Z]\b",'',txt)        #remove single letters
    txt = re.sub(r'\d+', '', txt)
    txt = re.sub(r'\\b[A-Z a-z 0-9._ - ]*[@](.*?)[.]{1,3} \\b', '', txt)#remove email
    txt = txt.replace('introduction', '')       #remove 'introduction'
    txt = txt.replace('text', '')
    txt = txt.replace('background', '')         #remove 'background'
    txt = txt.replace('abstract', '') 
    txt = txt.replace('\\n', ' ')   
    txt = txt.replace('\n', ' ')
    txt = txt.replace('///', ' ') 
    txt = txt.replace("'", '')
    txt = re.sub(r'^\w\w?$', '',txt) 
    txt = txt.lower()
    return txt

In [169]:
# Function for tokenization, lemmatization
def Senttokenizing(txt):
    sentence = normalize(txt)
    x = nltk.sent_tokenize(sentence)
    sent_tokens = [t for t in x if t not in stop_words]
    clean_token = ' '.join(sent_tokens)
    return clean_token



In [170]:
from nltk.stem import WordNetLemmatizer
def wordtokenizing(txt):
    wordnet_lemmatizer = WordNetLemmatizer()
    word = Senttokenizing(txt)
    tokens = nltk.word_tokenize(word)
    word_tokens = [t for t in tokens if t not in stop_words]
    lemmatized_word = [wordnet_lemmatizer.lemmatize(word) for word in word_tokens]
    return ' '.join(word_tokens)

In [None]:
df['title'] = df['title'].apply(lambda x: wordtokenizing(x))
df['text_abstract'] = df['text_abstract'].apply(lambda x: wordtokenizing(x))
df['text_body'] = df['text_body'].apply(lambda x: wordtokenizing(x))

In [171]:
normalize_corpus = np.vectorize(wordtokenizing, otypes=[np.ndarray])

In [172]:
norm_corpus = normalize_corpus(list(df['text_body']))

In [95]:
len(norm_corpus )

4594

In [176]:

tf = TfidfVectorizer(preprocessor = wordtokenizing,stop_words=stop_words)
tfidf_matrix = tf.fit_transform(norm_corpus)

tfidf_matrix.shape

(4594, 269169)

In [175]:
from sklearn.decomposition import PCA

pca = PCA(n_components=0.95, random_state=42)
X_reduced= pca.fit_transform(tfidf_matrix.toarray())
X_reduced.shape

MemoryError: 

In [177]:
from sklearn.cluster import KMeans

km = KMeans(n_clusters=10,init='k-means++')

km.fit(tfidf_matrix)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=10, n_init=10, n_jobs=None, precompute_distances='auto',
       random_state=None, tol=0.0001, verbose=0)

In [178]:
%time
%timeit
km.labels_

Wall time: 0 ns


array([8, 6, 8, ..., 6, 6, 3])

In [179]:
df['kmeans_cluster'] = km.labels_

In [181]:
from sklearn.metrics import silhouette_score
score_max = -1 #this is the minimum possible score
for k in range(1,10):
    silhout= silhouette_score(tfidf_matrix,  df['kmeans_cluster'])
    print ("For n_clusters = {}, silhouette score is {})".format(k,silhout))
   


For n_clusters = 1, silhouette score is 0.01544955542890592)
For n_clusters = 2, silhouette score is 0.01544955542890592)
For n_clusters = 3, silhouette score is 0.01544955542890592)
For n_clusters = 4, silhouette score is 0.01544955542890592)
For n_clusters = 5, silhouette score is 0.01544955542890592)
For n_clusters = 6, silhouette score is 0.01544955542890592)
For n_clusters = 7, silhouette score is 0.01544955542890592)
For n_clusters = 8, silhouette score is 0.01544955542890592)
For n_clusters = 9, silhouette score is 0.01544955542890592)


In [183]:
text_clusters = (df[['kmeans_cluster', 'text_body']]
                  .sort_values(by=['kmeans_cluster', 'text_body'], 
                               ascending=False)
                  .groupby('kmeans_cluster').head(20))

In [184]:
topn_features = 9
ordered_centroids = km.cluster_centers_.argsort()[:, ::-1]
ordered_centroids

array([[ 57441, 131699,  73495, ..., 174696, 174695, 134584],
       [179146,  41230, 207406, ..., 169985, 169984, 134584],
       [147563, 147570,  51510, ..., 177174, 177173, 134584],
       ...,
       [ 38776, 149349, 258522, ..., 166719, 166718, 134584],
       [102871, 198342,  64136, ..., 167234, 167232, 134584],
       [ 51581, 179146, 215394, ..., 176363, 176362, 134584]], dtype=int64)

In [185]:
feature_names = tf.get_feature_names()

In [None]:
for cluster_num in range(10):
    key_features = [feature_names[index] 
                        for index in ordered_centroids[cluster_num, :topn_features]]
    texts =text_clusters[text_clusters['kmeans_cluster'] == cluster_num]['text_body'].values.tolist()
    print('CLUSTER #'+str(cluster_num+1))
    print('-'*60)
    print('Key Features:', key_features)
    print('Text :', texts)
    print('-'*80)

CLUSTER #1
------------------------------------------------------------
Key Features: ['de', 'la', 'en', 'les', 'des', 'que', 'el', 'une', 'un']
Text : ['© ELSEVIER. Fotocopiar sin autorización es un delito. El principal motivo de consulta en pediatría lo constituyen las enfermedades infecciosas, fundamentalmente de vías respiratorias altas. Su etiología habitual es la vírica. Los niños pueden presentar habitualmente hasta 6-8 infecciones de vías respiratorias al año, sobre todo los prees colares y los que acuden a guarderías, con un número de episodios aún mayor. Los virus más frecuentemente asociados en enfermedad respiratoria en el niño son los rinovirus, parainfluenza, virus respiratorio sincitial (VRS), virus de la gripe y adenovirus. Otros virus de identificación reciente implicados en la patogénesis de infecciones de vías respiratorias son el bocavirus humano o los coronavirus.\n Suelen tener picos estacionales, siendo la mayoría más propios de meses fríos. Los adenovirus pueden

In [182]:
str1 = ''.join(text_clusters)


In [None]:
import networkx as nx
from sklearn.metrics.pairwise import cosine_similarity
for cluster_num in range(0,10):
    texts =text_clusters[text_clusters['kmeans_cluster'] == cluster_num]['text_body'].values.tolist()
    print('CLUSTER #'+str(cluster_num+1))
    print('-'*80)
    
       
    text_summary = wordtokenizing(texts)

    tf = TfidfVectorizer()
    text_summary  = [text_summary ]
    tf_mat = tf.fit_transform(text_summary)

    matrix = (tf_mat*tf_mat.T)
    nx_form =nx.from_scipy_sparse_matrix(matrix)

    ranks =nx.pagerank(nx_form)

    similairity = np.zeros([len(text_summary), len(text_summary)])
    from sklearn.metrics.pairwise import cosine_similarity
    cosin = cosine_similarity(fitting[0:1], fitting)
    trans = cosin  * cosin .T

    nx_graph = nx.from_numpy_array(trans)
    s = nx.pagerank(nx_graph)
    ranked_sentences = sorted(((s[i],n) for i,n in enumerate(text_summary)), reverse=True)
    print("Indexes of top ranked_sentence order are ", ranked_sentences)


CLUSTER #1
--------------------------------------------------------------------------------
Indexes of top ranked_sentence order are  [(1.0, 'elsevier fotocopiar sin autorizaci es un delito el principal motivo de consulta en pediatr lo constituyen las enfermedades infecciosas fundamentalmente de respiratorias altas su etiolog habitual es la rica los ni os pueden presentar habitualmente hasta infecciones de respiratorias al sobre todo los prees colares los que acuden guarder con un mero de episodios mayor los virus frecuentemente asociados en enfermedad respiratoria en el ni son los rinovirus parainfluenza virus respiratorio sincitial vrs virus de la gripe adenovirus otros virus de identificaci reciente implicados en la patog nesis de infecciones de respiratorias son el bocavirus humano los coronavirus suelen tener picos estacionales siendo la mayor propios de meses fr os los adenovirus pueden manifestarse de forma continua en cualquier poca del en muchos casos su presentaci puede acont

In [None]:
import networkx as nx
from sklearn.metrics.pairwise import cosine_similarity
for cluster_num in range(0,10):
    texts =text_clusters[text_clusters['kmeans_cluster'] == cluster_num]['text_body'].values.tolist()
    print('CLUSTER #'+str(cluster_num+1))
    print('-'*80)
    
       
    text_summary = wordtokenizing(texts)

    tf = TfidfVectorizer()
    text_summary  = [text_summary ]
    tf_mat = tf.fit_transform(text_summary)

    matrix = (tf_mat*tf_mat.T)
    nx_form =nx.from_scipy_sparse_matrix(matrix)

    ranks =nx.pagerank(nx_form)

    similairity = np.zeros([len(text_summary), len(text_summary)])
    from sklearn.metrics.pairwise import cosine_similarity
    cosin = cosine_similarity(fitting[0:1], fitting)
    trans = cosin  * cosin .T

    nx_graph = nx.from_numpy_array(trans)
    s = nx.pagerank(nx_graph)
    ranked_sentences = sorted(((s[i],n) for i,n in enumerate(text_summary)), reverse=True)
    
    for i in range(len(texts)):
        print(ranked_sentences[cluster_num][1],"\n")
    


CLUSTER #1
--------------------------------------------------------------------------------
elsevier fotocopiar sin autorizaci es un delito el principal motivo de consulta en pediatr lo constituyen las enfermedades infecciosas fundamentalmente de respiratorias altas su etiolog habitual es la rica los ni os pueden presentar habitualmente hasta infecciones de respiratorias al sobre todo los prees colares los que acuden guarder con un mero de episodios mayor los virus frecuentemente asociados en enfermedad respiratoria en el ni son los rinovirus parainfluenza virus respiratorio sincitial vrs virus de la gripe adenovirus otros virus de identificaci reciente implicados en la patog nesis de infecciones de respiratorias son el bocavirus humano los coronavirus suelen tener picos estacionales siendo la mayor propios de meses fr os los adenovirus pueden manifestarse de forma continua en cualquier poca del en muchos casos su presentaci puede acontecer en forma de brotes epid micos las modernas cn

In [None]:
for cluster_num in range(0,10):
    key_features = [feature_names[index] 
                        for index in ordered_centroids[cluster_num, :topn_features]]
    texts =text_clusters[text_clusters['kmeans_cluster'] == cluster_num]['text_body'].values.tolist()
    print('CLUSTER #'+str(cluster_num+1))
 
    print('-'*80)
    nlp = spacy.load("en_core_web_sm")

# add PyTextRank to the spaCy pipeline
    tr = pytextrank.TextRank()
    nlp.add_pipe(tr.PipelineComponent, name="textrank", last=True)

    doc = nlp(str(texts))
        
     
    for token in doc._.phrases:
        print(token.text, '|', token.rank)


CLUSTER #1
--------------------------------------------------------------------------------
de la patogenicidad de infección y la tasa de | 0.1336106932890917
de croissance de la | 0.1326728174804603
de sevrage de la | 0.13265869969647384
de standardisation de la | 0.13265776916017938
de moduler la date de la | 0.1306304370944047
ou la n6gation de la maladie de la part | 0.12512054461045058
de entrada de la infección | 0.12510685655034756
la pandemia de gripe de | 0.12498592378560215
de incubación de | 0.1249420376052109
la patogénesis de infecciones de | 0.12490110099587722
de la tasa de ataque | 0.12488339790816752
de consolidación de | 0.12488053397071625
des recommandations de la conférence de consensus de 1998.\n | 0.12313779293815016
que dependerán de la evolución de la epidemia | 0.12292273807040888
la transmisión de la | 0.1222823576785202
la voie de la | 0.12219107161418558
la fréquence de la | 0.1221167360689038
la intensidad de la | 0.12209462449779464
la rapidité de la | 0.

In [None]:
for cluster_num in range(0,10):
    key_features = [feature_names[index] 
                        for index in ordered_centroids[cluster_num, :topn_features]]
    texts =text_clusters[text_clusters['kmeans_cluster'] == cluster_num]['text_body'].values.tolist()
    print('CLUSTER #'+str(cluster_num+1))
 
    print('-'*80)
    nlp = spacy.load("en_core_web_sm")

# add PyTextRank to the spaCy pipeline
    tr = pytextrank.TextRank()
    nlp.add_pipe(tr.PipelineComponent, name="textrank", last=True)

    doc = nlp(str(texts))

    for sent in doc._.textrank.summary(limit_phrases=20, limit_sentences=5):
        print(sent)