In [142]:
import pandas as pd 
from elasticsearch import helpers, Elasticsearch
import re
import string
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer,WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer 

# Loading CSV

In [143]:
#fil = pd.read_csv('C:\\elastic_stack\\archive\\metadata.csv')
#data = fil[0:1000]
#data.to_csv('C:\\elastic_stack\\archive\\metadata_mini.csv')
data = pd.read_csv('C:\\elastic_stack\\archive\\metadata_mini.csv')

In [144]:
data = data[['cord_uid','source_x','title','abstract','publish_time','authors','journal','url']]
data = data.dropna()

In [145]:
#here cord_id is unique id, so can be used as an index 
#data['cord_uid'].is_unique

#data12 = data[1:15]
data1 = data.to_dict("records")

In [152]:
#data['cord_uid'].is_unique
#data.shape

False

# Indexing to Elastic Search

In [147]:
mapping = {
    "settings": {
        "number_of_shards": 2,
        "number_of_replicas": 1
    },
    "mappings": {
         "properties": {
                  "cord_uid": {
                      "type": "keyword"
                      },
                  "source_x": {
                      "type": "keyword",
                  },
                  "title": {
                      "type": "text"
                  },
                  "abstract": {
                      "type": "text",
                  },
                  "publish_time": {
                      "type": "date",
                  },
                  "authors": {
                      "type": "text",
                  },
                  "journal": {
                      "type" :"keyword"
                  },
                  "url": {
                      "type" :"keyword"
                  }
              }
    }
}

In [148]:
ENDPOINT = 'http://localhost:9200/'
es = Elasticsearch(hosts=ENDPOINT)
indexName = 'cord19'
es.indices.delete(index=indexName, ignore=[400, 404])
es.indices.create(index=indexName,body=mapping, ignore=400)


{'acknowledged': True, 'shards_acknowledged': True, 'index': 'cord19'}

In [149]:
def generator(data1):
    for ind, line in enumerate(data1):
        yield {
            '_index':indexName,
            '_type':'_doc',
            '_id':line.get("cord_uid",None),
            '_source':{
                "cord_uid":line.get("cord_uid",""),
                "source_x":line.get("source_x",""),
                "title":line.get("title",""),
                "abstract":line.get("abstract",""),
                "publish_time":line.get("publish_time",None),
                "authors":line.get("authors",""),
                "journal":line.get("journal",""),
                "url":line.get("url",""),
            }
        }

In [150]:
try:
    resp = helpers.bulk(es, generator(data1))
    print('indexing success')
except Exception as e:
    print(e.__class__)

indexing success




# Preprocessing

In [164]:
req_get_all = '''{"size" : 1000,"query": {"match_all": {}}}'''
results = es.search(index=indexName, body=req_get_all)['hits']['hits']


954

In [165]:
dict_list = [record.get("_source") for record in results]

In [168]:
data_retrieved = pd.DataFrame(dict_list)

In [211]:
data_retrieved.shape

(954, 9)

In [171]:
def preProcessing(text):
    #lowercase
    text = text.lower()
    
    #remove numbers 
    text = text.translate(str.maketrans('', '', string.digits))
    #remove punctauation
    text= text.translate(str.maketrans("","", string.punctuation))
    
    #white space removal
    text = text.strip()
    text = re.sub(r'[^a-zA-z0-9.,!?/:;\"\'\s]', '', text)
    #Tokenization
    tokenize_text = word_tokenize(text)
    
    #remove stopwords
    stop_words = set(stopwords.words("english")) 
    text = [token for token in tokenize_text if not token in stop_words]
    
    #Morphological Normalization
    stemmer = PorterStemmer()
    text_stemmed  = [stemmer.stem(word) for word in text]
    
    #lem = WordNetLemmatizer()
    #text_lem  = [lem.lemmatize(word) for word in text_stemmed]

    
    return ' '.join(text_stemmed)
    

In [181]:
data_retrieved['text'] = data_retrieved['title'] + data_retrieved['abstract']
data_retrieved['text'] = data_retrieved['text'].apply(lambda x:preProcessing(x))
#docs = data['text'].tolist()

In [183]:
docs_1 = [dat for dat in data_retrieved['text']]
#data_retrieved.head()

In [202]:
tfidf_vectorizer=TfidfVectorizer(use_idf=True)
 
# just send in all your docs here
fitted_vectorizer=tfidf_vectorizer.fit(docs)
tfidf_vectorizer_vectors=fitted_vectorizer.transform(docs)

In [203]:
lis = fitted_vectorizer.get_feature_names()
df = pd.DataFrame(list(tfidf_vectorizer_vectors.toarray()),index=data_retrieved['cord_uid'],columns=lis)

In [212]:
df.shape

(954, 12110)

In [13]:
#req_get_all = '''{"query": {"match_all": {}}}'''
#results = es.search(index=indexName, body=req_get_all)['hits']['hits']

In [14]:
#dis=dict(results[0])

In [31]:
stri = ['hello','welcome','in']
print(' '.join(stri))

hello welcome in
