In [1]:
#!conda install elasticsearch -y

In [2]:
import csv
import json
import os
import sys
from datetime import datetime
from os import listdir
from os.path import isfile, join

import certifi
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk

# Connect to your Elastic Cloud

In [3]:
# Connection settings are read from the environment when available so that
# real secrets never have to be saved inside the notebook. The placeholder
# values are used only when the corresponding variable is not set.
USER = os.environ.get("ES_USER", "elastic")
PASSWORD = os.environ.get("ES_PASSWORD", "your password")
CloudId = os.environ.get("ES_CLOUD_ID", "your cloud id")

# Client shared by the cells below.
ES = Elasticsearch(
    cloud_id=CloudId,
    http_auth=(USER, PASSWORD),
)
# Last expression of the cell: displays whether the cluster answered the ping.
ES.ping()


True

# Format Datetime

In [4]:
def mk_date(date):
    """Convert a report timestamp string to an ISO-8601 string.

    The input is matched against a fixed list of layouts instead of being
    patched with a textual ``"/20" -> "/2020"`` substitution: that substitution
    also rewrote day-of-month values starting with ``20`` (``"1/20/20 16:50"``)
    and years that already had four digits, so such inputs could not be parsed.

    Args:
        date: Timestamp text, e.g. ``"1/30/20 16:50"``.

    Returns:
        ``datetime.isoformat()`` of the parsed value, or ``date`` unchanged
        when it matches none of the known layouts.
    """
    known_formats = (
        '%m/%d/%y %H:%M',       # 1/30/20 16:50
        '%m/%d/%Y %H:%M',       # 1/30/2020 16:50
        '%Y-%m-%dT%H:%M:%S',    # 2020-01-30T16:50:00
        '%Y-%m-%d %H:%M:%S',    # 2020-01-30 16:50:00
    )
    text = date.strip()
    for fmt in known_formats:
        try:
            return datetime.strptime(text, fmt).isoformat()
        except ValueError:
            # Not this layout; try the next one.
            continue
    # Best effort: hand back the caller's text untouched rather than raising.
    return date
mk_date("1/30/20 16:50")

'2020-01-30T16:50:00'

# Generate Timestamp

In [5]:
def gen_timestamp(file_name):
    """Derive an ISO-8601 timestamp from a ``MM-DD-YYYY.csv`` file name.

    Args:
        file_name: Name such as ``'01-22-2020.csv'``.

    Returns:
        The date encoded in the name as an ISO string at midnight.

    Raises:
        ValueError: If the name (minus ``.csv``) is not ``MM-DD-YYYY``.
    """
    date_part = file_name.replace('.csv', '')
    parsed = datetime.strptime(date_part, '%m-%d-%Y')
    return parsed.isoformat()

gen_timestamp('01-22-2020.csv')

'2020-01-22T00:00:00'

# Turn strings into numbers (int / float)

In [6]:
def mk_int(s):
    """Parse a CSV cell as an integer, treating a blank cell as 0.

    Args:
        s: Cell text; surrounding whitespace is ignored.

    Returns:
        ``int(s)`` for non-blank text, otherwise ``0``.

    Raises:
        ValueError: If the non-blank text is not a valid integer literal.
    """
    text = s.strip()
    if not text:
        return 0
    return int(text)

def mk_float(s):
    """Parse a CSV cell as a float, treating a blank cell as 0.0.

    A blank cell yields the float ``0.0`` (not the int ``0``) so that the
    function always returns the same type, matching the ``float(0)`` defaults
    used for these fields.

    Args:
        s: Cell text; surrounding whitespace is ignored.

    Returns:
        ``float(s)`` for non-blank text, otherwise ``0.0``.

    Raises:
        ValueError: If the non-blank text is not a valid float literal.
    """
    s = s.strip()
    return float(s) if s else 0.0

# read CSV file

In [7]:
def _default_report(file_name):
    """Build a report dictionary pre-filled with the default for every field."""
    return {
        'FIPS': int(0),
        'Admin2': 'NA',
        '@timestamp': gen_timestamp(file_name),
        'Active': int(0),
        'coordinates': {'lat': float(0), 'lon': float(0)},
        'Combined_Key': 'NA',
        'Incidence_Rate': float(0),
        'Case-Fatality_Ratio': float(0),
        'Province/State': 'NA',
        'Country/Region': 'NA',
        'Last Update': mk_date("0/00/20 00:00"),
        'Confirmed': int(0),
        'Deaths': int(0),
        'Recovered': int(0),
    }


def read_csv(path, file_name):
    """Parse one daily-report CSV file into a list of report dictionaries.

    The first row is treated as a header and skipped. Every following row
    produces one dictionary that starts from the defaults and is then filled
    according to the number of columns in the row:

    * 6 columns  - Province/State, Country/Region, Last Update, Confirmed,
      Deaths, Recovered.
    * 8 columns  - the same six fields followed by latitude and longitude.
    * 14 columns - FIPS, Admin2, Province/State, Country/Region, Last Update,
      lat, lon, Confirmed, Deaths, Recovered, Active, Combined_Key,
      Incidence_Rate, Case-Fatality_Ratio.

    Rows of any other width keep the default values.
    NOTE(review): confirm whether other column layouts exist in the data set.

    Args:
        path: Directory containing the file.
        file_name: File name in ``MM-DD-YYYY.csv`` form; also the source of
            the ``@timestamp`` field.

    Returns:
        A list with one dictionary per data row.
    """
    country_reports = []
    # newline='' is what the csv module asks for so that line breaks inside
    # quoted fields are handled by the reader itself.
    with open(path + "/" + file_name, newline='') as f:
        f_reader = csv.reader(f, delimiter=',')
        # Skip the header row (no-op for an empty file).
        next(f_reader, None)
        for row in f_reader:
            # Set defaults.
            country_report = _default_report(file_name)

            try:
                # Formats 1 and 2 share the same first six columns.
                if len(row) in (6, 8):
                    country_report['Province/State'] = row[0]
                    country_report['Country/Region'] = row[1]
                    country_report['Last Update'] = mk_date(row[2])
                    country_report['Confirmed'] = mk_int(row[3])
                    country_report['Deaths'] = mk_int(row[4])
                    country_report['Recovered'] = mk_int(row[5])
                # Format 2 additionally carries the coordinates.
                if len(row) == 8:
                    country_report['coordinates'] = {'lat': mk_float(row[6]), 'lon': mk_float(row[7])}
                # Format 3.
                if len(row) == 14:
                    country_report['FIPS'] = mk_int(row[0])
                    country_report['Admin2'] = row[1]
                    country_report['Province/State'] = row[2]
                    country_report['Country/Region'] = row[3]
                    country_report['Last Update'] = mk_date(row[4])
                    country_report['coordinates'] = {'lat': mk_float(row[5]), 'lon': mk_float(row[6])}
                    country_report['Confirmed'] = mk_int(row[7])
                    country_report['Deaths'] = mk_int(row[8])
                    country_report['Recovered'] = mk_int(row[9])
                    country_report['Active'] = mk_int(row[10])
                    country_report['Combined_Key'] = row[11]
                    country_report['Incidence_Rate'] = mk_float(row[12])
                    country_report['Case-Fatality_Ratio'] = mk_float(row[13])
            except ValueError:
                # A numeric cell could not be converted. Report the offending
                # row and keep going; fields assigned before the failure stay
                # set, the remaining ones keep their defaults.
                print(path +"/"+ file_name)
                print(row)

            country_reports.append(country_report)

    return country_reports
    

def get_csv(path):
    """List the CSV files located directly inside ``path``.

    Only regular files whose name ends with ``.csv`` are returned; a plain
    substring test would also pick up names that merely contain ``csv``.
    The result is sorted by name because ``listdir`` returns entries in
    arbitrary order.

    Args:
        path: Directory to scan (not recursive).

    Returns:
        Sorted list of file names (without the directory part).
    """
    return sorted(
        f for f in listdir(path)
        if f.endswith('.csv') and isfile(join(path, f))
    )



# Index into Elasticsearch

In [8]:
def extract_date(file_name):
    """Turn a ``MM-DD-YYYY.csv`` file name into a ``DD.MM.YYYY`` string.

    The name is split on ``-``, the first two parts are swapped, the parts are
    joined with ``.`` and any ``.csv`` is removed from the result.
    """
    parts = file_name.split('-')
    reordered = f"{parts[1]}.{parts[0]}.{parts[2]}"
    return reordered.replace('.csv', '')
        
    
def index_daily_report(file_name, country_reports, elasticsearch, index_name='coronavirus-'):
    """Bulk-index the reports of one daily file and print the bulk result.

    Args:
        file_name: Source file name; used to build the index name when
            ``index_name`` is left at its default.
        country_reports: Iterable of report dictionaries to index.
        elasticsearch: Client passed to ``bulk``. The supplied client is now
            actually used instead of the notebook-level ``ES`` global.
        index_name: Target index. The default ``'coronavirus-'`` is treated
            as a prefix and completed with ``extract_date(file_name)``; any
            other value is used as given.
    """
    if index_name == 'coronavirus-':
        index_name = index_name + extract_date(file_name)
    bulk_list = [{
        '_source': country_report,
        '_op_type': 'index',
        '_index': index_name,
        '_type': '_doc'
    } for country_report in country_reports]

    res = bulk(elasticsearch, bulk_list)
    print(res)
    

# main section

In [9]:
# Directory holding the daily-report CSV files (relative path).
PATH = 'COVID-19/csse_covid_19_data/csse_covid_19_daily_reports'
# Names of the CSV files found directly inside PATH.
csvs =  get_csv(PATH)
# Parse every file and bulk-index its rows; index_daily_report prints the
# bulk result for each file.
for file_name in csvs:
    country_reports = read_csv(PATH, file_name)
    index_daily_report(file_name, country_reports,ES)
    



(38, [])
(46, [])
(41, [])
(44, [])
(47, [])
(51, [])
(52, [])
(54, [])
(58, [])
(62, [])
(67, [])
(67, [])
(68, [])
(70, [])
(71, [])
(71, [])
(72, [])
(72, [])
(72, [])
(72, [])
(73, [])
(73, [])
(74, [])
(75, [])
(75, [])
(75, [])
(75, [])
(75, [])
(76, [])
(76, [])
(84, [])
(84, [])
(85, [])
(90, [])
(94, [])
(101, [])
(105, [])
(114, [])
(119, [])
(125, [])
(141, [])
(151, [])
(160, [])
(173, [])
(199, [])
(225, [])
(255, [])
(266, [])
(206, [])
(216, [])
(218, [])
(230, [])
(249, [])
(258, [])
(272, [])
(276, [])
(284, [])
(292, [])
(299, [])
(304, [])
(3417, [])
(3413, [])
(3416, [])
(3420, [])
(3421, [])
(3429, [])
(3430, [])
(3434, [])
(3439, [])
(2434, [])
(2483, [])
(2569, [])
(2624, [])
(2678, [])
(2763, [])
(2808, [])
(2856, [])
(2882, [])
(2910, [])
(2941, [])
(2965, [])
(2988, [])
(3001, [])
(3013, [])
(3026, [])
(3041, [])
(3045, [])
(3053, [])
(3072, [])
(3081, [])
(3091, [])
(3099, [])
(3120, [])
(3128, [])
(3134, [])
(3142, [])
(3152, [])
(3163, [])
(3168, [])
(3177,