# EDA and ETL for scraped data from IPM and AskExtension data knowledge

In [1]:
import os
import re

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

COLOR = 'white'
plt.rcParams['text.color'       ] = COLOR
plt.rcParams['text.color'       ] = COLOR
plt.rcParams['axes.labelcolor'  ] = COLOR
plt.rcParams['xtick.color'      ] = COLOR
plt.rcParams['ytick.color'      ] = COLOR

# IPM data - December 2021 Scrape

In [2]:
'''
['exoticPests.json',
 'fruitItems_new.json',
 'fruitVeggieEnvironItems_new.json',
 'pestDiseaseItems_new.json',
 'plantFlowerItems.json',
 'turfPests.json',
 'veggieItems_new.json',
 'weedItems.json']
'''
_PATH = '../data/'
DATA_FILE_NAMES = sorted(os.listdir(_PATH))
DATA_FILE_NAMES

['FruitVegCulturalItems.json',
 'GardenControlsPestItems.json',
 'GardenControlsPesticideItems.json',
 'PestNotes.json',
 'QuickTips.json',
 'Videos.json',
 'WeedIdItems.json',
 'exoticPests.json',
 'fruitItems_new.json',
 'fruitVeggieEnvironItems_new.json',
 'pestDiseaseItems_new.json',
 'plantFlowerItems.json',
 'turfPests.json',
 'veggieItems_new.json',
 'weedItems.json']

## ETL of data

In [3]:
finalDf = pd.DataFrame()
cols = ['source', 'url', 'title', 'description', 'identification', 'development', 'damage', 'management', 'links']


def pestsDiseases():
    # -------------------------------------------- Pests diseases
    print(f'Merging pests diseases...')
    FILE_NAME = 'pestDiseaseItems_new.json'
    df = pd.read_json(_PATH + FILE_NAME)
    '''
    columns in source:
    name url description identification life_cycle damage solutions images
    final schema:
    source url title description identification development damage management links
    '''

    df['source'] = 'pestsDiseases'

    df.rename(columns = {
        'name'          : 'title'       ,
        'life_cycle'    : 'development' ,
        'damagePestNote': 'damage'      ,
        'solutions'     : 'management'  ,
        'images'        : 'links'       ,
    }, inplace = True)


    df['links'] = df.apply(lambda r: [
        {
            'type'  : 'image'   , 
            'src'   : i['src']  , 
            'link'  : i['link'] ,
            'title' : r['title'] + ' - ' + i['caption']
        } for i in r['links'] if len(i['caption']) > 0], axis = 1)

    df = df[cols]

    return df

df      = pestsDiseases()
finalDf = pd.concat([finalDf, df], ignore_index = True, axis = 0)
print(f'Final dataframe shape: {finalDf.shape}')
print(f'------------------------------------------------')


def pestsTurf():
    # -------------------------------------------- Turf pests
    print(f'Merging turf pests...')
    FILE_NAME = 'turfPests.json'
    df = pd.read_json(_PATH + FILE_NAME)
    '''
    columns in source:
    name url text images
    final schema:
    source url title description identification development damage management links
    '''

    df['source'         ] = 'pestsTurf'
    df['identification' ] = ''
    df['development'    ] = ''
    df['damage'         ] = ''
    df['management'     ] = ''

    df.rename(columns = {
        'name'  : 'title'       ,
        'text'  : 'description' ,
        'images': 'links'       ,
    }, inplace = True)


    df['links'] = df.apply(lambda r: [
        {
            'type'  : 'image'   , 
            'src'   : i['src']  , 
            'link'  : i['link'] ,
            'title' : r['title'] + ' - ' + i['caption']
        } for i in r['links'] if len(i['caption']) > 0], axis = 1)

    df = df[cols]

    return df

df      = pestsTurf()
finalDf = pd.concat([finalDf, df], ignore_index = True, axis = 0)
print(f'Final dataframe shape: {finalDf.shape}')
print(f'------------------------------------------------')


def pestsExotic():
    # -------------------------------------------- Exotic pests
    print(f'Merging exotic pests...')
    FILE_NAME = 'exoticPests.json'
    df = pd.read_json(_PATH + FILE_NAME)
    '''
    columns in source:
    name url description damage identification life_cycle monitoring management related_links images
    final schema:
    source url title description identification development damage management links
    '''

    df['source'] = 'pestsExotic'

    df.rename(columns = {
        'name'      : 'title'       ,
        'life_cycle': 'development' ,
        'images'    : 'links'       ,
    }, inplace = True)

    df['links'] = df['links'].apply(lambda d: d if isinstance(d, list) else [])

    df['links'] = df.apply(lambda r: [
        {
            'type'  : 'image'   , 
            'src'   : i['src']  , 
            'link'  : i['link'] ,
            'title' : r['title'] + ' - ' + i['caption']
        } for i in r['links'] if len(i['caption']) > 0], axis = 1)
    
    df['related_links'] = df.apply(lambda r: [
        {
            'type'  : 'page'   , 
            'src'   : i['link'] , 
            'link'  : ''        ,
            'title' : r['title'] + ' - ' + i['text']
        } for i in r['related_links'] if len(i['text']) > 0], axis = 1)

    df.apply(lambda x: x['links'].extend(x['related_links' ]), axis = 1)

    df = df[cols]

    return df

df      = pestsExotic()
finalDf = pd.concat([finalDf, df], ignore_index = True, axis = 0)
print(f'Final dataframe shape: {finalDf.shape}')
print(f'------------------------------------------------')


def damagesEnvironment():
    # -------------------------------------------- Fruit and veggie damages
    print(f'Merging fruit and veggie damages...')
    FILE_NAME = 'fruitVeggieEnvironItems_new.json'
    df = pd.read_json(_PATH + FILE_NAME)
    '''
    columns in source:
    name url description identification damage disorder_development solutions images
    final schema:
    source url title description identification development damage management links
    '''

    df['source'] = 'damagesEnvironment'

    df.rename(columns = {
        'name'                  : 'title'       ,
        'disorder_development'  : 'development' ,
        'solutions'             : 'management' ,
        'images'                : 'links'      ,
    }, inplace = True)

    df['links'] = df.apply(lambda r: [
        {
            'type'  : 'image'   , 
            'src'   : i['src']  , 
            'link'  : i['link'] ,
            'title' : r['title'] + ' - ' + i['caption']
        } for i in r['links'] if len(i['caption']) > 0], axis = 1)

    df = df[cols]

    return df

df      = damagesEnvironment()
finalDf = pd.concat([finalDf, df], ignore_index = True, axis = 0)
print(f'Final dataframe shape: {finalDf.shape}')
print(f'------------------------------------------------')


def damagesWeed():
    # -------------------------------------------- Weed damages
    print(f'Merging weed damages...')
    FILE_NAME = 'weedItems.json'
    df = pd.read_json(_PATH + FILE_NAME)
    '''
    columns in source:
    name url description images
    final schema:
    source url title description identification development damage management links
    '''

    df['source'         ] = 'damagesWeed'
    df['identification' ] = ''
    df['development'    ] = ''
    df['damage'         ] = ''
    df['management'     ] = ''

    df.rename(columns = {
        'name'  : 'title',
        'images': 'links'
    }, inplace = True)

    df['links'] = df.apply(lambda r: [
        {
            'type'  : 'image'   , 
            'src'   : i['link'] , 
            'link'  : ''        ,
            'title' : r['title'] + ' - ' + i['caption']
        } for i in r['links'] if len(i['caption']) > 0], axis = 1)

    df = df[cols]

    return df

df      = damagesWeed()
finalDf = pd.concat([finalDf, df], ignore_index = True, axis = 0)
print(f'Final dataframe shape: {finalDf.shape}')
print(f'------------------------------------------------')


def infoFruits():
    # -------------------------------------------- Fruits information
    print(f'Merging fruits information...')
    FILE_NAME = 'fruitItems_new.json'
    df = pd.read_json(_PATH + FILE_NAME)
    '''
    columns in source:
    name url cultural_tips pests_and_disorders
    final schema:
    source url title description identification development damage management links
    '''

    df['source'         ] = 'infoFruits'
    df['description'    ] = ''
    df['identification' ] = ''
    df['development'    ] = ''
    df['damage'         ] = ''
    df['management'     ] = ''
    
    df.rename(columns = {
        'name'          : 'title',
        'cultural_tips' : 'links'
    }, inplace = True)

    df['links'] = df['links'].apply(lambda d: d if isinstance(d, list) else [])

    df['links'] = df.apply(lambda r: [
        {
            'type'  : 'tip'     , 
            'src'   : i['link'] , 
            'link'  : ''        ,
            'title' : r['title'] + ' - ' + i['tip']
        } for i in r['links'] if len(i['tip']) > 0], axis = 1)

    df['pests_and_disorders'] = df.apply(lambda r: [
        {
            'type'  : 'problem' , 
            'src'   : i['link'] , 
            'link'  : ''        ,
            'title' : r['title'] + ' - ' + i['problem']
        } for i in r['pests_and_disorders'] if len(i['problem']) > 0], axis = 1)
    
    df.apply(lambda x: x['links'].extend(x['pests_and_disorders' ]), axis = 1)

    df = df[cols]
    return df

df      = infoFruits()
finalDf = pd.concat([finalDf, df], ignore_index = True, axis = 0)
print(f'Final dataframe shape: {finalDf.shape}')
print(f'------------------------------------------------')


def infoVeggies():
    # -------------------------------------------- Veggies information
    print(f'Merging veggies information...')
    FILE_NAME = 'veggieItems_new.json'
    df = pd.read_json(_PATH + FILE_NAME)
    '''
    columns in source:
    name url description tips images pests_and_disorders
    final schema:
    source url title description identification development damage management links
    '''

    df['source'         ] = 'infoVeggies'
    df['identification' ] = ''
    df['development'    ] = ''
    df['damage'         ] = ''
    
    df.rename(columns = {
        'name'  : 'title'       ,
        'tips'  : 'management'  ,
        'images': 'links'       ,
    }, inplace = True)

    df['links'] = df.apply(lambda r: [
        {
            'type'  : 'image'   , 
            'src'   : i['src']  , 
            'link'  : i['link'] ,
            'title' : r['title'] + ' - ' + i['caption']
        } for i in r['links'] if len(i['caption']) > 0], axis = 1)

    df['pests_and_disorders'] = df.apply(lambda r: [
        {
            'type'  : 'problem' , 
            'src'   : i['link'] , 
            'link'  : ''        ,
            'title' : r['title'] + ' - ' + i['problem']
        } for i in r['pests_and_disorders'] if len(i['problem']) > 0], axis = 1)

    df.apply(lambda x: x['links'].extend(x['pests_and_disorders' ]), axis = 1)

    df = df[cols]

    return df

df      = infoVeggies()
finalDf = pd.concat([finalDf, df], ignore_index = True, axis = 0)
print(f'Final dataframe shape: {finalDf.shape}')
print(f'------------------------------------------------')


def infoFlowers():
    # -------------------------------------------- Flowers information
    print(f'Merging flowers information...')
    FILE_NAME = 'plantFlowerItems.json'
    df = pd.read_json(_PATH + FILE_NAME)
    '''
    columns in source:
    name url identification optimum_conditions pests_and_disorders images
    final schema:
    source url title description identification development damage management links
    '''

    df['source'         ] = 'infoFlowers'
    df['description'    ] = ''
    df['development'    ] = ''
    df['damage'         ] = ''

    df.rename(columns = {
        'name'              : 'title'       ,
        'optimum_conditions': 'management'  ,
        'images'            : 'links'       ,
    }, inplace = True)

    df['links'] = df.apply(lambda r: [
        {
            'type'  : 'image'   , 
            'src'   : i['src']  , 
            'link'  : i['link'] ,
            'title' : r['title'] + ' - ' + i['caption']
        } for i in r['links'] if len(i['caption']) > 0], axis = 1)

    df['pests_and_disorders'] = df.apply(lambda r: [
        {
            'type'  : 'problem' , 
            'src'   : i['link'] , 
            'link'  : ''        ,
            'title' : r['title'] + ' - ' + i['problem']
        } for i in r['pests_and_disorders'] if len(i['problem']) > 0], axis = 1)

    df.apply(lambda x: x['links'].extend(x['pests_and_disorders' ]), axis = 1)

    df = df[cols]

    return df

df      = infoFlowers()
finalDf = pd.concat([finalDf, df], ignore_index = True, axis = 0)
print(f'Final dataframe shape: {finalDf.shape}')
print(f'------------------------------------------------')


def _clean(text):
    '''
    Fix encodings and remove escape and redundant whitespace characters from text.
    '''
    text = text.encode('ascii', 'ignore').decode()
    text = re.sub(r'\s+', ' ', text).strip()
    return text

colsVector = ['title', 'description', 'identification', 'development', 'damage', 'management']
for c in colsVector:
    finalDf[c] = finalDf[c].apply(_clean)

print(f'Fix encodings and remove escape and redundant whitespace characters from text.')
print(f'------------------------------------------------')

print(f'Final dataframe shape: {finalDf.shape    }')
print(f'FINISHED')

finalDf.sample(5)

Merging pests diseases...
Final dataframe shape: (519, 9)
------------------------------------------------
Merging turf pests...
Final dataframe shape: (558, 9)
------------------------------------------------
Merging exotic pests...
Final dataframe shape: (589, 9)
------------------------------------------------
Merging fruit and veggie damages...
Final dataframe shape: (812, 9)
------------------------------------------------
Merging weed damages...
Final dataframe shape: (986, 9)
------------------------------------------------
Merging fruits information...
Final dataframe shape: (1001, 9)
------------------------------------------------
Merging veggies information...
Final dataframe shape: (1032, 9)
------------------------------------------------
Merging flowers information...
Final dataframe shape: (1218, 9)
------------------------------------------------
Fix encodings and remove escape and redundant whitespace characters from text.
----------------------------------------------

Unnamed: 0,source,url,title,description,identification,development,damage,management,links
11,pestsDiseases,http://ipm.ucanr.edu/PMG/GARDEN/VEGES/PESTS/wi...,Wireworms,"Wireworm larvae are slender, cylindrical insec...",,Common wireworm species require 3 to 4 years t...,Wireworm larvae injure sprouting seeds and see...,"Prior to planting, flooding an area can help r...","[{'type': 'image', 'src': 'http://ipm.ucanr.ed..."
98,pestsDiseases,http://ipm.ucanr.edu/PMG/GARDEN/VEGES/DISEASES...,Bacterial soft rots of onions and garlic,Bacterial soft rots are primarily a problem on...,,,,Free water is essential for entry and spread o...,"[{'type': 'image', 'src': 'http://ipm.ucanr.ed..."
109,pestsDiseases,http://ipm.ucanr.edu/PMG/GARDEN/VEGES/DISEASES...,Fusarium wilt on tomatoes,Plants infected with the Fusarium fungus turn ...,Both Fusarium and Verticillium wilt cause leaf...,Both Fusarium and Verticillium form resistant ...,,Fusarium wilt of tomatoes can be avoided in ma...,"[{'type': 'image', 'src': 'http://ipm.ucanr.ed..."
858,damagesWeed,http://ipm.ucanr.edu/PMG/WEEDS/thymeleaf_speed...,Thymeleaf speedwell,"Thymeleaf speedwell, a perennial broadleaf pla...",,,,,"[{'type': 'image', 'src': 'http://ipm.ucanr.ed..."
1213,infoFlowers,http://ipm.ucanr.edu/PMG/GARDEN/FLOWERS/carnat...,Carnation,,Carnations are perennial plants grown as borde...,,,"Carnations require rich, well-drained soil. Th...","[{'type': 'image', 'src': 'http://ipm.ucanr.ed..."


# IPM data - April 2022 Scrape

In [4]:
'''
['FruitVegCulturalItems.json',
 'GardenControlsPestItems.json',
 'GardenControlsPesticideItems.json',
 'PestNotes.json',
 'QuickTips.json',
 'Videos.json',
 'WeedIdItems.json']
'''
_PATH = '../data/'
DATA_FILE_NAMES = sorted(os.listdir(_PATH))
DATA_FILE_NAMES

['FruitVegCulturalItems.json',
 'GardenControlsPestItems.json',
 'GardenControlsPesticideItems.json',
 'PestNotes.json',
 'QuickTips.json',
 'Videos.json',
 'WeedIdItems.json',
 'exoticPests.json',
 'fruitItems_new.json',
 'fruitVeggieEnvironItems_new.json',
 'pestDiseaseItems_new.json',
 'plantFlowerItems.json',
 'turfPests.json',
 'veggieItems_new.json',
 'weedItems.json']

## ETL of data

In [5]:
cols = ['source', 'url', 'title', 'description', 'identification', 'development', 'damage', 'management', 'links']

def infoFruitVegCultural():
    # -------------------------------------------- Fruit and veggie cultural tips
    print(f'Merging fruit and veggie cultural tips..')
    FILE_NAME = 'FruitVegCulturalItems.json'
    df = pd.read_json(_PATH + FILE_NAME)
    '''
    columns in source:
    name url description images tips_table
    final schema:
    source url title description identification development damage management links
    '''

    df['source'         ] = 'infoFruitVegCultural'        
    df['identification' ] = ''
    df['development'    ] = ''
    df['damage'         ] = ''
    df['management'     ] = ''

    df.rename(columns = {
        'name'  : 'title',
        'images': 'links'
    }, inplace = True)

    df['links'] = df.apply(lambda r: [
        {
            'type'  : 'image'   , 
            'src'   : i['src']  , 
            'link'  : ''        ,
            'title' : r['title'] + ' - ' + i['caption']
        } for i in r['links'] if len(i['caption']) > 0], axis = 1)

    df['tips_table'] = df.apply(lambda r: [
        {
            'type'  : 'problem' , 
            'src'   : r['url'] , 
            'link'  : ''        ,
            'title' : r['title'] + ' - ' + i['header']
        } for i in r['tips_table'] if 'header' in i and len(i['header']) > 0], axis = 1)

    df.apply(lambda x: x['links'].extend(x['tips_table' ]), axis = 1)

    df = df[cols]
    
    return df

df      = infoFruitVegCultural()
finalDf = pd.concat([finalDf, df], ignore_index = True, axis = 0)
print(f'Final dataframe shape: {finalDf.shape}')
print(f'------------------------------------------------')


def infoPestControl():
    # -------------------------------------------- Garden pest control
    print(f'Merging garden pest control information...')
    FILE_NAME = 'GardenControlsPestItems.json'
    df = pd.read_json(_PATH + FILE_NAME)
    '''
    columns in source:
    name url description images
    final schema:
    source url title description identification development damage management links
    '''

    df['source'         ] = 'infoPestControl'
    df['identification' ] = ''
    df['development'    ] = ''
    df['damage'         ] = ''
    df['management'     ] = ''

    df.rename(columns = {
        'name'  : 'title',
        'images': 'links'
    }, inplace = True)

    df['links'] = df['links'].apply(lambda d: d if isinstance(d, list) else [])

    df['links'] = df.apply(lambda r: [
        {
            'type'  : 'image'                , 
            'src'   : i.get('src'       , ''), 
            'link'  : i.get('link'      , ''),
            'title' : r['title'] + ' - ' + i['caption']
        } for i in r['links'] if len(i['caption']) > 0], axis = 1)

    df = df[cols]
    
    return df

df      = infoPestControl()
finalDf = pd.concat([finalDf, df], ignore_index = True, axis = 0)
print(f'Final dataframe shape: {finalDf.shape}')
print(f'------------------------------------------------')


def infoPesticideControl():
    # -------------------------------------------- Garden pesticide control
    print(f'Merging garden pesticide control information...')
    FILE_NAME = 'GardenControlsPesticideItems.json'
    df = pd.read_json(_PATH + FILE_NAME)
    '''
    columns in source:
    active_ingredient url pesticide_type information
    final schema:
    source url title description identification development damage management links
    '''

    df['source'         ] = 'infoPesticideControl'
    df['identification' ] = ''
    df['development'    ] = ''
    df['damage'         ] = ''
    df['management'     ] = ''
    
    df['title'          ] = df[['active_ingredient', 'pesticide_type']].agg(' - '.join, axis=1)
    df['description'    ] = df['information'].str[0].apply(lambda x: x['associated_pests'])
    df['links'          ] = [[] for _ in range(len(df))]

    df = df[cols]

    return df

df      = infoPesticideControl()
finalDf = pd.concat([finalDf, df], ignore_index = True, axis = 0)
print(f'Final dataframe shape: {finalDf.shape}')
print(f'------------------------------------------------')


def pestsNotes():
    # -------------------------------------------- Pests IPM
    print(f'Merging pests notes...')
    FILE_NAME = 'PestNotes.json'
    df = pd.read_json(_PATH + FILE_NAME)
    '''
    columns in source:
    name urlPestNote descriptionPestNote lifecyclePestNote damagePestNote managementPestNote imagePestNote tablePestNote
    final schema:
    source url title description identification development damage management links
    '''

    df['source'         ] = 'pestsNotes'
    df['identification' ] = ''

    df.rename(columns = {
        'urlPestNote'           : 'url'         ,
        'name'                  : 'title'       ,
        'descriptionPestNote'   : 'description' ,
        'lifecyclePestNote'     : 'development' ,
        'damagePestNote'        : 'damage'      ,
        'managementPestNote'    : 'management'  ,
        'imagePestNote'         : 'links'       ,
    }, inplace = True)

    df['links'] = df.apply(lambda r: [
        {
            'type'  : 'image'   , 
            'src'   : i['src']  , 
            'link'  : i['link'] ,
            'title' : r['title'] + ' - ' + i['caption']
        } for i in r['links'] if len(i['caption']) > 0], axis = 1)

    df = df[cols]
    
    return df

df      = pestsNotes()
finalDf = pd.concat([finalDf, df], ignore_index = True, axis = 0)
print(f'Final dataframe shape: {finalDf.shape}')
print(f'------------------------------------------------')


def pestsQuickTips():
    # -------------------------------------------- Quick tips on pests
    print(f'Merging pests quick notes...')
    FILE_NAME = 'QuickTips.json'
    df = pd.read_json(_PATH + FILE_NAME)
    '''
    columns in source:
    name urlQuickTip contentQuickTips imageQuickTips
    final schema:
    source url title description identification development damage management links
    '''

    df['source'        ] = 'pestsQuickTips'
    df['identification'] = ''
    df['development'   ] = ''
    df['damage'        ] = ''
    df['management'    ] = ''

    df.rename(columns = {
        'urlQuickTip'           : 'url'         ,
        'name'                  : 'title'       ,
        'contentQuickTips'      : 'description' ,
        'imageQuickTips'        : 'links'       ,
    }, inplace = True)

    df['links'] = df.apply(lambda r: [
        {
            'type'  : 'image'   , 
            'src'   : i['src']  , 
            'link'  : i['link'] ,
            'title' : r['title'] + ' - ' + i['caption']
        } for i in r['links'] if len(i['caption']) > 0], axis = 1)

    df = df[cols]
    
    return df

df      = pestsQuickTips()
finalDf = pd.concat([finalDf, df], ignore_index = True, axis = 0)
print(f'Final dataframe shape: {finalDf.shape}')
print(f'------------------------------------------------')

def pestsVideos():
    # -------------------------------------------- Videos of UC IPM YouTube data
    print(f'Merging UC IPM YouTube data...')
    FILE_NAME = 'Videos.json'
    df = pd.read_json(_PATH + FILE_NAME)
    '''
    columns in source:
    title url description
    final schema:
    source url title description identification development damage management links
    '''

    df['source'         ] = 'pestsVideos'
    df['identification' ] = ''
    df['development'    ] = ''
    df['damage'         ] = ''
    df['management'     ] = ''

    df['links'          ] = [[] for _ in range(len(df))]

    df = df[cols]

    return df

df      = pestsVideos()
finalDf = pd.concat([finalDf, df], ignore_index = True, axis = 0)
print(f'Final dataframe shape: {finalDf.shape    }')
print(f'------------------------------------------------')


def pestsWeed():
    # -------------------------------------------- Weed related pests
    print(f'Merging weed related pests...')
    FILE_NAME = 'WeedIdItems.json'
    df = pd.read_json(_PATH + FILE_NAME)
    '''
    columns in source:
    name url description images
    final schema:
    source url title description identification development damage management links
    '''

    df['source'         ] = 'pestsWeed'
    df['identification' ] = ''
    df['development'    ] = ''
    df['damage'         ] = ''
    df['management'     ] = ''

    df.rename(columns = {
        'name'  : 'title',
        'images': 'links'
    }, inplace = True)

    df['links'] = df.apply(lambda r: [
        {
            'type'  : 'image'   , 
            'src'   : i['src']  , 
            'link'  : ''        ,
            'title' : r['title'] + ' - ' + i['caption']
        } for i in r['links'] if len(i['caption']) > 0], axis = 1)

    df = df[cols]

    return df

df      = pestsWeed()
finalDf = pd.concat([finalDf, df], ignore_index = True, axis = 0)
print(f'Final dataframe shape: {finalDf.shape}')
print(f'------------------------------------------------')

def _clean(text):
    '''
    Fix encodings and remove escape and redundant whitespace characters from text.
    '''
    text = text.encode('ascii', 'ignore').decode()
    text = re.sub(r'\s+', ' ', text).strip()
    return text

colsVector = ['title', 'description', 'identification', 'development', 'damage', 'management']
for c in colsVector:
    finalDf[c] = finalDf[c].apply(_clean)

print(f'Fix encodings and remove escape and redundant whitespace characters from text.')
print(f'------------------------------------------------')

print(f'Final dataframe shape: {finalDf.shape    }')
print(f'FINISHED')

finalDf.sample(5)

Merging fruit and veggie cultural tips..
Final dataframe shape: (1347, 9)
------------------------------------------------
Merging garden pest control information...
Final dataframe shape: (1367, 9)
------------------------------------------------
Merging garden pesticide control information...
Final dataframe shape: (1386, 9)
------------------------------------------------
Merging pests notes...
Final dataframe shape: (1558, 9)
------------------------------------------------
Merging pests quick notes...
Final dataframe shape: (1612, 9)
------------------------------------------------
Merging UC IPM YouTube data...
Final dataframe shape: (1670, 9)
------------------------------------------------
Merging weed related pests...
Final dataframe shape: (1698, 9)
------------------------------------------------
Fix encodings and remove escape and redundant whitespace characters from text.
------------------------------------------------
Final dataframe shape: (1698, 9)
FINISHED


Unnamed: 0,source,url,title,description,identification,development,damage,management,links
349,pestsDiseases,http://ipm.ucanr.edu/PMG/GARDEN/FRUIT/PESTS/so...,Pillbugs and sowbugs,Pillbugs (family Armadillididae) and sowbugs (...,,,,,"[{'type': 'image', 'src': 'http://ipm.ucanr.ed..."
1020,infoVeggies,http://ipm.ucanr.edu/home-and-landscape/potato...,Potato,The edible and harvested parts of a potato pla...,,,,Cultural practices such as proper site selecti...,"[{'type': 'image', 'src': 'http://ipm.ucanr.ed..."
871,damagesWeed,http://ipm.ucanr.edu/PMG/WEEDS/italian_ryegras...,Italian ryegrass,"Italian ryegrass, also called annual ryegrass,...",,,,,"[{'type': 'image', 'src': 'http://ipm.ucanr.ed..."
585,pestsExotic,https://www2.ipm.ucanr.edu/Invasive-and-Exotic...,Exotic Newcastle Disease,,,,,,"[{'type': 'page', 'src': 'http://ipm.ucanr.edu..."
1311,infoFruitVegCultural,http://ipm.ucanr.edu/PMG/GARDEN/FRUIT/CULTURAL...,Harvesting and storing apples,Harvest apples when the fruit is at full matur...,,,,,"[{'type': 'image', 'src': 'http://ipm.ucanr.ed..."


# AskExtension Data

In [6]:
import json

PATH = '../ask-data/'
FILE_NAMES = [PATH + f for f in sorted(os.listdir(PATH))]

with open(FILE_NAMES[0], 'r', encoding='utf-8') as f:
    f = json.load(f)
    print(json.dumps(f[0], indent = 2))

{
  "faq-id": 109900,
  "title": "When can I plant blue spruce trees in Colorado? #109900",
  "created": "2012-12-03 15:53:47",
  "updated": "2014-09-16 18:32:47",
  "state": "Colorado",
  "county": "El Paso County",
  "tags": [
    "trees and shrubs"
  ],
  "question": "I need to plant two blue spruce trees that are currently in 24\" diameter plastic containers with drain holes in the bottom sides.\n\nLocation: northeast side of Colorado Springs.\n\nThese trees are currently outside on the patio and susceptible to the wind and sun. The trees were watered this past Saturday and seem to be healthy.\n\nQuestion: Can these trees be planted now? Currently the soil is not frozen and night time temps are 35 to 40 degrees.\n\nI have downloaded and read CMG GardenNotes #633 as a reference.\n\nAny advice would be greatly appreciated. ",
  "answer": {
    "1": {
      "response": "Jerry, \nyou can plant them now (a) OR temporarily \"plant\" them, still in containers, so that roots have some insu

## ETL

In [7]:
def transform_to_dict(answers):
    new_dict = {}
    for i, a in enumerate(answers):
        if 'response' in a:
            new_dict[str(i)] = a 
    return new_dict

answers = [
      {
        "attachments": [
          "https://ask2.extension.org/file.php?key=pxxbzq8tkf2sywrzebtn62pwm7--tgs0&expires=1667520000&signature=4acc1145a20ddd4dff9202e602f574fd827ec85a"
        ]
      },
      {
        "response": "Thank you for you question.  I do not believe the ants are doing the damage.  If anything, they are taking advantage of the debris from the tree.  I believe that the publication link below is the best diagnosis given the information you have provided. In regards to the oozing, Bacterial Wetwood would be the culprit.  I am including another link for your review. Thank you for the question and information to help identify the problem.http://texasforestservice.tamu.edu/main/popup.aspx?id=1262http://harris-agrilife-org.wpengine.netdna-cdn.com/files/2011/05/Bacterial-Wetwood.pdf",
        "author": "Michael Potter"
      }
    ]
transform_to_dict(answers)

{'1': {'response': 'Thank you for you question.\xa0 I do not believe the ants are doing the damage.\xa0 If anything, they are taking advantage of the debris from the tree.\xa0 I believe that the publication link below is the best diagnosis given the information you have provided. In regards to the oozing, Bacterial Wetwood would be the culprit.\xa0 I am including another link for your review. Thank you for the question and information to help identify the problem.http://texasforestservice.tamu.edu/main/popup.aspx?id=1262http://harris-agrilife-org.wpengine.netdna-cdn.com/files/2011/05/Bacterial-Wetwood.pdf',
  'author': 'Michael Potter'}}

In [8]:
import sys
import re

from string import punctuation as pn

# Modify STATE_FILTER and MIN_WORD_COUNT variables accordingly
STATE_FILTER    = ['California']
MIN_WORD_COUNT  = 3

ASKEXTENSION_QUESTION_URL = 'https://ask2.extension.org/kb/faq.php?id='

# Combines the data files into one and returns it.
df = pd.DataFrame()
for f in FILE_NAMES:
    df = pd.concat([df, pd.read_json(f)], ignore_index = True, axis = 0)

df['source'] = 'askExtension'

# Convert 'faq-id' to str type
df['faq-id'] = df['faq-id'].astype(str)

# Leave tickets from California state
df = df[df['state'].isin(STATE_FILTER)]

# Add the URL and leave blank URL for questions with no ID
df['url'] = [
    f"{ASKEXTENSION_QUESTION_URL}{ticket_no}" if len(ticket_no) == 6 else ""
    for ticket_no in df['title'].str.split('#').str[-1]
]

# Add the ticket number from title and leave blank for questions without
df['ticket-no'] = [
    ticket_no if len(ticket_no) == 6 else ""
    for ticket_no in df['title'].str.split('#').str[-1]
]

df.rename(columns = {'faq-id': 'faq_id', 'ticket-no': 'ticket_no'}, inplace = True)

def _clean(text):
    '''
    Fix encodings and remove escape and redundant whitespace characters from text.

    Examples with non-ascii characters - 110358, 147160
    Examples with redundant whitespace - 117069, 127760

    See: https://stackoverflow.com/a/53821967/5480536
    '''
    text = text.encode('ascii', 'ignore').decode()
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def transform_to_dict(answers):
    new_dict = {}
    for i, a in enumerate(answers):
        if 'response' in a:
            new_dict[str(i)] = a 
    return new_dict

def _transform_answer(answer_dict):
    '''
    Convert answer field from a dictionary to a list.
    '''
    if isinstance(answer_dict, list):
        answer_dict = transform_to_dict(answer_dict)
    answers = [{}] * len(answer_dict)
    
    for k, v in answer_dict.items():
        # clean the response up
        v = {
            'type'  : 'answer'  ,
            'src'   : ''        , 
            'link'  : ''        ,
            'title' : _clean(v['response']),
        }
        answers[int(k) - 1] = v
    
    return answers

# Transform answer for consistency with IPM data
df['links'] = df['answer'].apply(_transform_answer)
df['links'] = df.apply(lambda r: [
        {
            'type'  : i['type'] , 
            'src'   : r['url']  , 
            'link'  : ''        ,
            'title' : i['title']
        } for i in r['links'] if len(i['title']) > 0], axis = 1)

# Strip all spaces and remove non-ascii characters from text fields
for column in ['state', 'title', 'question']:
    df[column] = df[column].apply(_clean)

def _transform_title(title):
    '''
    Remove question ID from title, and append '.' in the end
    if no punctuation was detected.

    Example with '#' - 437259
    Example with '...' - 437264
    '''
    title = ''.join(title.split('#')[:-1]).strip().strip('...')
    
    # add a '.' if it does not yet end with a punctuation
    title = title if (title and title[-1] in pn) else title + '.'
    
    return title

# Clean ID and '...' from title, and append punctuation if not present
df['title'] = df['title'].apply(_transform_title)

def _merge_title_question(df):
    '''
    Create new column from questions and title,
    but only if it is not already exactly in the question.
    '''
    titles      = df['title'    ].tolist()
    questions   = df['question' ].tolist()
    
    tqs = [
        question
        if (title and question.startswith(title[:-1]))
        else title + " " + question
        for (title, question) in zip(titles, questions)
    ]

    return tqs

# Create new column from `title` and `question`, or only question
# if title is exactly the question     
df['description'] = _merge_title_question(df)
    
# Remove questions with small number words in title-question
if MIN_WORD_COUNT:
    df = df[df['description'].str.split().str.len() > MIN_WORD_COUNT]

df = df.loc[:, ['source', 'url', 'title', 'description', 'links']]
df.sample(5)

cols = ['source', 'url', 'title', 'description', 'identification', 'development', 'damage', 'management', 'links']
'''
columns in source:
source url name description links
final schema:
source url title description identification development damage management links    
'''
df['identification' ] = ''
df['development'    ] = ''
df['damage'         ] = ''
df['management'     ] = ''
df = df[cols]

finalDf = pd.concat([finalDf, df], ignore_index = True, axis = 0)
print(f'Final dataframe shape: {finalDf.shape}')
print(f'------------------------------------------------')

finalDf.sample(5)

Final dataframe shape: (4395, 9)
------------------------------------------------


Unnamed: 0,source,url,title,description,identification,development,damage,management,links
1137,infoFlowers,http://ipm.ucanr.edu/PMG/GARDEN/PLANTS/forsyth...,Forsythia,,Forsythia,,,Forsythia,"[{'type': 'image', 'src': 'http://ipm.ucanr.ed..."
4381,askExtension,https://ask2.extension.org/kb/faq.php?id=806610,Copyright permission.,Copyright permission. I am a member of the San...,,,,,[]
3432,askExtension,https://ask2.extension.org/kb/faq.php?id=442232,Renting/Leasing Farm Land.,"Renting/Leasing Farm Land. Hello, I have 160 a...",,,,,"[{'type': 'answer', 'src': 'https://ask2.exten..."
2831,askExtension,https://ask2.extension.org/kb/faq.php?id=337971,Safe or Dangerous?,Safe or Dangerous? Need help.. is this a safe ...,,,,,"[{'type': 'answer', 'src': 'https://ask2.exten..."
2178,askExtension,https://ask2.extension.org/kb/faq.php?id=248383,infestations of insects in container gardens.,infestations of insects in container gardens. ...,,,,,"[{'type': 'answer', 'src': 'https://ask2.exten..."


# Embedding text fields into vectors and stripping text fields for saving into ES

In [9]:
import sys

sys.path.insert(1, os.path.realpath(os.path.pardir))

os.environ['STAGE'          ] = 'dev'
os.environ['ES_USERNAME'    ] = 'elastic'
os.environ['ES_PASSWORD'    ] = 'changeme'
os.environ['TF_CACHE_DIR'   ] = '/var/tmp/models'
## select the environment for ingestion
os.environ['ES_HOST'    ] = 'https://qa.ucipm.es.chat.ask.eduworks.com/'
# os.environ['ES_HOST'    ] = 'https://dev.es.chat.ask.eduworks.com/'
# os.environ['ES_HOST'    ] = 'https://qa.es.chat.ask.eduworks.com/'

import config

  from .autonotebook import tqdm as notebook_tqdm


INFO:config:----------------------------------------------
INFO:config:Loading synonym procedure
INFO:config:Successfully loaded synonym list
INFO:config:----------------------------------------------
INFO:config:----------------------------------------------
INFO:config:Loading hardcoded queries
INFO:config:Successfully loaded hardcoded queries
INFO:config:- cut off parameter for hardcoded queries     = 0.60
INFO:config:- cut off parameter for similarity threshold  = 0.85
INFO:config:----------------------------------------------
INFO:config:----------------------------------------------
INFO:config:Configuration variables for DEV environment
INFO:config:- stage           = dev
INFO:config:- expert_url      = https://ucanr.edu/About/Locations/
INFO:config:- es_search_size  = 100
INFO:config:- es_cut_off      = 0.4
INFO:config:- es_top_n        = 10
INFO:config:- es_ask_weight   = 0.8
INFO:config:----------------------------------------------
INFO:config:-------------------------------

In [10]:
import importlib
importlib.reload(config)

INFO:config:----------------------------------------------
INFO:config:Loading synonym procedure
INFO:config:Successfully loaded synonym list
INFO:config:----------------------------------------------
INFO:config:----------------------------------------------
INFO:config:Loading hardcoded queries
INFO:config:Successfully loaded hardcoded queries
INFO:config:- cut off parameter for hardcoded queries     = 0.60
INFO:config:- cut off parameter for similarity threshold  = 0.85
INFO:config:----------------------------------------------
INFO:config:----------------------------------------------
INFO:config:Configuration variables for DEV environment
INFO:config:- stage           = dev
INFO:config:- expert_url      = https://ucanr.edu/About/Locations/
INFO:config:- es_search_size  = 100
INFO:config:- es_cut_off      = 0.4
INFO:config:- es_top_n        = 10
INFO:config:- es_ask_weight   = 0.8
INFO:config:----------------------------------------------
INFO:config:-------------------------------

<module 'config' from 'C:\\Users\\imjef\\Documents\\Eduworks\\Ask\\Chatbot\\test_delete_later\\askchatbot\\actions\\es\\config.py'>

### Transforming textual data

In [11]:
CHUNK_SIZE      = 1
ROLLING_SIZE    = 3

colsVector = ['title', 'description', 'identification', 'development', 'damage', 'management']
print(f'Final DF: Transforming columns - {colsVector} and links titles.')

from spacy.lang.en import English 

nlp = English()
nlp.add_pipe('sentencizer')
raw_text = finalDf.iloc[0]['description']

print(f'STARTING TRANSFORMING')
c_items = []
for i, r in finalDf.iterrows():
    r_texts = []
    for c in colsVector:
        t = r[c]
        
        doc = nlp(t)
        
        ts = [sent for sent in doc.sents]
        if len(ts) == 0:
            continue
        else:
            chunks, chunk_size, roll_size = len(ts), CHUNK_SIZE, ROLLING_SIZE
            ts = [ts[i1:i1+chunk_size+(roll_size - 1)] for i1 in range(0, chunks - (roll_size - 1), chunk_size)]
            ts = [{'text': ' '.join([l2.text for l2 in l1]), 'name': c + '_' + str(i1), 'start': l1[0].start_char, 'end': l1[-1].end_char} for i1, l1 in enumerate(ts)]
                
        r_texts.extend(ts)
    
    ts = [r['title'] + ' - ' + i1['title'] for i1 in r['links']]
    if len(ts) == 0:
        c_items.append(r_texts)
        if (i+1) % 500 == 0:
            print(f'Finished transforming of {i+1} rows of dataframe')
        continue

    for i1, v in enumerate(ts):
        r_texts.append({'text': v, 'name': 'links_' + str(i1), 'start': 0, 'end': -1})
    
    c_items.append(r_texts)

    if (i+1) % 500 == 0:
        print(f'Finished transforming of {i+1} rows of dataframe')

print(f'Finished transforming of {i+1} rows of dataframe')
print(f'FINISHED TRANSFORMING')

Final DF: Transforming columns - ['title', 'description', 'identification', 'development', 'damage', 'management'] and links titles.
STARTING TRANSFORMING
Finished transforming of 500 rows of dataframe
Finished transforming of 1000 rows of dataframe
Finished transforming of 1500 rows of dataframe
Finished transforming of 2000 rows of dataframe
Finished transforming of 2500 rows of dataframe
Finished transforming of 3000 rows of dataframe
Finished transforming of 3500 rows of dataframe
Finished transforming of 4000 rows of dataframe
Finished transforming of 4395 rows of dataframe
FINISHED TRANSFORMING


### Replacing synonyms

In [12]:
print(f'START REPLACING')

texts = [r1['text'] for r in c_items for r1 in r]
texts_modified = []

for i, text in enumerate(texts):
    tokens = config.tokenizer(text)
    modified = ""
    replace = False
    for token in tokens:
        t = token.text.lower()
        
        if t in config.synonym_dict:
            modified += config.synonym_dict[t]
            modified += token.whitespace_
            replace = True
        else:
            modified += token.text_with_ws
    
    if not replace:
        modified = text
    texts_modified.append(modified)

    if (i+1) % 10000 == 0:
        print(f'Finished replacing synonyms of {i+1} items of sentences')

print(f'Finished replacing synonyms of {i+1} items of sentences')
print(f'FINISHED REPLACING')

START REPLACING
Finished replacing synonyms of 10000 items of sentences
Finished replacing synonyms of 20000 items of sentences
Finished replacing synonyms of 30000 items of sentences
Finished replacing synonyms of 40000 items of sentences
Finished replacing synonyms of 50000 items of sentences
Finished replacing synonyms of 55539 items of sentences
FINISHED REPLACING


### Checking for invalid links

In [13]:
count_ucipm = 0
count_askextension = 0
for i, r in finalDf.iterrows():
    if len(r['url']) < 10:
        print(f'Source with no main link at row {i} of data frame, main link - {r["url"]}.')
    links = r['links']
    no_link = False
    show_main_url = False
    for l in links:
        url = r['url']
        if len(l['src']) < 10:
            no_link = True
            if not show_main_url:
                show_main_url = True
                print(f'Links at {url}')
            print(l)
    if no_link:
        if r['source'] == 'askExtension':
            count_askextension += 1
        else:
            count_ucipm += 1

print(f'Number of sources from AskExtension with no link urls - {count_askextension}')
print(f'Number of sources from UC IPM with no link urls - {count_ucipm}')

Links at http://ipm.ucanr.edu/PMG/GARDEN/FRUIT/walnut.html?src=exchbt
{'type': 'problem', 'src': '', 'link': '', 'title': 'Walnuts - Dusky-veined walnut aphid'}
{'type': 'problem', 'src': '', 'link': '', 'title': 'Walnuts - Walnut aphid'}
{'type': 'problem', 'src': '', 'link': '', 'title': 'Walnuts - San Jose scale'}
{'type': 'problem', 'src': '', 'link': '', 'title': 'Walnuts - Walnut scale'}
{'type': 'problem', 'src': '', 'link': '', 'title': 'Walnuts - Bark beetles and borers'}
{'type': 'problem', 'src': '', 'link': '', 'title': 'Walnuts - Mites'}
{'type': 'problem', 'src': '', 'link': '', 'title': 'Walnuts - European fruit lecanium'}
{'type': 'problem', 'src': '', 'link': '', 'title': 'Walnuts - Frosted scale'}
Links at http://ipm.ucanr.edu/PMG/GARDEN/FRUIT/citrus.html?src=exchbt
{'type': 'problem', 'src': '', 'link': '', 'title': 'Citrus - California red scale'}
{'type': 'problem', 'src': '', 'link': '', 'title': 'Citrus - Purple scale'}
{'type': 'problem', 'src': '', 'link': '', 

### Embedding textual data

In [14]:
BATCH_SIZE = 64

print(f'STARTING EMBEDDING')

finalDf['vectors'] = np.empty((len(finalDf), 0)).tolist()

# TF HUB model
# vectors   = config.embed(texts_modified).numpy().tolist()
    
# Sentence Encoder model        
vectors = config.embed.encode(
    sentences           = texts_modified,
    batch_size          = BATCH_SIZE    ,
    show_progress_bar   = True
).tolist()

index = 0
for i, r in enumerate(c_items):
    for i1, r1 in enumerate(r):
        r1['vector'] = vectors[index]
        r1.pop('text')
        index += 1

print(f'FINISHED EMBEDDING')

finalDf['vectors'] = c_items
print(f'The number of vectors to be ingested: {len([r1["vector"] for r in finalDf["vectors"] for r1 in r])}')        
finalDf.sample(5)

STARTING EMBEDDING


Batches: 100%|██████████| 868/868 [01:08<00:00, 12.72it/s]


FINISHED EMBEDDING
The number of vectors to be ingested: 55539


Unnamed: 0,source,url,title,description,identification,development,damage,management,links,vectors
1095,infoFlowers,http://ipm.ucanr.edu/PMG/GARDEN/PLANTS/androme...,Andromeda,,Plant identification Andromeda is an attractiv...,,,Andromeda plants are wonderful for container p...,"[{'type': 'image', 'src': 'http://ipm.ucanr.ed...","[{'name': 'identification_0', 'start': 0, 'end..."
3084,askExtension,https://ask2.extension.org/kb/faq.php?id=364408,Cocoyu.,"Cocoyu. It's that ok, if I spray turflon to ki...",,,,,"[{'type': 'answer', 'src': 'https://ask2.exten...","[{'name': 'description_0', 'start': 0, 'end': ..."
3320,askExtension,https://ask2.extension.org/kb/faq.php?id=416853,Ammonium phosphate and organic gardening.,Ammonium phosphate and organic gardening. Hell...,,,,,"[{'type': 'answer', 'src': 'https://ask2.exten...","[{'name': 'description_0', 'start': 0, 'end': ..."
3840,askExtension,https://ask2.extension.org/kb/faq.php?id=612511,Plant Name.,"Plant Name. Hi,Please identity the plant with ...",,,,,"[{'type': 'answer', 'src': 'https://ask2.exten...","[{'name': 'description_0', 'start': 0, 'end': ..."
127,pestsDiseases,http://ipm.ucanr.edu/PMG/GARDEN/VEGES/DISEASES...,Cavity spot,Cavity spot is a problem on carrots and is cha...,Symptoms of cavity spot on roots may sometimes...,,,The fungus that causes cavity spot is favored ...,"[{'type': 'image', 'src': 'http://ipm.ucanr.ed...","[{'name': 'description_0', 'start': 0, 'end': ..."


## Ingesting data into ES

In [15]:
# Different embedding sizes depending on the models
# VECTOR_SIZE = 384
# VECTOR_SIZE = 512
VECTOR_SIZE = 768

mapping  = {
    "settings": {"number_of_shards": 2, "number_of_replicas": 1},
    "mappings": {
        "dynamic"   : "false",
        "_source"   : {"enabled": "true"},
        "properties": {
            "source"        : {"type": "keyword", "index": "true" , "ignore_above": 32766},
            "url"           : {"type": "keyword", "index": "false", "ignore_above": 32766},

            "title"         : {"type": "keyword", "index": "false", "ignore_above": 32766},
            "description"   : {"type": "keyword", "index": "false", "ignore_above": 32766},
            "identification": {"type": "keyword", "index": "false", "ignore_above": 32766},
            "development"   : {"type": "keyword", "index": "false", "ignore_above": 32766},
            "damage"        : {"type": "keyword", "index": "false", "ignore_above": 32766},
            "management"    : {"type": "keyword", "index": "false", "ignore_above": 32766},
            "vectors"       : {
                "type"      : "nested",
                "properties": {
                    "vector": {
                        "type": "dense_vector", 
                        "dims": VECTOR_SIZE
                    },
                    "name"  : {"type": "keyword", "index": "false", "ignore_above": 32766},
                    "start" : {"type": "integer"                                         },
                    "end"   : {"type": "integer"                                         },
                }
            },
            
            "links"         : {
                "type"      : "nested",
                "properties": {
                    "type"  : {"type": "keyword", "index": "false", "ignore_above": 32766},
                    "src"   : {"type": "keyword", "index": "false", "ignore_above": 32766},
                    "link"  : {"type": "keyword", "index": "false", "ignore_above": 32766},
                    "title" : {"type": "keyword", "index": "false", "ignore_above": 32766}
                }
            }
        }
    }
}

final_json = finalDf.to_dict('records')

In [16]:
from elasticsearch import Elasticsearch
from elasticsearch.helpers import parallel_bulk

from collections import deque

# increase the timeout if necessary
es_client = Elasticsearch([config.es_host], http_auth=(config.es_username, config.es_password), timeout = 20)

es_client.indices.delete(
    index   = config.es_combined_index, 
    ignore  = 404)
es_client.indices.create(
    index       = config.es_combined_index  , 
    settings    = mapping['settings']       , 
    mappings    = mapping['mappings']       )
# play with chunk size parameter for timed out problem
deque(parallel_bulk(es_client, actions = final_json, index = config.es_combined_index, max_chunk_bytes = 5 * 1024 * 1024), maxlen = 0)

es_client.indices.refresh()

INFO:elasticsearch:GET https://qa.ucipm.es.chat.ask.eduworks.com:443/ [status:200 request:0.191s]
INFO:elasticsearch:DELETE https://qa.ucipm.es.chat.ask.eduworks.com:443/combined [status:200 request:0.260s]
INFO:elasticsearch:PUT https://qa.ucipm.es.chat.ask.eduworks.com:443/combined [status:200 request:0.206s]
INFO:elasticsearch:POST https://qa.ucipm.es.chat.ask.eduworks.com:443/combined/_bulk [status:200 request:0.959s]
INFO:elasticsearch:POST https://qa.ucipm.es.chat.ask.eduworks.com:443/combined/_bulk [status:200 request:1.283s]
INFO:elasticsearch:POST https://qa.ucipm.es.chat.ask.eduworks.com:443/combined/_bulk [status:200 request:1.557s]
INFO:elasticsearch:POST https://qa.ucipm.es.chat.ask.eduworks.com:443/combined/_bulk [status:200 request:1.186s]
INFO:elasticsearch:POST https://qa.ucipm.es.chat.ask.eduworks.com:443/combined/_bulk [status:200 request:1.092s]
INFO:elasticsearch:POST https://qa.ucipm.es.chat.ask.eduworks.com:443/combined/_bulk [status:200 request:1.035s]
INFO:elas

{'_shards': {'total': 8, 'successful': 4, 'failed': 0}}