# EDA and ETL for scraped data from IPM and AskExtension data knowledge

In [1]:
import os
import re

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

COLOR = 'white'
plt.rcParams['text.color'       ] = COLOR
plt.rcParams['text.color'       ] = COLOR
plt.rcParams['axes.labelcolor'  ] = COLOR
plt.rcParams['xtick.color'      ] = COLOR
plt.rcParams['ytick.color'      ] = COLOR

In [2]:
import pandas as pd 

df = pd.read_excel('ucipm_resources.xlsx')
df

Unnamed: 0,title,url,filename,InsIde/OutsIde/Both
0,Apple Frost injury,http://ipm.ucanr.edu/PMG/GARDEN/FRUIT/ENVIRON/...,fruitVeggieEnvironItems_new.json,O
1,Frost injury,http://ipm.ucanr.edu/PMG/GARDEN/FRUIT/ENVIRON/...,fruitVeggieEnvironItems_new.json,O
2,Sunburn,http://ipm.ucanr.edu/PMG/GARDEN/FRUIT/ENVIRON/...,fruitVeggieEnvironItems_new.json,O
3,Fertilizing,http://ipm.ucanr.edu/PMG/GARDEN/FRUIT/ENVIRON/...,fruitVeggieEnvironItems_new.json,O
4,Training,http://ipm.ucanr.edu/PMG/GARDEN/FRUIT/ENVIRON/...,fruitVeggieEnvironItems_new.json,O
...,...,...,...,...
1721,Spiders,https://www2.ipm.ucanr.edu/natural-enemies/spi...,ucipm-natural-enemies.json,O
1722,"Syrphids (flower flies, or hover flies)",https://www2.ipm.ucanr.edu/natural-enemies/syr...,ucipm-natural-enemies.json,O
1723,Twicestabbed lady beetle,https://www2.ipm.ucanr.edu/natural-enemies/twi...,ucipm-natural-enemies.json,O
1724,Vedalia,https://www2.ipm.ucanr.edu/natural-enemies/ved...,ucipm-natural-enemies.json,O


# UC IPM Data

In [3]:
'''
['exoticPests.json',
 'fruitItems_new.json',
 'fruitVeggieEnvironItems_new.json',
 'pestDiseaseItems_new.json',
 'plantFlowerItems.json',
 'turfPests.json',
 'veggieItems_new.json',
 'weedItems.json']
'''
_PATH = './uc_ipm_old/'
DATA_FILE_NAMES = sorted(os.listdir(_PATH))
DATA_FILE_NAMES

['FruitVegCulturalItems.json',
 'GardenControlsPestItems.json',
 'GardenControlsPesticideItems.json',
 'PestNotes.json',
 'QuickTips.json',
 'Videos.json',
 'WeedIdItems.json',
 'exoticPests.json',
 'fruitItems_new.json',
 'fruitVeggieEnvironItems_new.json',
 'naturalEnemies.json',
 'pestDiseaseItems_new.json',
 'plantFlowerItems.json',
 'turfPests.json',
 'veggieItems_new.json',
 'weedItems.json']

## ETL of data

In [4]:
finalDf = pd.DataFrame()
cols = ['source', 'url', 'title', 'description', 'identification', 'development', 'damage', 'management', 'links']
def natural_enemies():
    # -------------------------------------------- Pests diseases
    print(f'Merging pests diseases...')
    FILE_NAME = 'naturalEnemies.json'
    df = pd.read_json(_PATH + FILE_NAME)
    '''
    columns in source:
    name url description identification life_cycle damage solutions images
    final schema:
    source url title description identification development damage management links
    '''

    df['source'] = 'naturalEnemies'
    df.rename(columns = {
        'name'          : 'title'       ,
        'life_cycle'    : 'development' ,
        'damagePestNote': 'damage'      ,
        'solutions'     : 'management'  ,
        'images'        : 'links'       ,
    }, inplace = True)
    df['links'] = df.apply(lambda r: [
        {
            'type'  : 'image'   , 
            'src'   : i['src']  , 
            'link'  : i['link'] ,
            'title' : i['caption']
        } for i in r['links'] if len(i['caption']) > 0], axis = 1)

    df = df[cols]

    return df

df      = natural_enemies()
finalDf = pd.concat([finalDf, df], ignore_index = True, axis = 0)
print(f'Final dataframe shape: {finalDf.shape}')

def pestsDiseases():
    # -------------------------------------------- Pests diseases
    print(f'Merging pests diseases...')
    FILE_NAME = 'pestDiseaseItems_new.json'
    df = pd.read_json(_PATH + FILE_NAME)
    '''
    columns in source:
    name url description identification life_cycle damage solutions images
    final schema:
    source url title description identification development damage management links
    '''

    df['source'] = 'pestsDiseases'

    df.rename(columns = {
        'name'          : 'title'       ,
        'life_cycle'    : 'development' ,
        'damagePestNote': 'damage'      ,
        'solutions'     : 'management'  ,
        'images'        : 'links'       ,
    }, inplace = True)


    df['links'] = df.apply(lambda r: [
        {
            'type'  : 'image'   , 
            'src'   : i['src']  , 
            'link'  : i['link'] ,
            'title' : r['title'] + ' - ' + i['caption']
        } for i in r['links'] if len(i['caption']) > 0], axis = 1)

    df = df[cols]

    return df

df      = pestsDiseases()
finalDf = pd.concat([finalDf, df], ignore_index = True, axis = 0)
print(f'Final dataframe shape: {finalDf.shape}')
print(f'------------------------------------------------')


def pestsTurf():
    # -------------------------------------------- Turf pests
    print(f'Merging turf pests...')
    FILE_NAME = 'turfPests.json'
    df = pd.read_json(_PATH + FILE_NAME)
    '''
    columns in source:
    name url text images
    final schema:
    source url title description identification development damage management links
    '''

    df['source'         ] = 'pestsTurf'
    df['identification' ] = ''
    df['development'    ] = ''
    df['damage'         ] = ''
    df['management'     ] = ''

    df.rename(columns = {
        'name'  : 'title'       ,
        'text'  : 'description' ,
        'images': 'links'       ,
    }, inplace = True)


    df['links'] = df.apply(lambda r: [
        {
            'type'  : 'image'   , 
            'src'   : i['src']  , 
            'link'  : i['link'] ,
            'title' : r['title'] + ' - ' + i['caption']
        } for i in r['links'] if len(i['caption']) > 0], axis = 1)

    df = df[cols]

    return df

df      = pestsTurf()
finalDf = pd.concat([finalDf, df], ignore_index = True, axis = 0)
print(f'Final dataframe shape: {finalDf.shape}')
print(f'------------------------------------------------')


def pestsExotic():
    # -------------------------------------------- Exotic pests
    print(f'Merging exotic pests...')
    FILE_NAME = 'exoticPests.json'
    df = pd.read_json(_PATH + FILE_NAME)
    '''
    columns in source:
    name url description damage identification life_cycle monitoring management related_links images
    final schema:
    source url title description identification development damage management links
    '''

    df['source'] = 'pestsExotic'

    df.rename(columns = {
        'name'      : 'title'       ,
        'life_cycle': 'development' ,
        'images'    : 'links'       ,
    }, inplace = True)

    df['links'] = df['links'].apply(lambda d: d if isinstance(d, list) else [])

    df['links'] = df.apply(lambda r: [
        {
            'type'  : 'image'   , 
            'src'   : i['src']  , 
            'link'  : i['link'] ,
            'title' : r['title'] + ' - ' + i['caption']
        } for i in r['links'] if len(i['caption']) > 0], axis = 1)
    
    df['related_links'] = df.apply(lambda r: [
        {
            'type'  : 'page'   , 
            'src'   : i['link'] , 
            'link'  : ''        ,
            'title' : r['title'] + ' - ' + i['text']
        } for i in r['related_links'] if len(i['text']) > 0], axis = 1)

    df.apply(lambda x: x['links'].extend(x['related_links' ]), axis = 1)

    df = df[cols]

    return df

df      = pestsExotic()
finalDf = pd.concat([finalDf, df], ignore_index = True, axis = 0)
print(f'Final dataframe shape: {finalDf.shape}')
print(f'------------------------------------------------')


def damagesEnvironment():
    # -------------------------------------------- Fruit and veggie damages
    print(f'Merging fruit and veggie damages...')
    FILE_NAME = 'fruitVeggieEnvironItems_new.json'
    df = pd.read_json(_PATH + FILE_NAME)
    '''
    columns in source:
    name url description identification damage disorder_development solutions images
    final schema:
    source url title description identification development damage management links
    '''

    df['source'] = 'damagesEnvironment'

    df.rename(columns = {
        'name'                  : 'title'       ,
        'disorder_development'  : 'development' ,
        'solutions'             : 'management' ,
        'images'                : 'links'      ,
    }, inplace = True)

    df['links'] = df.apply(lambda r: [
        {
            'type'  : 'image'   , 
            'src'   : i['src']  , 
            'link'  : i['link'] ,
            'title' : r['title'] + ' - ' + i['caption']
        } for i in r['links'] if len(i['caption']) > 0], axis = 1)

    df = df[cols]

    return df

df      = damagesEnvironment()
finalDf = pd.concat([finalDf, df], ignore_index = True, axis = 0)
print(f'Final dataframe shape: {finalDf.shape}')
print(f'------------------------------------------------')


def damagesWeed():
    # -------------------------------------------- Weed damages
    print(f'Merging weed damages...')
    FILE_NAME = 'weedItems.json'
    df = pd.read_json(_PATH + FILE_NAME)
    '''
    columns in source:
    name url description images
    final schema:
    source url title description identification development damage management links
    '''

    df['source'         ] = 'damagesWeed'
    df['identification' ] = ''
    df['development'    ] = ''
    df['damage'         ] = ''
    df['management'     ] = ''

    df.rename(columns = {
        'name'  : 'title',
        'images': 'links'
    }, inplace = True)

    df['links'] = df.apply(lambda r: [
        {
            'type'  : 'image'   , 
            'src'   : i['link'] , 
            'link'  : ''        ,
            'title' : r['title'] + ' - ' + i['caption']
        } for i in r['links'] if len(i['caption']) > 0], axis = 1)

    df = df[cols]

    return df

df      = damagesWeed()
finalDf = pd.concat([finalDf, df], ignore_index = True, axis = 0)
print(f'Final dataframe shape: {finalDf.shape}')
print(f'------------------------------------------------')


def infoFruits():
    # -------------------------------------------- Fruits information
    print(f'Merging fruits information...')
    FILE_NAME = 'fruitItems_new.json'
    df = pd.read_json(_PATH + FILE_NAME)
    '''
    columns in source:
    name url cultural_tips pests_and_disorders
    final schema:
    source url title description identification development damage management links
    '''

    df['source'         ] = 'infoFruits'
    df['description'    ] = ''
    df['identification' ] = ''
    df['development'    ] = ''
    df['damage'         ] = ''
    df['management'     ] = ''
    
    df.rename(columns = {
        'name'          : 'title',
        'cultural_tips' : 'links'
    }, inplace = True)

    df['links'] = df['links'].apply(lambda d: d if isinstance(d, list) else [])

    df['links'] = df.apply(lambda r: [
        {
            'type'  : 'tip'     , 
            'src'   : i['link'] , 
            'link'  : ''        ,
            'title' : r['title'] + ' - ' + i['tip']
        } for i in r['links'] if len(i['tip']) > 0], axis = 1)

    df['pests_and_disorders'] = df.apply(lambda r: [
        {
            'type'  : 'problem' , 
            'src'   : i['link'] , 
            'link'  : ''        ,
            'title' : r['title'] + ' - ' + i['problem']
        } for i in r['pests_and_disorders'] if len(i['problem']) > 0], axis = 1)
    
    df.apply(lambda x: x['links'].extend(x['pests_and_disorders' ]), axis = 1)

    df = df[cols]
    return df

df      = infoFruits()
finalDf = pd.concat([finalDf, df], ignore_index = True, axis = 0)
print(f'Final dataframe shape: {finalDf.shape}')
print(f'------------------------------------------------')


def infoVeggies():
    # -------------------------------------------- Veggies information
    print(f'Merging veggies information...')
    FILE_NAME = 'veggieItems_new.json'
    df = pd.read_json(_PATH + FILE_NAME)
    '''
    columns in source:
    name url description tips images pests_and_disorders
    final schema:
    source url title description identification development damage management links
    '''

    df['source'         ] = 'infoVeggies'
    df['identification' ] = ''
    df['development'    ] = ''
    df['damage'         ] = ''
    
    df.rename(columns = {
        'name'  : 'title'       ,
        'tips'  : 'management'  ,
        'images': 'links'       ,
    }, inplace = True)

    df['links'] = df.apply(lambda r: [
        {
            'type'  : 'image'   , 
            'src'   : i['src']  , 
            'link'  : i['link'] ,
            'title' : r['title'] + ' - ' + i['caption']
        } for i in r['links'] if len(i['caption']) > 0], axis = 1)

    df['pests_and_disorders'] = df.apply(lambda r: [
        {
            'type'  : 'problem' , 
            'src'   : i['link'] , 
            'link'  : ''        ,
            'title' : r['title'] + ' - ' + i['problem']
        } for i in r['pests_and_disorders'] if len(i['problem']) > 0], axis = 1)

    df.apply(lambda x: x['links'].extend(x['pests_and_disorders' ]), axis = 1)

    df = df[cols]

    return df

df      = infoVeggies()
finalDf = pd.concat([finalDf, df], ignore_index = True, axis = 0)
print(f'Final dataframe shape: {finalDf.shape}')
print(f'------------------------------------------------')


def infoFlowers():
    # -------------------------------------------- Flowers information
    print(f'Merging flowers information...')
    FILE_NAME = 'plantFlowerItems.json'
    df = pd.read_json(_PATH + FILE_NAME)
    '''
    columns in source:
    name url identification optimum_conditions pests_and_disorders images
    final schema:
    source url title description identification development damage management links
    '''

    df['source'         ] = 'infoFlowers'
    df['description'    ] = ''
    df['development'    ] = ''
    df['damage'         ] = ''

    df.rename(columns = {
        'name'              : 'title'       ,
        'optimum_conditions': 'management'  ,
        'images'            : 'links'       ,
    }, inplace = True)

    df['links'] = df.apply(lambda r: [
        {
            'type'  : 'image'   , 
            'src'   : i['src']  , 
            'link'  : i['link'] ,
            'title' : r['title'] + ' - ' + i['caption']
        } for i in r['links'] if len(i['caption']) > 0], axis = 1)

    df['pests_and_disorders'] = df.apply(lambda r: [
        {
            'type'  : 'problem' , 
            'src'   : i['link'] , 
            'link'  : ''        ,
            'title' : r['title'] + ' - ' + i['problem']
        } for i in r['pests_and_disorders'] if len(i['problem']) > 0], axis = 1)

    df.apply(lambda x: x['links'].extend(x['pests_and_disorders' ]), axis = 1)

    df = df[cols]

    return df

df      = infoFlowers()
finalDf = pd.concat([finalDf, df], ignore_index = True, axis = 0)
print(f'Final dataframe shape: {finalDf.shape}')
print(f'------------------------------------------------')


def _clean(text):
    '''
    Fix encodings and remove escape and redundant whitespace characters from text.
    '''
    text = text.encode('ascii', 'ignore').decode()
    text = re.sub(r'\s+', ' ', text).strip()
    return text

colsVector = ['title', 'description', 'identification', 'development', 'damage', 'management']
for c in colsVector:
    finalDf[c] = finalDf[c].apply(_clean)

print(f'Fix encodings and remove escape and redundant whitespace characters from text.')
print(f'------------------------------------------------')

print(f'Final dataframe shape: {finalDf.shape    }')
print(f'FINISHED')

finalDf.sample(5)

Merging pests diseases...
Final dataframe shape: (28, 9)
Merging pests diseases...
Final dataframe shape: (547, 9)
------------------------------------------------
Merging turf pests...
Final dataframe shape: (586, 9)
------------------------------------------------
Merging exotic pests...
Final dataframe shape: (617, 9)
------------------------------------------------
Merging fruit and veggie damages...
Final dataframe shape: (840, 9)
------------------------------------------------
Merging weed damages...
Final dataframe shape: (1014, 9)
------------------------------------------------
Merging fruits information...
Final dataframe shape: (1029, 9)
------------------------------------------------
Merging veggies information...
Final dataframe shape: (1060, 9)
------------------------------------------------
Merging flowers information...
Final dataframe shape: (1246, 9)
------------------------------------------------
Fix encodings and remove escape and redundant whitespace characters

Unnamed: 0,source,url,title,description,identification,development,damage,management,links
445,pestsDiseases,http://ipm.ucanr.edu/PMG/GARDEN/FRUIT/DISEASE/...,Viral diseases,Loopers are often killed by a granulosis virus...,,,,,"[{'type': 'image', 'src': 'http://ipm.ucanr.ed..."
739,damagesEnvironment,http://ipm.ucanr.edu/PMG/GARDEN/ENVIRON/nonpar...,Nonparasitic gall,Nonparasitic galls are globular galls that occ...,,,,There is no control other than eliminating the...,[]
1138,infoFlowers,http://ipm.ucanr.edu/PMG/GARDEN/PLANTS/tanbark...,"Tanbark oak, Tanoak, Tan oak",,Tanbark oak is an evergreen tree that can grow...,,,Tanbark oaks are often grown as street or lawn...,"[{'type': 'image', 'src': 'http://ipm.ucanr.ed..."
55,pestsDiseases,http://ipm.ucanr.edu/PMG/GARDEN/VEGES/DISEASES...,Viral diseases,Virus diseases can be a very effective natural...,,,,,"[{'type': 'image', 'src': 'http://ipm.ucanr.ed..."
857,damagesWeed,http://ipm.ucanr.edu/PMG/WEEDS/willowherbs.htm...,Willowherbs,Willowherbs are native broadleaf plants but us...,,,,,"[{'type': 'image', 'src': 'http://ipm.ucanr.ed..."


## ETL of data

In [5]:
cols = ['source', 'url', 'title', 'description', 'identification', 'development', 'damage', 'management', 'links']

def infoFruitVegCultural():
    # -------------------------------------------- Fruit and veggie cultural tips
    print(f'Merging fruit and veggie cultural tips..')
    FILE_NAME = 'FruitVegCulturalItems.json'
    df = pd.read_json(_PATH + FILE_NAME)
    '''
    columns in source:
    name url description images tips_table
    final schema:
    source url title description identification development damage management links
    '''

    df['source'         ] = 'infoFruitVegCultural'        
    df['identification' ] = ''
    df['development'    ] = ''
    df['damage'         ] = ''
    df['management'     ] = ''

    df.rename(columns = {
        'name'  : 'title',
        'images': 'links'
    }, inplace = True)

    df['links'] = df.apply(lambda r: [
        {
            'type'  : 'image'   , 
            'src'   : i['src']  , 
            'link'  : ''        ,
            'title' : r['title'] + ' - ' + i['caption']
        } for i in r['links'] if len(i['caption']) > 0], axis = 1)

    df['tips_table'] = df.apply(lambda r: [
        {
            'type'  : 'problem' , 
            'src'   : r['url'] , 
            'link'  : ''        ,
            'title' : r['title'] + ' - ' + i['header']
        } for i in r['tips_table'] if 'header' in i and len(i['header']) > 0], axis = 1)

    df.apply(lambda x: x['links'].extend(x['tips_table' ]), axis = 1)

    df = df[cols]
    
    return df

df      = infoFruitVegCultural()
finalDf = pd.concat([finalDf, df], ignore_index = True, axis = 0)
print(f'Final dataframe shape: {finalDf.shape}')
print(f'------------------------------------------------')


def infoPestControl():
    # -------------------------------------------- Garden pest control
    print(f'Merging garden pest control information...')
    FILE_NAME = 'GardenControlsPestItems.json'
    df = pd.read_json(_PATH + FILE_NAME)
    '''
    columns in source:
    name url description images
    final schema:
    source url title description identification development damage management links
    '''

    df['source'         ] = 'infoPestControl'
    df['identification' ] = ''
    df['development'    ] = ''
    df['damage'         ] = ''
    df['management'     ] = ''

    df.rename(columns = {
        'name'  : 'title',
        'images': 'links'
    }, inplace = True)

    df['links'] = df['links'].apply(lambda d: d if isinstance(d, list) else [])

    df['links'] = df.apply(lambda r: [
        {
            'type'  : 'image'                , 
            'src'   : i.get('src'       , ''), 
            'link'  : i.get('link'      , ''),
            'title' : r['title'] + ' - ' + i['caption']
        } for i in r['links'] if len(i['caption']) > 0], axis = 1)

    df = df[cols]
    
    return df

df      = infoPestControl()
finalDf = pd.concat([finalDf, df], ignore_index = True, axis = 0)
print(f'Final dataframe shape: {finalDf.shape}')
print(f'------------------------------------------------')


def infoPesticideControl():
    # -------------------------------------------- Garden pesticide control
    print(f'Merging garden pesticide control information...')
    FILE_NAME = 'GardenControlsPesticideItems.json'
    df = pd.read_json(_PATH + FILE_NAME)
    '''
    columns in source:
    active_ingredient url pesticide_type information
    final schema:
    source url title description identification development damage management links
    '''

    df['source'         ] = 'infoPesticideControl'
    df['identification' ] = ''
    df['development'    ] = ''
    df['damage'         ] = ''
    df['management'     ] = ''
    
    df['title'          ] = df[['active_ingredient', 'pesticide_type']].agg(' - '.join, axis=1)
    df['description'    ] = df['information'].str[0].apply(lambda x: x['associated_pests'])
    df['links'          ] = [[] for _ in range(len(df))]

    df = df[cols]

    return df

df      = infoPesticideControl()
finalDf = pd.concat([finalDf, df], ignore_index = True, axis = 0)
print(f'Final dataframe shape: {finalDf.shape}')
print(f'------------------------------------------------')


def pestsNotes():
    # -------------------------------------------- Pests IPM
    print(f'Merging pests notes...')
    FILE_NAME = 'PestNotes.json'
    df = pd.read_json(_PATH + FILE_NAME)
    '''
    columns in source:
    name urlPestNote descriptionPestNote lifecyclePestNote damagePestNote managementPestNote imagePestNote tablePestNote
    final schema:
    source url title description identification development damage management links
    '''

    df['source'         ] = 'pestsNotes'
    df['identification' ] = ''

    df.rename(columns = {
        'urlPestNote'           : 'url'         ,
        'name'                  : 'title'       ,
        'descriptionPestNote'   : 'description' ,
        'lifecyclePestNote'     : 'development' ,
        'damagePestNote'        : 'damage'      ,
        'managementPestNote'    : 'management'  ,
        'imagePestNote'         : 'links'       ,
    }, inplace = True)

    df['links'] = df.apply(lambda r: [
        {
            'type'  : 'image'   , 
            'src'   : i['src']  , 
            'link'  : i['link'] ,
            'title' : r['title'] + ' - ' + i['caption']
        } for i in r['links'] if len(i['caption']) > 0], axis = 1)

    df = df[cols]
    
    return df

df      = pestsNotes()
finalDf = pd.concat([finalDf, df], ignore_index = True, axis = 0)
print(f'Final dataframe shape: {finalDf.shape}')
print(f'------------------------------------------------')


def pestsQuickTips():
    # -------------------------------------------- Quick tips on pests
    print(f'Merging pests quick notes...')
    FILE_NAME = 'QuickTips.json'
    df = pd.read_json(_PATH + FILE_NAME)
    '''
    columns in source:
    name urlQuickTip contentQuickTips imageQuickTips
    final schema:
    source url title description identification development damage management links
    '''

    df['source'        ] = 'pestsQuickTips'
    df['identification'] = ''
    df['development'   ] = ''
    df['damage'        ] = ''
    df['management'    ] = ''

    df.rename(columns = {
        'urlQuickTip'           : 'url'         ,
        'name'                  : 'title'       ,
        'contentQuickTips'      : 'description' ,
        'imageQuickTips'        : 'links'       ,
    }, inplace = True)

    df['links'] = df.apply(lambda r: [
        {
            'type'  : 'image'   , 
            'src'   : i['src']  , 
            'link'  : i['link'] ,
            'title' : r['title'] + ' - ' + i['caption']
        } for i in r['links'] if len(i['caption']) > 0], axis = 1)

    df = df[cols]
    
    return df

df      = pestsQuickTips()
finalDf = pd.concat([finalDf, df], ignore_index = True, axis = 0)
print(f'Final dataframe shape: {finalDf.shape}')
print(f'------------------------------------------------')

def pestsVideos():
    # -------------------------------------------- Videos of UC IPM YouTube data
    print(f'Merging UC IPM YouTube data...')
    FILE_NAME = 'Videos.json'
    df = pd.read_json(_PATH + FILE_NAME)
    '''
    columns in source:
    title url description
    final schema:
    source url title description identification development damage management links
    '''

    df['source'         ] = 'pestsVideos'
    df['identification' ] = ''
    df['development'    ] = ''
    df['damage'         ] = ''
    df['management'     ] = ''

    df['links'          ] = [[] for _ in range(len(df))]

    df = df[cols]

    return df

df      = pestsVideos()
finalDf = pd.concat([finalDf, df], ignore_index = True, axis = 0)
print(f'Final dataframe shape: {finalDf.shape    }')
print(f'------------------------------------------------')


def pestsWeed():
    # -------------------------------------------- Weed related pests
    print(f'Merging weed related pests...')
    FILE_NAME = 'WeedIdItems.json'
    df = pd.read_json(_PATH + FILE_NAME)
    '''
    columns in source:
    name url description images
    final schema:
    source url title description identification development damage management links
    '''

    df['source'         ] = 'pestsWeed'
    df['identification' ] = ''
    df['development'    ] = ''
    df['damage'         ] = ''
    df['management'     ] = ''

    df.rename(columns = {
        'name'  : 'title',
        'images': 'links'
    }, inplace = True)

    df['links'] = df.apply(lambda r: [
        {
            'type'  : 'image'   , 
            'src'   : i['src']  , 
            'link'  : ''        ,
            'title' : r['title'] + ' - ' + i['caption']
        } for i in r['links'] if len(i['caption']) > 0], axis = 1)

    df = df[cols]

    return df

df      = pestsWeed()
finalDf = pd.concat([finalDf, df], ignore_index = True, axis = 0)
print(f'Final dataframe shape: {finalDf.shape}')
print(f'------------------------------------------------')

def _clean(text):
    '''
    Fix encodings and remove escape and redundant whitespace characters from text.
    '''
    text = text.encode('ascii', 'ignore').decode()
    text = re.sub(r'\s+', ' ', text).strip()
    return text

colsVector = ['title', 'description', 'identification', 'development', 'damage', 'management']
for c in colsVector:
    finalDf[c] = finalDf[c].apply(_clean)

print(f'Fix encodings and remove escape and redundant whitespace characters from text.')
print(f'------------------------------------------------')

print(f'Final dataframe shape: {finalDf.shape    }')
print(f'FINISHED')

finalDf.sample(5)

Merging fruit and veggie cultural tips..
Final dataframe shape: (1375, 9)
------------------------------------------------
Merging garden pest control information...
Final dataframe shape: (1395, 9)
------------------------------------------------
Merging garden pesticide control information...
Final dataframe shape: (1414, 9)
------------------------------------------------
Merging pests notes...
Final dataframe shape: (1586, 9)
------------------------------------------------
Merging pests quick notes...
Final dataframe shape: (1640, 9)
------------------------------------------------
Merging UC IPM YouTube data...
Final dataframe shape: (1698, 9)
------------------------------------------------
Merging weed related pests...
Final dataframe shape: (1726, 9)
------------------------------------------------
Fix encodings and remove escape and redundant whitespace characters from text.
------------------------------------------------
Final dataframe shape: (1726, 9)
FINISHED


Unnamed: 0,source,url,title,description,identification,development,damage,management,links
896,damagesWeed,http://ipm.ucanr.edu/PMG/WEEDS/longspine_sandb...,Longspine sandbur,"Longspine sandbur, a loosely clumped grass, is...",,,,,"[{'type': 'image', 'src': 'http://ipm.ucanr.ed..."
1594,pestsQuickTips,http://ipm.ucanr.edu/QT/groundsquirrelcard.htm...,Ground Squirrel,Pests in Gardens and Landscapes: Quick Tips Gr...,,,,,"[{'type': 'image', 'src': 'http://ipm.ucanr.ed..."
31,pestsDiseases,http://ipm.ucanr.edu/PMG/GARDEN/VEGES/PESTS/ca...,Carrot rust fly,The larval forms of the rust fly are stiff whi...,,,Carrot rust fly larvae cause surface tunnels i...,Plant carrots to avoid the egg-laying periods ...,"[{'type': 'image', 'src': 'http://ipm.ucanr.ed..."
82,pestsDiseases,http://ipm.ucanr.edu/PMG/GARDEN/VEGES/DISEASES...,Basal rot of onions and garlic,Plants affected by basal rot show progressive ...,,,,The basal rot fungus survives indefinitely in ...,"[{'type': 'image', 'src': 'http://ipm.ucanr.ed..."
50,pestsDiseases,http://ipm.ucanr.edu/PMG/GARDEN/VEGES/PESTS/sq...,Squash bug,"Adult squash bugs are 0.63 inch long, grayish ...",,Eggs are laid in the spring through midsummer ...,Leaves develop small specks that turn yellow a...,Remove all debris from the garden once the cro...,"[{'type': 'image', 'src': 'http://ipm.ucanr.ed..."


In [6]:
finalDf['url'] = finalDf['url'].apply(lambda x: f"{x}?src=exchbt" if '?src=exchbt' not in x else x)

In [7]:
finalDf

Unnamed: 0,source,url,title,description,identification,development,damage,management,links
0,naturalEnemies,https://www2.ipm.ucanr.edu/natural-enemies/ass...,Assassin Bugs,,Assassin bug adults and nymphs (immatures) hav...,True bugs develop through three life stages. E...,,,"[{'type': 'image', 'src': 'https://www2.ipm.uc..."
1,naturalEnemies,https://www2.ipm.ucanr.edu/natural-enemies/aph...,Aphid flies,,Adults are stocky flies with a body that is 1/...,Aphid flies develop through four life stages :...,,,"[{'type': 'image', 'src': 'https://www2.ipm.uc..."
2,naturalEnemies,https://www2.ipm.ucanr.edu/natural-enemies/aph...,Aphid midge,,Adults are delicate flies with long slender an...,Aphid midges develop through 4 life stages : e...,,,"[{'type': 'image', 'src': 'https://www2.ipm.uc..."
3,naturalEnemies,https://www2.ipm.ucanr.edu/natural-enemies/big...,Bigeyed bugs,,Bigeyed bug adults and nymphs (immatures) are ...,Bigeyed bugs develop through three life stages...,,,"[{'type': 'image', 'src': 'https://www2.ipm.uc..."
4,naturalEnemies,https://www2.ipm.ucanr.edu/natural-enemies/bro...,Brown lacewings,,Adults are soft-bodied and have 2 pairs of hai...,Lacewings develop though four life stages : eg...,,,"[{'type': 'image', 'src': 'https://www2.ipm.uc..."
...,...,...,...,...,...,...,...,...,...
1721,pestsWeed,http://ipm.ucanr.edu/PMG/WEEDS/ID/ligules.html...,Ligules,A ligule is an outgrowth from the sheath. Beca...,,,,,"[{'type': 'image', 'src': 'http://ipm.ucanr.ed..."
1722,pestsWeed,http://ipm.ucanr.edu/PMG/WEEDS/ID/flowerheads....,Grass flower heads,The flowering stem (flower head) is useful in ...,,,,,"[{'type': 'image', 'src': 'http://ipm.ucanr.ed..."
1723,pestsWeed,http://ipm.ucanr.edu/PMG/WEEDS/ID/growth.html?...,Growth habits (grasses),Grasses have either a bunchy growth habit or c...,,,,,"[{'type': 'image', 'src': 'http://ipm.ucanr.ed..."
1724,pestsWeed,http://ipm.ucanr.edu/PMG/WEEDS/ID/leaftips.htm...,Leaves (leaf blades and tips),"Leaves may be flat, rolled, or twisted, and va...",,,,,"[{'type': 'image', 'src': 'http://ipm.ucanr.ed..."


In [8]:
import pandas as pd 

label_df = pd.read_excel('ucipm_resources.xlsx')
labels = {row['url']: row['InsIde/OutsIde/Both'] for i, row in label_df.iterrows()}

In [9]:
labels

{'http://ipm.ucanr.edu/PMG/GARDEN/FRUIT/ENVIRON/applefrost.html': 'O ',
 'http://ipm.ucanr.edu/PMG/GARDEN/FRUIT/ENVIRON/frost.html': 'O',
 'http://ipm.ucanr.edu/PMG/GARDEN/FRUIT/ENVIRON/appleburn.html': 'O',
 'http://ipm.ucanr.edu/PMG/GARDEN/FRUIT/ENVIRON/applefertilizing.html': 'O',
 'http://ipm.ucanr.edu/PMG/GARDEN/FRUIT/ENVIRON/almondtraining.html': 'O',
 'http://ipm.ucanr.edu/PMG/GARDEN/FRUIT/ENVIRON/almondpruning.html': 'O',
 'http://ipm.ucanr.edu/PMG/GARDEN/FRUIT/ENVIRON/frostinjury.html': 'O',
 'http://ipm.ucanr.edu/PMG/GARDEN/FRUIT/ENVIRON/frostdamage.html': 'O',
 'http://ipm.ucanr.edu/PMG/GARDEN/FRUIT/ENVIRON/figpruning.html': 'O',
 'http://ipm.ucanr.edu/PMG/GARDEN/FRUIT/ENVIRON/figdroop.html': 'O',
 'http://ipm.ucanr.edu/PMG/GARDEN/FRUIT/ENVIRON/dieback.html': 'O',
 'http://ipm.ucanr.edu/PMG/GARDEN/FRUIT/ENVIRON/falsewart.html': 'O',
 'http://ipm.ucanr.edu/PMG/GARDEN/FRUIT/ENVIRON/worming.html': 'O',
 'http://ipm.ucanr.edu/PMG/GARDEN/FRUIT/ENVIRON/fgfertilizing.html': 'O',
 '

In [10]:
def apply_label(url):
    if url in labels:
        return labels[url]
    else:
        print(url)
        return ""
finalDf['label'] = finalDf.apply(lambda x: apply_label(x['url']), axis=1)

https://www2.ipm.ucanr.edu/natural-enemies/assassin-bugs?src=exchbt
https://www2.ipm.ucanr.edu/natural-enemies/aphid-flies?src=exchbt
https://www2.ipm.ucanr.edu/natural-enemies/aphid-midge?src=exchbt
https://www2.ipm.ucanr.edu/natural-enemies/bigeyed-bugs?src=exchbt
https://www2.ipm.ucanr.edu/natural-enemies/brown-lacewings?src=exchbt
https://www2.ipm.ucanr.edu/natural-enemies/convergent-lady-beetle?src=exchbt
https://www2.ipm.ucanr.edu/natural-enemies/damsel-bugs?src=exchbt
https://www2.ipm.ucanr.edu/natural-enemies/decollate-snail?src=exchbt
https://www2.ipm.ucanr.edu/natural-enemies/dustywings?src=exchbt
https://www2.ipm.ucanr.edu/natural-enemies/euseius-predatory-mites?src=exchbt
https://www2.ipm.ucanr.edu/natural-enemies/green-lacewings?src=exchbt
https://www2.ipm.ucanr.edu/natural-enemies/mantids?src=exchbt
https://www2.ipm.ucanr.edu/natural-enemies/mealybug-destroyer?src=exchbt
https://www2.ipm.ucanr.edu/natural-enemies/minute-pirate-bugs?src=exchbt
https://www2.ipm.ucanr.edu/na

In [11]:
finalDf['question'] = ""

In [12]:
finalDf.columns

Index(['source', 'url', 'title', 'description', 'identification',
       'development', 'damage', 'management', 'links', 'label', 'question'],
      dtype='object')

# Ask Extension Data ingestion
Loading in data from california only

In [13]:
df = pd.read_json('./chunked_data/ask_only.json')
STATE_FILTER    = ['California']
df['source'] =  'askExtension'
df = df[df['state'].isin(STATE_FILTER)]
df['identification' ] = ''
df['development'    ] = ''
df['damage'         ] = ''
df['management'     ] = ''
df['label'] = ''
df['links'] = df.apply(lambda r: [], axis=1)
df.rename(columns= {"subHead": "question", "text": "description"}, inplace=True)
df.drop(['state', 'thumbnail'], axis = 1, inplace = True)
df

Unnamed: 0,source,title,url,description,question,identification,development,damage,management,label,links
39,askExtension,Christmas trees,https://ask2.extension.org/kb/faq.php?id=110230,"Starting on January 1, 2013, California retail...",Is there a tax when trees are harvested and br...,,,,,,[]
70,askExtension,cattle guard a.k.a. Texas gate for feral pigs?,https://ask2.extension.org/kb/faq.php?id=110470,There has been a lot of research going on and ...,Has anyone looked into scaling and designing a...,,,,,,[]
101,askExtension,Desprouting Seed Potatoes,https://ask2.extension.org/kb/faq.php?id=110680,Hello form Ask an Expert. We are sorry for the...,When storing potatoes over the winter for spri...,,,,,,[]
102,askExtension,Sorghum for Syrup and Seed,https://ask2.extension.org/kb/faq.php?id=110681,All publicly available sweet sorghum varieties...,Are there any varieties of Sorghum (Sorghum bi...,,,,,,[]
588,askExtension,Plum tree,https://ask2.extension.org/kb/faq.php?id=113202,I would go with a Methley or a Bruce plum. Bob...,I would like to send a housewarming gift to my...,,,,,,[]
...,...,...,...,...,...,...,...,...,...,...,...
282996,askExtension,soil testing,https://ask2.extension.org/kb/faq.php?id=811727,"Yes, but the lab will be permanently closing a...","If I send in the sample, can you do soil testi...",,,,,,[]
282997,askExtension,soil testing,https://ask2.extension.org/kb/faq.php?id=811727,Go to soiltesting.msu.edu for information on h...,"If I send in the sample, can you do soil testi...",,,,,,[]
283640,askExtension,Iceberg rose has very small leaves,https://ask2.extension.org/kb/faq.php?id=812323,"Good Morning Bruce, I looked up iceberg rose a...",Please look at the photos attached and let me ...,,,,,,[]
285740,askExtension,Contaminated well water and parasites,https://ask2.extension.org/kb/faq.php?id=814446,"Thank you for your question. However, the topi...",I had water tested and it came back positive f...,,,,,,[]


In [14]:
finalDf = pd.concat([finalDf, df], ignore_index = True, axis = 0)
print(f'Final dataframe shape: {finalDf.shape}')
print(f'------------------------------------------------')

finalDf

Final dataframe shape: (4770, 11)
------------------------------------------------


Unnamed: 0,source,url,title,description,identification,development,damage,management,links,label,question
0,naturalEnemies,https://www2.ipm.ucanr.edu/natural-enemies/ass...,Assassin Bugs,,Assassin bug adults and nymphs (immatures) hav...,True bugs develop through three life stages. E...,,,"[{'type': 'image', 'src': 'https://www2.ipm.uc...",,
1,naturalEnemies,https://www2.ipm.ucanr.edu/natural-enemies/aph...,Aphid flies,,Adults are stocky flies with a body that is 1/...,Aphid flies develop through four life stages :...,,,"[{'type': 'image', 'src': 'https://www2.ipm.uc...",,
2,naturalEnemies,https://www2.ipm.ucanr.edu/natural-enemies/aph...,Aphid midge,,Adults are delicate flies with long slender an...,Aphid midges develop through 4 life stages : e...,,,"[{'type': 'image', 'src': 'https://www2.ipm.uc...",,
3,naturalEnemies,https://www2.ipm.ucanr.edu/natural-enemies/big...,Bigeyed bugs,,Bigeyed bug adults and nymphs (immatures) are ...,Bigeyed bugs develop through three life stages...,,,"[{'type': 'image', 'src': 'https://www2.ipm.uc...",,
4,naturalEnemies,https://www2.ipm.ucanr.edu/natural-enemies/bro...,Brown lacewings,,Adults are soft-bodied and have 2 pairs of hai...,Lacewings develop though four life stages : eg...,,,"[{'type': 'image', 'src': 'https://www2.ipm.uc...",,
...,...,...,...,...,...,...,...,...,...,...,...
4765,askExtension,https://ask2.extension.org/kb/faq.php?id=811727,soil testing,"Yes, but the lab will be permanently closing a...",,,,,[],,"If I send in the sample, can you do soil testi..."
4766,askExtension,https://ask2.extension.org/kb/faq.php?id=811727,soil testing,Go to soiltesting.msu.edu for information on h...,,,,,[],,"If I send in the sample, can you do soil testi..."
4767,askExtension,https://ask2.extension.org/kb/faq.php?id=812323,Iceberg rose has very small leaves,"Good Morning Bruce, I looked up iceberg rose a...",,,,,[],,Please look at the photos attached and let me ...
4768,askExtension,https://ask2.extension.org/kb/faq.php?id=814446,Contaminated well water and parasites,"Thank you for your question. However, the topi...",,,,,[],,I had water tested and it came back positive f...


In [15]:
from spacy.lang.en import English
import pickle
from sentence_transformers import SentenceTransformer
from elasticsearch import Elasticsearch

# Loading model 
embed_url = "JeffEduworks/generalized_chatbot_model"
auth_token = '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'
embed = SentenceTransformer(embed_url, use_auth_token = auth_token, device = 'cuda')

# Initializing elastic search client
host_name = 'https://qa.ucipm.es.chat.ask.eduworks.com/'
host_name = 'https://dev.ucipm.es.chat.ask.eduworks.com/'
es_client = Elasticsearch([host_name], http_auth=('elastic', 'changeme'), timeout = 20)

# Loading synonym dictionary 
tokenizer = English().tokenizer
with open('./synonym_data/synonym_pest.pickle', 'rb') as handle:
    synonym_dict = pickle.load(handle)

  from .autonotebook import tqdm as notebook_tqdm


### Transforming textual data

In [16]:
CHUNK_SIZE      = 1
ROLLING_SIZE    = 3

colsVector = ['title', 'description', 'identification', 'development', 'damage', 'management']
print(f'Final DF: Transforming columns - {colsVector} and links titles.')

from spacy.lang.en import English 

nlp = English()
nlp.add_pipe('sentencizer')
raw_text = finalDf.iloc[0]['description']

print(f'STARTING TRANSFORMING')
c_items = []
for i, r in finalDf.iterrows():
    r_texts = []
    for c in colsVector:
        t = r[c]
        
        doc = nlp(t)
        
        ts = [sent for sent in doc.sents]
        if len(ts) == 0:
            continue
        else:
            chunks, chunk_size, roll_size = len(ts), CHUNK_SIZE, ROLLING_SIZE
            ts = [ts[i1:i1+chunk_size+(roll_size - 1)] for i1 in range(0, chunks - (roll_size - 1), chunk_size)]
            ts = [{'text': ' '.join([l2.text for l2 in l1]), 'name': c + '_' + str(i1), 'start': l1[0].start_char, 'end': l1[-1].end_char} for i1, l1 in enumerate(ts)]
                
        r_texts.extend(ts)

    try:
        ts = [ i1['title'] for i1 in r['links']]
    except:
        print(r)
        raise ValueError('error')
    if len(ts) == 0:
        c_items.append(r_texts)
        if (i+1) % 500 == 0:
            print(f'Finished transforming of {i+1} rows of dataframe')
        continue

    for i1, v in enumerate(ts):
        r_texts.append({'text': v, 'name': 'links_' + str(i1), 'start': 0, 'end': -1})
    
    c_items.append(r_texts)

    if (i+1) % 500 == 0:
        print(f'Finished transforming of {i+1} rows of dataframe')

print(f'Finished transforming of {i+1} rows of dataframe')
print(f'FINISHED TRANSFORMING')

Final DF: Transforming columns - ['title', 'description', 'identification', 'development', 'damage', 'management'] and links titles.
STARTING TRANSFORMING
Finished transforming of 500 rows of dataframe
Finished transforming of 1000 rows of dataframe
Finished transforming of 1500 rows of dataframe
Finished transforming of 2000 rows of dataframe
Finished transforming of 2500 rows of dataframe
Finished transforming of 3000 rows of dataframe
Finished transforming of 3500 rows of dataframe
Finished transforming of 4000 rows of dataframe
Finished transforming of 4500 rows of dataframe
Finished transforming of 4770 rows of dataframe
FINISHED TRANSFORMING


### Replacing synonyms

In [17]:
print(f'START REPLACING')

texts = [r1['text'] for r in c_items for r1 in r]
texts_modified = []

for i, text in enumerate(texts):
    tokens = tokenizer(text)
    modified = ""
    replace = False
    for token in tokens:
        t = token.text.lower()
        
        if t in synonym_dict:
            modified += synonym_dict[t]
            modified += token.whitespace_
            replace = True
        else:
            modified += token.text_with_ws
    
    if not replace:
        modified = text
    texts_modified.append(modified)

    if (i+1) % 10000 == 0:
        print(f'Finished replacing synonyms of {i+1} items of sentences')

print(f'Finished replacing synonyms of {i+1} items of sentences')
print(f'FINISHED REPLACING')

START REPLACING
Finished replacing synonyms of 10000 items of sentences
Finished replacing synonyms of 20000 items of sentences
Finished replacing synonyms of 30000 items of sentences
Finished replacing synonyms of 40000 items of sentences
Finished replacing synonyms of 50000 items of sentences
Finished replacing synonyms of 53610 items of sentences
FINISHED REPLACING


### Checking for invalid links

In [18]:
count_ucipm = 0
count_askextension = 0
for i, r in finalDf.iterrows():
    if len(r['url']) < 10:
        print(f'Source with no main link at row {i} of data frame, main link - {r["url"]}.')
    links = r['links']
    no_link = False
    show_main_url = False
    for l in links:
        url = r['url']
        if len(l['src']) < 10:
            no_link = True
            if not show_main_url:
                show_main_url = True
                print(f'Links at {url}')
            print(l)
    if no_link:
        if r['source'] == 'askExtension':
            count_askextension += 1
        else:
            count_ucipm += 1

print(f'Number of sources from AskExtension with no link urls - {count_askextension}')
print(f'Number of sources from UC IPM with no link urls - {count_ucipm}')

Links at http://ipm.ucanr.edu/PMG/GARDEN/FRUIT/walnut.html?src=exchbt
{'type': 'problem', 'src': '', 'link': '', 'title': 'Walnuts - Dusky-veined walnut aphid'}
{'type': 'problem', 'src': '', 'link': '', 'title': 'Walnuts - Walnut aphid'}
{'type': 'problem', 'src': '', 'link': '', 'title': 'Walnuts - San Jose scale'}
{'type': 'problem', 'src': '', 'link': '', 'title': 'Walnuts - Walnut scale'}
{'type': 'problem', 'src': '', 'link': '', 'title': 'Walnuts - Bark beetles and borers'}
{'type': 'problem', 'src': '', 'link': '', 'title': 'Walnuts - Mites'}
{'type': 'problem', 'src': '', 'link': '', 'title': 'Walnuts - European fruit lecanium'}
{'type': 'problem', 'src': '', 'link': '', 'title': 'Walnuts - Frosted scale'}
Links at http://ipm.ucanr.edu/PMG/GARDEN/FRUIT/citrus.html?src=exchbt
{'type': 'problem', 'src': '', 'link': '', 'title': 'Citrus - California red scale'}
{'type': 'problem', 'src': '', 'link': '', 'title': 'Citrus - Purple scale'}
{'type': 'problem', 'src': '', 'link': '', 

### Embedding textual data

In [19]:
finalDf.to_json('final.json', orient='records', indent=4)

In [20]:
BATCH_SIZE = 64

print(f'STARTING EMBEDDING')

finalDf['vectors'] = np.empty((len(finalDf), 0)).tolist()

# Sentence Encoder model        
vectors = embed.encode(
    sentences           = texts_modified,
    batch_size          = BATCH_SIZE    ,
    show_progress_bar   = True
).tolist()

index = 0
for i, r in enumerate(c_items):
    for i1, r1 in enumerate(r):
        r1['vector'] = vectors[index]
        r1.pop('text')
        index += 1

print(f'FINISHED EMBEDDING')
print(finalDf.columns)
finalDf['vectors'] = c_items
print(f'The number of vectors to be ingested: {len([r1["vector"] for r in finalDf["vectors"] for r1 in r])}')        
finalDf.sample(5)

STARTING EMBEDDING


Batches: 100%|██████████| 838/838 [01:06<00:00, 12.53it/s]


FINISHED EMBEDDING
Index(['source', 'url', 'title', 'description', 'identification',
       'development', 'damage', 'management', 'links', 'label', 'question',
       'vectors'],
      dtype='object')
The number of vectors to be ingested: 53610


Unnamed: 0,source,url,title,description,identification,development,damage,management,links,label,question,vectors
3000,askExtension,https://ask2.extension.org/kb/faq.php?id=333463,Can you tell me what kind of tree this is?,It looks like it might be some kind of a beech...,,,,,[],,I am trying to figure out what kind of tree th...,"[{'name': 'description_0', 'start': 0, 'end': ..."
1379,infoPestControl,http://ipm.ucanr.edu/PMG/GARDEN/CONTROLS/virus...,Viral diseases,Caterpillars are often killed by diseases caus...,,,,,"[{'type': 'image', 'src': 'http://ipm.ucanr.ed...",,,"[{'name': 'description_0', 'start': 0, 'end': ..."
1160,infoFlowers,http://ipm.ucanr.edu/PMG/GARDEN/PLANTS/honeylo...,Honey locust,,Plant identification Honey locust trees are de...,,,"Honey locusts are fast growing, hardy, and ada...","[{'type': 'image', 'src': 'http://ipm.ucanr.ed...",,,"[{'name': 'identification_0', 'start': 0, 'end..."
2561,askExtension,https://ask2.extension.org/kb/faq.php?id=286504,Orchid Cacti cuttings planted upside-down?,1 to 2 inches deep in the soil. Two (and only ...,,,,,[],,Cal Peterson has a great instruction sheet onl...,"[{'name': 'description_0', 'start': 0, 'end': ..."
4284,askExtension,https://ask2.extension.org/kb/faq.php?id=632795,What are these tiny bugs?,The enlarged image looks like a collembola to ...,,,,,[],,These are everywhere on my window sills and pa...,"[{'name': 'description_0', 'start': 0, 'end': ..."


## Ingesting data into ES

In [21]:
# Different embedding sizes depending on the models
# VECTOR_SIZE = 384
# VECTOR_SIZE = 512
VECTOR_SIZE = 768

mapping  = {
    "settings": {"number_of_shards": 2, "number_of_replicas": 1},
    "mappings": {
        "dynamic"   : "false",
        "_source"   : {"enabled": "true"},
        "properties": {
            "source"        : {"type": "keyword", "index": "true" , "ignore_above": 32766},
            "url"           : {"type": "keyword", "index": "false", "ignore_above": 32766},
            "label"        : {"type": "keyword", "index": "false",  "ignore_above": 32766},
            "question"      : {"type": "keyword", "index": "false", "ignore_above": 32766},
            "title"         : {"type": "keyword", "index": "false", "ignore_above": 32766},
            "description"   : {"type": "keyword", "index": "false", "ignore_above": 32766},
            "identification": {"type": "keyword", "index": "false", "ignore_above": 32766},
            "development"   : {"type": "keyword", "index": "false", "ignore_above": 32766},
            "damage"        : {"type": "keyword", "index": "false", "ignore_above": 32766},
            "management"    : {"type": "keyword", "index": "false", "ignore_above": 32766},
            "vectors"       : {
                "type"      : "nested",
                "properties": {
                    "vector": {
                        "type": "dense_vector", 
                        "dims": VECTOR_SIZE
                    },
                    "name"  : {"type": "keyword", "index": "false", "ignore_above": 32766},
                    "start" : {"type": "integer"                                         },
                    "end"   : {"type": "integer"                                         },
                }
            },
            "links"         : {
                "type"      : "nested",
                "properties": {
                    "type"  : {"type": "keyword", "index": "false", "ignore_above": 32766},
                    "src"   : {"type": "keyword", "index": "false", "ignore_above": 32766},
                    "link"  : {"type": "keyword", "index": "false", "ignore_above": 32766},
                    "title" : {"type": "keyword", "index": "false", "ignore_above": 32766}
                }
            }
        }
    }
}

final_json = finalDf.to_dict('records')


In [27]:
import json
for item in final_json:
    if "spider mite rove" in item['title'].lower():
        print(json.dumps(item, indent=4))
        break

{
    "source": "naturalEnemies",
    "url": "https://www2.ipm.ucanr.edu/natural-enemies/spider-mite-rove-beetle?src=exchbt",
    "title": "Spider mite rove beetle",
    "description": "",
    "identification": "The adult is black and about 1/25 inch (1 mm) long. It has shortened wing covers that expose the abdominal segments when viewed from above. Characteristically, Oligota adults have a pointed abdomen that curves upward at the rear end and a head that is bent downward or under the body so the head is not apparent when viewed from above. Eggs are oval, yellowish to pale orange, and a little over 1/100 inch (0.3 mm) long. Oligota eggs commonly are hidden beneath the skins of mites. The female rove beetle covers her eggs with the skins immediately after she lays her eggs. Larvae are cylindrical, elongate, pale yellowish, and covered with fine spines. Larvae have distinct segments and a prominent black spot (hardened dorsal plates) on top the eighth abdominal segment, near the rear en

In [22]:
set([type(item['damage']) for item in final_json ])

{str}

In [23]:
from elasticsearch import Elasticsearch
from elasticsearch.helpers import parallel_bulk

from collections import deque
index = 'combined'
es_client.indices.delete(
    index   = index, 
    ignore  = 404)
es_client.indices.refresh()
es_client.indices.create(
    index       = 'combined' , 
    settings    = mapping['settings']       , 
    mappings    = mapping['mappings']       )

# play with chunk size parameter for timed out problem
deque(parallel_bulk(es_client, actions = final_json, index = 'combined' , max_chunk_bytes = 5 * 1024 * 1024), maxlen = 0)

es_client.indices.refresh()

{'_shards': {'total': 8, 'successful': 4, 'failed': 0}}