# EDA and ETL for data scraped from the IPM and AskExtension knowledge bases

In [None]:
import os
import re

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Use a single colour for general text, axis labels and tick labels in every
# figure drawn below (presumably chosen for a dark notebook theme - confirm).
# Each rcParams key is set exactly once.
COLOR = 'white'
plt.rcParams.update({
    'text.color': COLOR,
    'axes.labelcolor': COLOR,
    'xtick.color': COLOR,
    'ytick.color': COLOR,
})

# IPM data - December 2021 Scrape

In [None]:
# Directory listing recorded for the December 2021 scrape:
#   exoticPests.json
#   fruitItems_new.json
#   fruitVeggieEnvironItems_new.json
#   ipmdata_new.json
#   pestDiseaseItems_new.json
#   plantFlowerItems.json
#   turfPests.json
#   veggieItems_new.json
#   weedItems.json

# Location of the scraped JSON files, relative to this notebook.
_PATH = '../data/uc-ipm/updated-Dec2021/'

# Alphabetically ordered names of everything found in that directory.
DATA_FILE_NAMES = os.listdir(_PATH)
DATA_FILE_NAMES.sort()

## Pests - IPM data

In [None]:
# Read the pest-notes scrape into `df` and print its column summary.
FILE_NAME = 'ipmdata_new.json'
df = pd.read_json(f'{_PATH}{FILE_NAME}')
df.info()

In [None]:
# List every column in which fewer than half of the rows hold a non-empty
# value. `.str.len()` gives the length of each cell (strings and list-valued
# cells alike); missing cells give NaN, which fails `> 0` and counts as empty.
print('Field with less than 50% non-null entries:')
for c in df.columns:
    # Mean of the boolean mask is the fraction of non-empty rows. It is
    # computed once per column and is NaN (no ZeroDivisionError) for an
    # empty frame.
    filled_share = (df[c].str.len() > 0).mean()
    if filled_share < 0.5:
        print(f'{c:<20} - {filled_share * 100:.0f}%')

In [None]:
# Show five randomly drawn rows (no seed is set, so the rows differ per run).
df.sample(n = 5)

In [None]:
# One histogram per text column showing the distribution of `len(value)`,
# arranged on a 2x3 grid.
text_columns = ['name', 'descriptionPestNote', 'life_cycle', 'damagePestNote', 'managementPestNote', 'contentQuickTips']
fig, axes = plt.subplots(2, 3, figsize = (30, 10))
# `axes.flat` walks the grid row by row, so the n-th column lands on the n-th panel.
for panel, col in zip(axes.flat, text_columns):
    lengths = df[col].apply(len)
    lengths.hist(figure = fig, bins = 20, ax = panel)
    panel.set_title(col, fontdict = {'fontsize': 20})
plt.show()

### IPM Data - `ipmdata_new.json`

| column              | type                                  |
|---------------------|---------------------------------------|
| name                | string                                |
| urlPestNote         | string                                |
| descriptionPestNote | string                                |
| life_cycle          | string                                |
| damagePestNote      | string                                |
| managementPestNote  | string                                |
| imagePestNote       | [{link: " ", src: " ", caption: " "}] |
| tablePestNote       | [" ", " "]                            |
| urlQuickTip         | string                                |
| contentQuickTips    | string                                |
| imageQuickTips      | [{link: " ", src: " ", caption: " "}] |
| video               | [{videoLink: " ", videoTitle: " "}]   |

Example of the single JSON data entry:
```json
{
    "name": "Thrips",
    "urlPestNote": "http://ipm.ucanr.edu/PMG/PESTNOTES/pn7429.html?src=exchbt",
    "descriptionPestNote": "Thrips, order Thysanoptera, are tiny, slender insects with fringed wings....",
    "life_cycle": "Most adult thrips are elongate, slender, minute (less than 1/20 inch long), and h...",
    "damagePestNote": "Thrips feeding on plants can damage fruit, leaves, and shoots and very notice...",
    "managementPestNote": "Thrips are difficult to control. If management is necessary, use an integ...",
    "imagePestNote": [
        {
            "caption": "Stippling as result of greenhouse thrips feeding.",
            "link": "http://ipm.ucanr.edu/PMG/H/I-TS-HHAE-CD.013.html?src=exchbt",
            "src": "http://ipm.ucanr.edu/PMG/IMAGES/H/I-TS-HHAE-CD.013h.jpg?src=exchbt"
        },
        {
            "caption": "Black feces and white feeding scars from thrips.",
            "link": "http://ipm.ucanr.edu/PMG/F/I-TS-FOCC-CD.008.html?src=exchbt",
            "src": "http://ipm.ucanr.edu/PMG/IMAGES/F/I-TS-FOCC-CD.008h.jpg?src=exchbt"
        },
        ...
    ],
    "tablePestNote": [...],
    "urlQuickTip": "http://ipm.ucanr.edu/QT/thripscard.html?src=exchbt",
    "contentQuickTips": "When thrips feed, they distort or scar leaves, flowers, or fruit. Healthy w...",
    "imageQuickTips": [
        {
            "link": "http://ipm.ucanr.edu/PMG/F/I-TS-FOCC-AD.010.html?src=exchbt",
            "src": "http://ipm.ucanr.edu/PMG/IMAGES/F/I-TS-FOCC-AD.010h.jpg?src=exchbt",
            "caption": "Adult western flower thrips."
        },
        {
            "link": "http://ipm.ucanr.edu/PMG/H/I-TS-HHAE-CO.004.html?src=exchbt",
            "src": "http://ipm.ucanr.edu/PMG/IMAGES/H/I-TS-HHAE-CO.004h.jpg?src=exchbt",
            "caption": "Greenhouse thrips adults (black) and nymphs (yellow)."
        },
        ...
    ],
    "video": [
      {
        "videoLink": "https://www.youtube.com/watch?v=oXkp90opkjU?src=exchbt",
        "videoTitle": "Monitoring for Thrips in Avocado Orchards"
      },
      {
        "videoLink": "https://youtu.be/l2GnmTjQLp0?src=exchbt",
        "videoTitle": "UC Ag Experts Talk: Citrus Thrips"
      },
      ...
    ]
}
```
### Metadata on data source

Data about the pests. `descriptionPestNote` can be used to describe the pest. `damagePestNote` can be used to match damage descriptions. The target can be extracted from the main fields as well as from image and video captions. The following slots can be used for filtering - `plant_name`, `problem`, `problem_description`, `target`.
Notes:
* `descriptionPestNote`, `life_cycle`, `managementPestNote`, `contentQuickTips` - main fields.
* `imagePestNote/caption`, `imageQuickTips/caption`, `video/videoTitle` - additional fields.


## Pests - causing diseases

In [None]:
# Read the pest/disease scrape into `df` and print its column summary.
FILE_NAME = 'pestDiseaseItems_new.json'
df = pd.read_json(f'{_PATH}{FILE_NAME}')
df.info()

In [None]:
# List every column in which fewer than half of the rows hold a non-empty
# value. `.str.len()` gives the length of each cell (strings and list-valued
# cells alike); missing cells give NaN, which fails `> 0` and counts as empty.
print('Field with less than 50% non-null entries:')
for c in df.columns:
    # Mean of the boolean mask is the fraction of non-empty rows. It is
    # computed once per column and is NaN (no ZeroDivisionError) for an
    # empty frame.
    filled_share = (df[c].str.len() > 0).mean()
    if filled_share < 0.5:
        print(f'{c:<20} - {filled_share * 100:.0f}%')

In [None]:
# Show five randomly drawn rows (no seed is set, so the rows differ per run).
df.sample(n = 5)

In [None]:
# One histogram per text column showing the distribution of `len(value)`,
# arranged on a 2x3 grid.
text_columns = ['name', 'description', 'identification', 'life_cycle', 'damage', 'solutions']
fig, axes = plt.subplots(2, 3, figsize = (30, 10))
# `axes.flat` walks the grid row by row, so the n-th column lands on the n-th panel.
for panel, col in zip(axes.flat, text_columns):
    lengths = df[col].apply(len)
    lengths.hist(figure = fig, bins = 20, ax = panel)
    panel.set_title(col, fontdict = {'fontsize': 20})
plt.show()

### Pest Diseases - `pestDiseaseItems_new.json`

| column         | type                                  |
|----------------|---------------------------------------|
| name           | string                                |
| url            | string                                |
| description    | string                                |
| identification | string                                |
| life_cycle     | string                                |
| damage         | string                                |
| solutions      | string                                |
| images         | [{link: " ", src: " ", caption: " "}] |


Example of the single JSON data entry:

```json
{
    "name": "Stink bugs",
    "url": "http://ipm.ucanr.edu/PMG/GARDEN/VEGES/PESTS/stinkbug.html?src=exchbt",
    "description": "These sucking insects (family Pentatomidae) are shield shaped. Their common ...",
    "identification": "Adult stink bugs are distinguished from other insects by the large ...",
    "life_cycle": "Stink bugs develop through three life stages: egg, nymph, and adult. ...",
    "damage": "Adults suck and feed on plants with their strawlike mouthparts. Stink bugs ...",
    "solutions": "Handpick the bugs and their eggs from small plants. Eliminate groundcovers...",
    "images": [
        {
            "link": "http://ipm.ucanr.edu/PMG/E/I-HM-ECON-AD.015.html?src=exchbt",
            "src": "http://ipm.ucanr.edu/PMG/IMAGES/E/I-HM-ECON-AD.015a.jpg?src=exchbt",
            "caption": "Adults of southern green stink bug (left), redshouldered stink bug, (upper right) and consperse stink bug."
        },
        {
            "link": "http://ipm.ucanr.edu/PMG/E/I-HM-ECON-EG.001.html?src=exchbt",
            "src": "http://ipm.ucanr.edu/PMG/IMAGES/E/I-HM-ECON-EG.001b.jpg?src=exchbt",
            "caption": "First instars of consperse stink bug and their empty egg cases."
        },
        ...
    ]
},
```

### Metadata on data source

Information on pests.

## Pests - turf (grass)

In [None]:
# Read the turf-pest scrape into `df` and print its column summary.
FILE_NAME = 'turfPests.json'
df = pd.read_json(f'{_PATH}{FILE_NAME}')
df.info()

In [None]:
# List every column in which fewer than half of the rows hold a non-empty
# value. `.str.len()` gives the length of each cell (strings and list-valued
# cells alike); missing cells give NaN, which fails `> 0` and counts as empty.
print('Field with less than 50% non-null entries:')
for c in df.columns:
    # Mean of the boolean mask is the fraction of non-empty rows. It is
    # computed once per column and is NaN (no ZeroDivisionError) for an
    # empty frame.
    filled_share = (df[c].str.len() > 0).mean()
    if filled_share < 0.5:
        print(f'{c:<20} - {filled_share * 100:.0f}%')

In [None]:
# Show five randomly drawn rows (no seed is set, so the rows differ per run).
df.sample(n = 5)

In [None]:
# One histogram per text column showing the distribution of `len(value)`,
# drawn side by side on a single row of two panels.
text_columns = ['name', 'text']
fig, axes = plt.subplots(1, 2, figsize = (30, 5))
# Pair each column with its panel in left-to-right order.
for panel, col in zip(axes.flat, text_columns):
    lengths = df[col].apply(len)
    lengths.hist(figure = fig, bins = 20, ax = panel)
    panel.set_title(col, fontdict = {'fontsize': 20})
plt.show()

### Turf Pests - `turfPests.json`

| column | type                                  |
|--------|---------------------------------------|
| name   | string                                |
| url    | string                                |
| text   | string                                |
| images | [{link: " ", src: " ", caption: " "}] |

Example of the single JSON data entry:

```json
{
    "name": "Fiery skipper",
    "url": "http://ipm.ucanr.edu/TOOLS/TURF/PESTS/inskipper.html?src=exchbt",
    "text": "Identification Fiery skipper adults resemble butterflies and are 1 inch...",
    "images": [
        {
            "link": "http://ipm.ucanr.edu/PMG/H/I-LP-HPHY-AD.003.html?src=exchbt",
            "src": "http://ipm.ucanr.edu/TOOLS/TURF/IMAGES/PESTMANIM/infieryad.jpg?src=exchbt",
            "caption": "Fiery skipper adult"
        },
        {
            "link": "http://ipm.ucanr.edu/PMG/H/I-LP-HPHY-LV.009.html?src=exchbt",
            "src": "http://ipm.ucanr.edu/TOOLS/TURF/IMAGES/PESTMANIM/inskiplv.jpg?src=exchbt",
            "caption": "Skipper larva with dark head and thoracic shield"
        },
        ...
    ]
},
```
### Metadata on data source

Information on turf pests (grass).

## Pests - exotic types

In [None]:
# Read the exotic-pest scrape into `df` and print its column summary.
FILE_NAME = 'exoticPests.json'
df = pd.read_json(f'{_PATH}{FILE_NAME}')
df.info()

In [None]:
# List every column in which fewer than half of the rows hold a non-empty
# value. `.str.len()` gives the length of each cell (strings and list-valued
# cells alike); missing cells give NaN, which fails `> 0` and counts as empty.
print('Field with less than 50% non-null entries:')
for c in df.columns:
    # Mean of the boolean mask is the fraction of non-empty rows. It is
    # computed once per column and is NaN (no ZeroDivisionError) for an
    # empty frame.
    filled_share = (df[c].str.len() > 0).mean()
    if filled_share < 0.5:
        print(f'{c:<20} - {filled_share * 100:.0f}%')

In [None]:
# Show five randomly drawn rows (no seed is set, so the rows differ per run).
df.sample(n = 5)

In [None]:
# One histogram per text column showing the distribution of `len(value)`.
# Seven columns are placed on a 3x3 grid, so the last two panels have no data.
text_columns = ['name', 'description', 'damage', 'identification', 'life_cycle', 'monitoring', 'management']
fig, axes = plt.subplots(3, 3, figsize = (30, 15))
panels = axes.ravel()  # row-major order: panel n belongs to column n
for panel, col in zip(panels, text_columns):
    df[col].apply(len).hist(figure = fig, bins = 20, ax = panel)
    panel.set_title(col, fontdict = {'fontsize': 20})
# Hide the grid cells that received no histogram instead of rendering empty axes.
for unused_panel in panels[len(text_columns):]:
    unused_panel.set_visible(False)
plt.show()

### Exotic pests - `exoticPests.json`

| column         | type                                  |
|----------------|---------------------------------------|
| name           | string                                |
| url            | string                                |
| description    | string                                |
| damage         | string                                |
| identification | string                                |
| life_cycle     | string                                |
| monitoring     | string                                |
| management     | string                                |
| related_links  | [{text: " ", link: " "}]              |
| images         | [{link: " ", src: " ", caption: " "}] |

Example of the single JSON data entry:
```json
{
    "name": "European Grapevine Moth",
    "url": "https://www2.ipm.ucanr.edu/Invasive-and-Exotic-Pests/European-grapevine-moth/?src=exchbt",
    "description": "Lobesia botrana, the European grapevine moth, was first reported in the United ...",
    "damage": "In May and June, first-generation larvae web and feed on the flower clusters. Secon...",
    "identification": "The adult moth is approximately 0.24 to 0.3 inch (6-8 mm) long, with a wing...",
    "life_cycle": "European grapevine moth has two generations in its life cycle in northern Europ...",
    "monitoring": "Sex pheromone attracts males and is used to monitor male flights. Before bud br...",
    "management": "In countries where L. botrana is established, control measures are targeted at ...",
    "related_links": [
        {
        "text": "Grape pest management guidelines",
        "link": "http://ipm.ucanr.edu/PMG/selectnewpest.grapes.html?src=exchbt"
        },
        {
        "text": "Video presentation",
        "link": "http://stream.ucanr.org/ipm_ag_urban/evgm2011/?src=exchbt"
        }
    ],
    "images": [
        {
        "link": "http://ipm.ucanr.edu/PMG/L/I-LP-LBOT-AD.002.html?src=exchbt",
        "src": "http://ipm.ucanr.edu/PMG/IMAGES/L/I-LP-LBOT-AD.002h.jpg?src=exchbt",
        "caption": "Adult female European grapevine moth."
        },
        {
        "link": "http://ipm.ucanr.edu/PMG/L/I-LP-LBOT-CD.004.html?src=exchbt",
        "src": "http://ipm.ucanr.edu/PMG/IMAGES/L/I-LP-LBOT-CD.004h.jpg?src=exchbt",
        "caption": "Grape bunches with webbing, frass, and fungal infections."
        },
        ...
    ]
}
```

### Metadata on data source

Information on exotic pests.

Notes:
* `description`, `damage`, `identification`, `life_cycle`, `monitoring`, `management` - main fields.
* `related_links/text`, `images/caption` - additional fields.
    
Out of 15-20 sources, only 2 have the `description` field filled.

## Environmental damages - fruits and veggies

In [None]:
# Read the environmental-damage scrape into `df` and print its column summary.
FILE_NAME = 'fruitVeggieEnvironItems_new.json'
df = pd.read_json(f'{_PATH}{FILE_NAME}')
df.info()

In [None]:
# List every column in which fewer than half of the rows hold a non-empty
# value. `.str.len()` gives the length of each cell (strings and list-valued
# cells alike); missing cells give NaN, which fails `> 0` and counts as empty.
print('Field with less than 50% non-null entries:')
for c in df.columns:
    # Mean of the boolean mask is the fraction of non-empty rows. It is
    # computed once per column and is NaN (no ZeroDivisionError) for an
    # empty frame.
    filled_share = (df[c].str.len() > 0).mean()
    if filled_share < 0.5:
        print(f'{c:<20} - {filled_share * 100:.0f}%')

In [None]:
# Show five randomly drawn rows (no seed is set, so the rows differ per run).
df.sample(n = 5)

In [None]:
# One histogram per text column showing the distribution of `len(value)`,
# arranged on a 2x3 grid.
text_columns = ['name', 'description', 'identification', 'damage', 'disorder_development', 'solutions']
fig, axes = plt.subplots(2, 3, figsize = (30, 10))
# `axes.flat` walks the grid row by row, so the n-th column lands on the n-th panel.
for panel, col in zip(axes.flat, text_columns):
    lengths = df[col].apply(len)
    lengths.hist(figure = fig, bins = 20, ax = panel)
    panel.set_title(col, fontdict = {'fontsize': 20})
plt.show()

### Environment Fruit and Veggie - `fruitVeggieEnvironItems_new.json`

| column               | type                                  |
|----------------------|---------------------------------------|
| name                 | string                                |
| url                  | string                                |
| description          | string                                |
| identification       | string                                |
| damage               | string                                |
| disorder_development | string                                |
| solutions            | string                                |
| images               | [{link: " ", src: " ", caption: " "}] |

Example of the single JSON data entry:
```json
{
    "name": "Wind",
    "url": "http://ipm.ucanr.edu/PMG/GARDEN/ENVIRON/wind.html?src=exchbt",
    "description": "Wind can damage bark, flowers, foliage, fruit, and limbs of most any...",
    "identification": "Plants growing at windy sites often have smaller-than-normal leaves...",
    "damage": "Wind-damaged leaves become necrotic along the margins and tips and drop prema...",
    "disorder_development": "Wind commonly causes water deficit. If soil moisture is low, or w...",
    "solutions": "Provide plants with proper cultural care, especially appropriate irrigation...",
    "images": [
        {
            "link": "http://ipm.ucanr.edu/PMG/W/A-WO-WEAT-FS.002.html?src=exchbt",
            "src": "http://ipm.ucanr.edu/PMG/IMAGES/W/A-WO-WEAT-FS.002a.jpg?src=exchbt",
            "caption": "Wind-sculptured cypress"
        },
        {
            "link": "http://ipm.ucanr.edu/PMG/W/A-WO-WEAT-FO.041.html?src=exchbt",
            "src": "http://ipm.ucanr.edu/PMG/IMAGES/W/A-WO-WEAT-FO.041a.jpg?src=exchbt",
            "caption": "Necrotic, wind-tattered leaves"
        },
        ...
    ]
}
```

### Metadata on data source

Environment-caused problems on veggies and fruits. `problem_description` can be used to find matches.

Notes:
* `name`, `description`, `identification`, `damage`, `disorder_development`, `solutions` - main fields
* `images/caption` - additional fields

## Environmental damages - weeds

In [None]:
# Read the weed scrape into `df` and print its column summary.
FILE_NAME = 'weedItems.json'
df = pd.read_json(f'{_PATH}{FILE_NAME}')
df.info()

In [None]:
# List every column in which fewer than half of the rows hold a non-empty
# value. `.str.len()` gives the length of each cell (strings and list-valued
# cells alike); missing cells give NaN, which fails `> 0` and counts as empty.
print('Field with less than 50% non-null entries:')
for c in df.columns:
    # Mean of the boolean mask is the fraction of non-empty rows. It is
    # computed once per column and is NaN (no ZeroDivisionError) for an
    # empty frame.
    filled_share = (df[c].str.len() > 0).mean()
    if filled_share < 0.5:
        print(f'{c:<20} - {filled_share * 100:.0f}%')

In [None]:
# Show five randomly drawn rows (no seed is set, so the rows differ per run).
df.sample(n = 5)

In [None]:
# One histogram per text column showing the distribution of `len(value)`,
# drawn side by side on a single row of two panels.
text_columns = ['name', 'description']
fig, axes = plt.subplots(1, 2, figsize = (30, 5))
# Pair each column with its panel in left-to-right order.
for panel, col in zip(axes.flat, text_columns):
    lengths = df[col].apply(len)
    lengths.hist(figure = fig, bins = 20, ax = panel)
    panel.set_title(col, fontdict = {'fontsize': 20})
plt.show()

### Weed - `weedItems.json`

| column      | type                        |
|-------------|-----------------------------|
| name        | string                      |
| url         | string                      |
| description | string                      |
| images      | [{link: " ", caption: " "}] |

Example of the single JSON data entry:
```json
{
    "name": "Gregg arrowhead",
    "url": "http://ipm.ucanr.edu/PMG/WEEDS/gregg_arrowhead.html?src=exchbt",
    "description": "Gregg arrowhead is a native aquatic perennial that occurs in the...",
    "images": [
        {
            "link": "http://ipm.ucanr.edu/PMG/S/W-AL-SMON-MP.003.html?src=exchbt",
            "caption": "top picture"
        },
        {
            "link": "http://ipm.ucanr.edu/PMG/S/W-AL-SLON-SG.001.html?src=exchbt",
            "caption": "bottom left picture"
        },
        ...
    ]
}
```

### Metadata on data source

Description of weeds (problematic plants). A weed is any plant growing in cultivated ground to the injury of the crop or desired vegetation.

## Information - fruits

In [None]:
# Read the fruit scrape into `df` and print its column summary.
FILE_NAME = 'fruitItems_new.json'
df = pd.read_json(f'{_PATH}{FILE_NAME}')
df.info()

In [None]:
# List every column in which fewer than half of the rows hold a non-empty
# value. `.str.len()` gives the length of each cell (strings and list-valued
# cells alike); missing cells give NaN, which fails `> 0` and counts as empty.
print('Field with less than 50% non-null entries:')
for c in df.columns:
    # Mean of the boolean mask is the fraction of non-empty rows. It is
    # computed once per column and is NaN (no ZeroDivisionError) for an
    # empty frame.
    filled_share = (df[c].str.len() > 0).mean()
    if filled_share < 0.5:
        print(f'{c:<20} - {filled_share * 100:.0f}%')

In [None]:
# Show five randomly drawn rows (no seed is set, so the rows differ per run).
df.sample(n = 5)

### Fruits - `fruitItems_new.json`

| column              | type                      |
|---------------------|---------------------------|
| name                | string                    |
| url                 | string                    |
| cultural_tips       | [{tip: "", link: ""}]     |
| pests_and_disorders | [{problem: "", link: ""}] |

Example of the single JSON data entry:
```json
{
    "name": "Figs",
    "url": "http://ipm.ucanr.edu/PMG/GARDEN/FRUIT/figs.html?src=exchbt",
    "cultural_tips": [
        {
            "tip": "Fertilizing",
            "link": "http://ipm.ucanr.edu/PMG/GARDEN/FRUIT/CULTURAL/fgfertilizing.html?src=exchbt"
        },
        {
            "tip": "First-year pruning",
            "link": "http://ipm.ucanr.edu/PMG/GARDEN/FRUIT/CULTURAL/almondfirst.html?src=exchbt"
        },
        ...
    ],
    "pests_and_disorders": [
        {
            "problem": "Ants",
            "link": "http://ipm.ucanr.edu/PMG/PESTNOTES/pn7411.html?src=exchbt"
        },
        {
            "problem": "Carpenterworm",
            "link": "http://ipm.ucanr.edu/PMG/PESTNOTES/pn74105.html?src=exchbt"
        },
        ...
    ]
}
```
### Metadata on data source

Information on fruits. The name, together with the cultural tips and problems, can be used to match search queries. The following slots can be used - `plant_name`, `problem`.

Notes:
* Concatenate `name` and `cultural_tips/tip` - additional field.
* Concatenate `name` and `pests_and_disorders` - additional field.

## Information - veggies

In [None]:
# Read the vegetable scrape into `df` and print its column summary.
FILE_NAME = 'veggieItems_new.json'
df = pd.read_json(f'{_PATH}{FILE_NAME}')
df.info()

In [None]:
# List every column in which fewer than half of the rows hold a non-empty
# value. `.str.len()` gives the length of each cell (strings and list-valued
# cells alike); missing cells give NaN, which fails `> 0` and counts as empty.
print('Field with less than 50% non-null entries:')
for c in df.columns:
    # Mean of the boolean mask is the fraction of non-empty rows. It is
    # computed once per column and is NaN (no ZeroDivisionError) for an
    # empty frame.
    filled_share = (df[c].str.len() > 0).mean()
    if filled_share < 0.5:
        print(f'{c:<20} - {filled_share * 100:.0f}%')

In [None]:
# Show five randomly drawn rows (no seed is set, so the rows differ per run).
df.sample(n = 5)

In [None]:
# One histogram per text column showing the distribution of `len(value)`,
# drawn side by side on a single row of three panels.
text_columns = ['name', 'description', 'tips']
fig, axes = plt.subplots(1, 3, figsize = (30, 5))
# Pair each column with its panel in left-to-right order.
for panel, col in zip(axes.flat, text_columns):
    lengths = df[col].apply(len)
    lengths.hist(figure = fig, bins = 20, ax = panel)
    panel.set_title(col, fontdict = {'fontsize': 20})
plt.show()

### Veggie - `veggieItems_new.json`

| column              | type                                  |
|---------------------|---------------------------------------|
| name                | string                                |
| url                 | string                                |
| description         | string                                |
| tips                | string                                |
| images              | [{link: " ", src: " ", caption: " "}] |
| pests_and_disorders | [{problem: "", link: ""}]             |

Example of the single JSON data entry:
```json
{
    "name": "Carrot ",
    "url": "http://ipm.ucanr.edu/home-and-landscape/carrot/index.html?src=exchbt",
    "description": "Carrots can be grown nearly year-round throughout California. A number of vari...",
    "tips": "Cultural practices such as proper site selection, soil preparation, planting, and wat...",
    "images": [
        {
            "link": "http://ipm.ucanr.edu/PMG/P/D-CA-PVIO-RO.002.html?src=exchbt",
            "src": "http://ipm.ucanr.edu/PMG/IMAGES/P/D-CA-PVIO-RO.002.jpg?src=exchbt",
            "caption": "Cavity spot-induced, irregularly shaped lesions across mature carrot tap roots."
        },
        ...
    ],
    "pests_and_disorders": [
        {
            "problem": "Aphids",
            "link": "http://ipm.ucanr.edu/PMG/PESTNOTES/pn7404.html?src=exchbt"
        },
        {
            "problem": "Carrot rust fly",
            "link": "http://ipm.ucanr.edu/PMG/GARDEN/VEGES/PESTS/carrotrustfly.html?src=exchbt"
        },
        ...
    ]
}
```

### Metadata on data source

Information on vegetables. `plant_name`, and `problem` can be used to search this data source.

Notes:
* `name`, `description`, `tips` - main fields.
* `images/caption` - additional field.
* concatenate `name` + `pests_and_disorders` -  additional field.


## Information - flowers

In [None]:
# Read the flower scrape into `df` and print its column summary.
FILE_NAME = 'plantFlowerItems.json'
df = pd.read_json(f'{_PATH}{FILE_NAME}')
df.info()

In [None]:
# List every column in which fewer than half of the rows hold a non-empty
# value. `.str.len()` gives the length of each cell (strings and list-valued
# cells alike); missing cells give NaN, which fails `> 0` and counts as empty.
print('Field with less than 50% non-null entries:')
for c in df.columns:
    # Mean of the boolean mask is the fraction of non-empty rows. It is
    # computed once per column and is NaN (no ZeroDivisionError) for an
    # empty frame.
    filled_share = (df[c].str.len() > 0).mean()
    if filled_share < 0.5:
        print(f'{c:<20} - {filled_share * 100:.0f}%')

In [None]:
# Show five randomly drawn rows (no seed is set, so the rows differ per run).
df.sample(n = 5)

In [None]:
# One histogram per text column showing the distribution of `len(value)`,
# drawn side by side on a single row of three panels.
text_columns = ['name', 'identification', 'optimum_conditions']
fig, axes = plt.subplots(1, 3, figsize = (30, 5))
# Pair each column with its panel in left-to-right order.
for panel, col in zip(axes.flat, text_columns):
    lengths = df[col].apply(len)
    lengths.hist(figure = fig, bins = 20, ax = panel)
    panel.set_title(col, fontdict = {'fontsize': 20})
plt.show()

### Flowers - `plantFlowerItems.json`

| column              | type                                  |
|---------------------|---------------------------------------|
| name                | string                                |
| url                 | string                                |
| identification      | string                                |
| optimum_conditions  | string                                |
| pests_and_disorders | [{problem: " ", link: " "}]           |
| images              | [{link: " ", src: " ", caption: " "}] |

Example of the single JSON data entry:
```json
{
    "name": "Abelia",
    "url": "http://ipm.ucanr.edu/PMG/GARDEN/PLANTS/abelia.html?src=exchbt",
    "identification": "Abelias are evergreen or semievergreen deciduous shrubs. Leaves are ...",
    "optimum_conditions": "Abelias can be planted as borders or barriers. Varieties that ...",
    "pests_and_disorders": [
        {
            "problem": "Root knot nematodes",
            "link": "http://ipm.ucanr.edu/PMG/PESTNOTES/pn7489.html?src=exchbt"
        },
        {
            "problem": "Mineral deficiencies",
            "link": "http://ipm.ucanr.edu/PMG/GARDEN/ENVIRON/mineraldef.html?src=exchbt"
        },
        ...
    ],
    "images": [
        {
            "link": "http://ipm.ucanr.edu/PMG/C/S-WO-CAPR-FL.017.html?src=exchbt",
            "src": "http://ipm.ucanr.edu/PMG/IMAGES/C/S-WO-CAPR-FL.017a.jpg?src=exchbt",
            "caption": "Abelia flowers"
        },
        {
            "link": "http://ipm.ucanr.edu/PMG/C/S-WO-CAPR-FO.011.html?src=exchbt",
            "src": "http://ipm.ucanr.edu/PMG/IMAGES/C/S-WO-CAPR-FO.011a.jpg?src=exchbt",
            "caption": "LEaves of Abelia"
        }
    ]
}
```
### Metadata on data source

Description of the flowers.

* `name` and `pests_and_disorders` - main fields
* `images/caption` - additional fields


## Final transformations


There is going to be a single merged index consisting of the following:

### `Problems` index: 
```json
[
    {
        "source": "PestsIPM",
        "name": "Thrips",                                                                                           # title
        "urlPestNote": "http://ipm.ucanr.edu/PMG/PESTNOTES/pn7429.html?src=exchbt",                                 # url
        "descriptionPestNote": "Thrips, order Thysanoptera, are tiny, slender insects with fringed wings...",       # description
        "life_cycle": "Most adult thrips are elongate, slender, minute (less than 1/20 inch long), and h...",       # development
        "damagePestNote": "Thrips feeding on plants can damage fruit, leaves, and shoots and very notice...",       # damage
        "managementPestNote": "Thrips are difficult to control. If management is necessary, use an integ...",       # management
        "imagePestNote": [                                                                                          # links (type - images)
            {
                "caption": "Stippling as result of greenhouse thrips feeding.",
                "link": "http://ipm.ucanr.edu/PMG/H/I-TS-HHAE-CD.013.html?src=exchbt",
                "src": "http://ipm.ucanr.edu/PMG/IMAGES/H/I-TS-HHAE-CD.013h.jpg?src=exchbt"
            },
            {
                "caption": "Black feces and white feeding scars from thrips.",
                "link": "http://ipm.ucanr.edu/PMG/F/I-TS-FOCC-CD.008.html?src=exchbt",
                "src": "http://ipm.ucanr.edu/PMG/IMAGES/F/I-TS-FOCC-CD.008h.jpg?src=exchbt"
            },
            ...
        ],
        "tablePestNote": [...],                                                                                     # omit
        "urlQuickTip": "http://ipm.ucanr.edu/QT/thripscard.html?src=exchbt",                                        # fill main URL if absent
        "contentQuickTips": "When thrips feed, they distort or scar leaves, flowers, or fruit. Healthy w...",       # fill description, damage, management if absent
        "imageQuickTips": [                                                                                         # merge with links (link type - images)
            {
                "link": "http://ipm.ucanr.edu/PMG/F/I-TS-FOCC-AD.010.html?src=exchbt",
                "src": "http://ipm.ucanr.edu/PMG/IMAGES/F/I-TS-FOCC-AD.010h.jpg?src=exchbt",
                "caption": "Adult western flower thrips."
            },
            {
                "link": "http://ipm.ucanr.edu/PMG/H/I-TS-HHAE-CO.004.html?src=exchbt",
                "src": "http://ipm.ucanr.edu/PMG/IMAGES/H/I-TS-HHAE-CO.004h.jpg?src=exchbt",
                "caption": "Greenhouse thrips adults (black) and nymphs (yellow)."
            },
            ...
        ],
        "video": [                                                                                                  # merge with images (link type - video)
            {
                "videoLink": "https://www.youtube.com/watch?v=oXkp90opkjU?src=exchbt",
                "videoTitle": "Monitoring for Thrips in Avocado Orchards"
            },
            {
                "videoLink": "https://youtu.be/l2GnmTjQLp0?src=exchbt",
                "videoTitle": "UC Ag Experts Talk: Citrus Thrips"
            },
            ...
        ]
    },
    {
        "source": "PestsDiseases",
        "name": "Stink bugs",                                                                                       # title
        "url": "http://ipm.ucanr.edu/PMG/GARDEN/VEGES/PESTS/stinkbug.html?src=exchbt",                              # url
        "description": "These sucking insects (family Pentatomidae) are shield shaped. The...",                     # description
        "identification": "Adult stink bugs are distinguished from other insects by the la...",                     # identification
        "life_cycle": "Stink bugs develop through three life stages: egg, nymph, and adult...",                     # development
        "damage": "Adults suck and feed on plants with their strawlike mouthparts. Stink b...",                     # damage
        "solutions": "Handpick the bugs and their eggs from small plants. Eliminate ground...",                     # management
        "images": [                                                                                                 # links (type - images)
            {
                "link": "http://ipm.ucanr.edu/PMG/E/I-HM-ECON-AD.015.html?src=exchbt",
                "src": "http://ipm.ucanr.edu/PMG/IMAGES/E/I-HM-ECON-AD.015a.jpg?src=exchbt",
                "caption": "Adults of southern green stink bug (left), redshouldered stink..."
            },
            {
                "link": "http://ipm.ucanr.edu/PMG/E/I-HM-ECON-EG.001.html?src=exchbt",
                "src": "http://ipm.ucanr.edu/PMG/IMAGES/E/I-HM-ECON-EG.001b.jpg?src=exchbt",
                "caption": "First instars of consperse stink bug and their empty egg cases."
            },
            ...
        ]
    },
    {
        "source": "PestsTurf",
        "name": "Fiery skipper",                                                                                    # title
        "url": "http://ipm.ucanr.edu/TOOLS/TURF/PESTS/inskipper.html?src=exchbt",                                   # url
        "text": "Identification Fiery skipper adults resemble butterflies and are 1 inch...",                       # description
        "images": [                                                                                                 # links (type - images)
            {
                "link": "http://ipm.ucanr.edu/PMG/H/I-LP-HPHY-AD.003.html?src=exchbt",
                "src": "http://ipm.ucanr.edu/TOOLS/TURF/IMAGES/PESTMANIM/infieryad.jpg?src=exchbt",
                "caption": "Fiery skipper adult"
            },
            {
                "link": "http://ipm.ucanr.edu/PMG/H/I-LP-HPHY-LV.009.html?src=exchbt",
                "src": "http://ipm.ucanr.edu/TOOLS/TURF/IMAGES/PESTMANIM/inskiplv.jpg?src=exchbt",
                "caption": "Skipper larva with dark head and thoracic shield"
            },
            ...
        ]
    },
    {
        "source": "PestsExotic",
        "name": "European Grapevine Moth",                                                                          # title
        "url": "https://www2.ipm.ucanr.edu/Invasive-and-Exotic-Pests/European-grapevine-moth/?src=exchbt",          # url
        "description": "Lobesia botrana, the European grapevine moth, was first reported in the United...",         # description
        "damage": "In May and June, first-generation larvae web and feed on the flower clusters. Secon...",         # damage
        "identification": "The adult moth is approximately 0.24 to 0.3 inch (6-8 mm) long, with a wing...",         # identification
        "life_cycle": "European grapevine moth has two generations in its life cycle in northern Europ...",         # development
        "monitoring": "Sex pheromone attracts males and is used to monitor male flights. Before bud br...",         # monitoring (can be omitted - only 3%)
        "management": "In countries where L. botrana is established, control measures are targeted at ...",         # management
        "related_links": [                                                                                          # links (type - page)
            {
            "text": "Grape pest management guidelines",
            "link": "http://ipm.ucanr.edu/PMG/selectnewpest.grapes.html?src=exchbt"
            },
            {
            "text": "Video presentation",
            "link": "http://stream.ucanr.org/ipm_ag_urban/evgm2011/?src=exchbt"
            },
            ...
            ],
        "images": [                                                                                                 # merge with links (type - images)
            {
            "link": "http://ipm.ucanr.edu/PMG/L/I-LP-LBOT-AD.002.html?src=exchbt",
            "src": "http://ipm.ucanr.edu/PMG/IMAGES/L/I-LP-LBOT-AD.002h.jpg?src=exchbt",
            "caption": "Adult female European grapevine moth."
            },
            {
            "link": "http://ipm.ucanr.edu/PMG/L/I-LP-LBOT-CD.004.html?src=exchbt",
            "src": "http://ipm.ucanr.edu/PMG/IMAGES/L/I-LP-LBOT-CD.004h.jpg?src=exchbt",
            "caption": "Grape bunches with webbing, frass, and fungal infections."
            },
            ...
        ]
    },
    {
        "source": "DamagesEnvironment",
        "name": "Wind",                                                                                             # title
        "url": "http://ipm.ucanr.edu/PMG/GARDEN/ENVIRON/wind.html?src=exchbt",                                      # url
        "description": "Wind can damage bark, flowers, foliage, fruit, and limbs of most any...",                   # description
        "identification": "Plants growing at windy sites often have smaller-than-normal leav...",                   # identification
        "damage": "Wind-damaged leaves become necrotic along the margins and tips and drop p...",                   # damage
        "disorder_development": "Wind commonly causes water deficit. If soil moisture is low...",                   # development
        "solutions": "Provide plants with proper cultural care, especially appropriate irrig...",                   # management
        "images": [                                                                                                 # links (type - images)
            {
                "link": "http://ipm.ucanr.edu/PMG/W/A-WO-WEAT-FS.002.html?src=exchbt",
                "src": "http://ipm.ucanr.edu/PMG/IMAGES/W/A-WO-WEAT-FS.002a.jpg?src=exchbt",
                "caption": "Wind-sculptured cypress"
            },
            {
                "link": "http://ipm.ucanr.edu/PMG/W/A-WO-WEAT-FO.041.html?src=exchbt",
                "src": "http://ipm.ucanr.edu/PMG/IMAGES/W/A-WO-WEAT-FO.041a.jpg?src=exchbt",
                "caption": "Necrotic, wind-tattered leaves"
            },
            ...
        ]
    },
    {
        "source": "DamagesWeed",
        "name": "Gregg arrowhead",                                                                                  # title
        "url": "http://ipm.ucanr.edu/PMG/WEEDS/gregg_arrowhead.html?src=exchbt",                                    # url
        "description": "Gregg arrowhead is a native aquatic perennial that occurs in the...",                       # description
        "images": [                                                                                                 # links (type - images)
            {
                "link": "http://ipm.ucanr.edu/PMG/S/W-AL-SMON-MP.003.html?src=exchbt",
                "caption": "top picture"
            },
            {
                "link": "http://ipm.ucanr.edu/PMG/S/W-AL-SLON-SG.001.html?src=exchbt",
                "caption": "bottom left picture"
            },
            ...
        ]
    }
]
```

### `Information` index:
```json
[
    {
        "source": "Fruits",
        "name": "Figs",                                                                                             # title
        "url": "http://ipm.ucanr.edu/PMG/GARDEN/FRUIT/figs.html?src=exchbt",                                        # url
        "cultural_tips": [                                                                                          # links (type - tips)
            {
                "tip": "Fertilizing",
                "link": "http://ipm.ucanr.edu/PMG/GARDEN/FRUIT/CULTURAL/fgfertilizing.html?src=exchbt"
            },
            {
                "tip": "First-year pruning",
                "link": "http://ipm.ucanr.edu/PMG/GARDEN/FRUIT/CULTURAL/almondfirst.html?src=exchbt"
            },
            ...
        ],
        "pests_and_disorders": [                                                                                    # merge with links (type - problems)
            {
                "problem": "Ants",
                "link": "http://ipm.ucanr.edu/PMG/PESTNOTES/pn7411.html?src=exchbt"
            },
            {
                "problem": "Carpenterworm",
                "link": "http://ipm.ucanr.edu/PMG/PESTNOTES/pn74105.html?src=exchbt"
            },
            ...
        ]
    },
    {
        "source": "Veggies",
        "name": "Carrot ",                                                                                          # title
        "url": "http://ipm.ucanr.edu/home-and-landscape/carrot/index.html?src=exchbt",                              # url
        "description": "Carrots can be grown nearly year-round throughout California. A number of vari...",         # description
        "tips": "Cultural practices such as proper site selection, soil preparation, planting, and wat...",         # management
        "images": [                                                                                                 # links (type - images)
            {
                "link": "http://ipm.ucanr.edu/PMG/P/D-CA-PVIO-RO.002.html?src=exchbt",
                "src": "http://ipm.ucanr.edu/PMG/IMAGES/P/D-CA-PVIO-RO.002.jpg?src=exchbt",
                "caption": "Cavity spot-induced, irregularly shaped lesions across mature carrot tap r..."
            },
            ...
        ],
        "pests_and_disorders": [                                                                                    # merge with links (type - problems)
            {
                "problem": "Aphids",
                "link": "http://ipm.ucanr.edu/PMG/PESTNOTES/pn7404.html?src=exchbt"
            },
            {
                "problem": "Carrot rust fly",
                "link": "http://ipm.ucanr.edu/PMG/GARDEN/VEGES/PESTS/carrotrustfly.html?src=exchbt"
            },
            ...
        ]
    },
    {
        "source": "Flowers",
        "name": "Abelia",                                                                                           # title
        "url": "http://ipm.ucanr.edu/PMG/GARDEN/PLANTS/abelia.html?src=exchbt",                                     # url
        "identification": "Abelias are evergreen or semievergreen deciduous shrubs. Leaves are ...",                # description
        "optimum_conditions": "Abelias can be planted as borders or barriers. Varieties that ...",                  # management
        "pests_and_disorders": [                                                                                    # merge with links (type - problems)
            {
                "problem": "Root knot nematodes",
                "link": "http://ipm.ucanr.edu/PMG/PESTNOTES/pn7489.html?src=exchbt"
            },
            {
                "problem": "Mineral deficiencies",
                "link": "http://ipm.ucanr.edu/PMG/GARDEN/ENVIRON/mineraldef.html?src=exchbt"
            },
            ...
        ],
        "images": [                                                                                                 # links (type - images)
            {
                "link": "http://ipm.ucanr.edu/PMG/C/S-WO-CAPR-FL.017.html?src=exchbt",
                "src": "http://ipm.ucanr.edu/PMG/IMAGES/C/S-WO-CAPR-FL.017a.jpg?src=exchbt",
                "caption": "Abelia flowers"
            },
            {
                "link": "http://ipm.ucanr.edu/PMG/C/S-WO-CAPR-FO.011.html?src=exchbt",
                "src": "http://ipm.ucanr.edu/PMG/IMAGES/C/S-WO-CAPR-FO.011a.jpg?src=exchbt",
                "caption": "LEaves of Abelia"
            }
        ]
    }
]
```


### Final mappings of 2 indexes

`Combined` index:
```json
{
    "source"        : "pestsIPM/pestsDiseases/pestsTurf/pestsExotic/damagesEnvironment/damagesWeed/infoFruits/infoVeggies/infoFlowers",
    "name"          : "text",
    "url"           : "url",
    "description"   : "text",
    "identification": "text",
    "development"   : "text",
    "damage"        : "text",
    "management"    : "text",
    "links": [
        {
            "type"      : "image/video/page/tips/problem",
            "title"     : "...",
            "src"       : "urlSource",
            "link"      : "urlAdditional"
        },
        ...
    ]
    # columns from AskExtension
    # source    : askextension
    # title     : name
    # question  : description
    # tags      : links (type - tag)
    # answers   : links (type - answer)
}
```

## ETL of data

In [None]:
# Accumulator for the combined dataset; starts out empty.
finalDf = pd.DataFrame()
# Columns of the combined schema, in output order.
cols = [
    'source',
    'url',
    'title',
    'description',
    'identification',
    'development',
    'damage',
    'management',
    'links',
]

def pestsIPM():
    '''
    Load `ipmdata_new.json` and reshape it into the combined schema.

    columns in source:
    name urlPestNote descriptionPestNote life_cycle damagePestNote managementPestNote imagePestNote tablePestNote urlQuickTip contentQuickTips imageQuickTips video
    final schema:
    source url title description identification development damage management links

    Rows whose pest-note url/description is an empty string fall back to the
    quick-tip url/content. Pest-note images, quick-tip images and videos are
    merged, in that order, into a single `links` list per row.

    Returns:
        DataFrame restricted to the module-level `cols`.
    '''
    # -------------------------------------------- Pests IPM
    print('Merging pests IPM...')
    df = pd.read_json(_PATH + 'ipmdata_new.json')

    df['source']            = 'pestsIPM'
    df['identification']    = ''    # no identification column in this source

    df.rename(columns = {
        'urlPestNote'           : 'url'         ,
        'name'                  : 'title'       ,
        'descriptionPestNote'   : 'description' ,
        'life_cycle'            : 'development' ,
        'damagePestNote'        : 'damage'      ,
        'managementPestNote'    : 'management'  ,
        'imagePestNote'         : 'links'       ,
    }, inplace = True)

    # Fall back to the quick-tip value wherever the pest-note value is empty.
    for target, fallback in (('url', 'urlQuickTip'), ('description', 'contentQuickTips')):
        empty = df[target] == ''
        df.loc[empty, target] = df.loc[empty, fallback]

    def _image(item):
        return {
            'type'  : 'image'           ,
            'src'   : item['src']       ,
            'link'  : item['link']      ,
            'title' : item['caption']   ,
        }

    def _video(item):
        return {
            'type'  : 'video'               ,
            'src'   : item['videoLink']     ,
            'link'  : ''                    ,
            'title' : item['videoTitle']    ,
        }

    # Build every row's link list as a new object. The previous version
    # extended the lists in place from inside DataFrame.apply and discarded
    # the result; pandas does not support mutating data from an applied function.
    df['links'] = [
        [_image(i) for i in notes] + [_image(i) for i in tips] + [_video(v) for v in videos]
        for notes, tips, videos in zip(df['links'], df['imageQuickTips'], df['video'])
    ]

    return df[cols]

# Transform the pests IPM source and append its rows to the combined dataframe.
df = pestsIPM()
finalDf = pd.concat(objs = [finalDf, df], axis = 0, ignore_index = True)
print('Final dataframe shape: ' + str(finalDf.shape))
print('------------------------------------------------')


def pestsDiseases():
    '''
    Load `pestDiseaseItems_new.json` and reshape it into the combined schema.

    columns in source:
    name url description identification life_cycle damage solutions images
    final schema:
    source url title description identification development damage management links

    `url`, `description`, `identification` and `damage` already carry their
    final names; each image becomes a link of type 'image'.

    Returns:
        DataFrame restricted to the module-level `cols`.
    '''
    # -------------------------------------------- Pests diseases
    print('Merging pests diseases...')
    df = pd.read_json(_PATH + 'pestDiseaseItems_new.json')

    df['source'] = 'pestsDiseases'

    # The former 'damagePestNote' entry was dropped: that column belongs to
    # the pests IPM file, so here it never matched anything.
    df.rename(columns = {
        'name'          : 'title'       ,
        'life_cycle'    : 'development' ,
        'solutions'     : 'management'  ,
        'images'        : 'links'       ,
    }, inplace = True)

    df['links'] = df['links'].apply(lambda images: [
        {
            'type'  : 'image'           ,
            'src'   : image['src']      ,
            'link'  : image['link']     ,
            'title' : image['caption']  ,
        } for image in images])

    return df[cols]

# Transform the pests diseases source and append its rows to the combined dataframe.
df = pestsDiseases()
finalDf = pd.concat(objs = [finalDf, df], axis = 0, ignore_index = True)
print('Final dataframe shape: ' + str(finalDf.shape))
print('------------------------------------------------')


def pestsTurf():
    '''
    Load `turfPests.json` and reshape it into the combined schema.

    columns in source:
    name url text images
    final schema:
    source url title description identification development damage management links

    identification, development, damage and management are filled with
    empty strings; each image becomes a link of type 'image'.

    Returns:
        DataFrame restricted to the module-level `cols`.
    '''
    # -------------------------------------------- Turf pests
    print('Merging turf pests...')
    source_file = 'turfPests.json'
    df = pd.read_json(_PATH + source_file)

    df['source'] = 'pestsTurf'
    for blank_column in ('identification', 'development', 'damage', 'management'):
        df[blank_column] = ''

    column_mapping = {
        'name'  : 'title',
        'text'  : 'description',
        'images': 'links',
    }
    df.rename(columns = column_mapping, inplace = True)

    def _image_links(images):
        converted = []
        for image in images:
            converted.append({
                'type'  : 'image',
                'src'   : image['src'],
                'link'  : image['link'],
                'title' : image['caption'],
            })
        return converted

    df['links'] = df['links'].apply(_image_links)

    return df[cols]

# Transform the turf pests source and append its rows to the combined dataframe.
df = pestsTurf()
finalDf = pd.concat(objs = [finalDf, df], axis = 0, ignore_index = True)
print('Final dataframe shape: ' + str(finalDf.shape))
print('------------------------------------------------')


def pestsExotic():
    '''
    Load `exoticPests.json` and reshape it into the combined schema.

    columns in source:
    name url description damage identification life_cycle monitoring management related_links images
    final schema:
    source url title description identification development damage management links

    `monitoring` is not part of the final schema and is dropped. Images and
    related pages are merged, in that order, into one `links` list per row.
    A row whose `images` or `related_links` value is not a list contributes
    no links of that kind.

    Returns:
        DataFrame restricted to the module-level `cols`.
    '''
    # -------------------------------------------- Exotic pests
    print('Merging exotic pests...')
    df = pd.read_json(_PATH + 'exoticPests.json')

    df['source'] = 'pestsExotic'

    df.rename(columns = {
        'name'      : 'title'       ,
        'life_cycle': 'development' ,
        'images'    : 'links'       ,
    }, inplace = True)

    def _as_list(value):
        # Anything that is not a list (e.g. NaN for a missing key) means "no entries".
        return value if isinstance(value, list) else []

    def _image(item):
        return {
            'type'  : 'image'           ,
            'src'   : item['src']       ,
            'link'  : item['link']      ,
            'title' : item['caption']   ,
        }

    def _page(item):
        return {
            'type'  : 'page'        ,
            'src'   : item['link']  ,
            'link'  : ''            ,
            'title' : item['text']  ,
        }

    # Build every row's link list as a new object instead of extending lists
    # in place from inside DataFrame.apply, which pandas does not support.
    # `related_links` now gets the same non-list guard `images` already had.
    df['links'] = [
        [_image(i) for i in _as_list(images)] + [_page(p) for p in _as_list(related)]
        for images, related in zip(df['links'], df['related_links'])
    ]

    return df[cols]

# Transform the exotic pests source and append its rows to the combined dataframe.
df = pestsExotic()
finalDf = pd.concat(objs = [finalDf, df], axis = 0, ignore_index = True)
print('Final dataframe shape: ' + str(finalDf.shape))
print('------------------------------------------------')


def damagesEnvironment():
    '''
    Load `fruitVeggieEnvironItems_new.json` and reshape it into the combined schema.

    columns in source:
    name url description identification damage disorder_development solutions images
    final schema:
    source url title description identification development damage management links

    Each image becomes a link of type 'image'.

    Returns:
        DataFrame restricted to the module-level `cols`.
    '''
    # -------------------------------------------- Fruit and veggie damages
    print('Merging fruit and veggie damages...')
    source_file = 'fruitVeggieEnvironItems_new.json'
    df = pd.read_json(_PATH + source_file)

    df['source'] = 'damagesEnvironment'

    column_mapping = {
        'name'                  : 'title',
        'disorder_development'  : 'development',
        'solutions'             : 'management',
        'images'                : 'links',
    }
    df.rename(columns = column_mapping, inplace = True)

    def _image_links(images):
        converted = []
        for image in images:
            converted.append({
                'type'  : 'image',
                'src'   : image['src'],
                'link'  : image['link'],
                'title' : image['caption'],
            })
        return converted

    df['links'] = df['links'].apply(_image_links)

    return df[cols]

# Transform the environmental damages source and append its rows to the combined dataframe.
df = damagesEnvironment()
finalDf = pd.concat(objs = [finalDf, df], axis = 0, ignore_index = True)
print('Final dataframe shape: ' + str(finalDf.shape))
print('------------------------------------------------')


def damagesWeed():
    '''
    Load `weedItems.json` and reshape it into the combined schema.

    columns in source:
    name url description images
    final schema:
    source url title description identification development damage management links

    identification, development, damage and management are filled with
    empty strings. Each image entry becomes a link of type 'image' whose
    `src` is the entry's `link` value and whose `link` is left empty.

    Returns:
        DataFrame restricted to the module-level `cols`.
    '''
    # -------------------------------------------- Weed damages
    print('Merging weed damages...')
    source_file = 'weedItems.json'
    df = pd.read_json(_PATH + source_file)

    df['source'] = 'damagesWeed'
    for blank_column in ('identification', 'development', 'damage', 'management'):
        df[blank_column] = ''

    column_mapping = {'name': 'title', 'images': 'links'}
    df.rename(columns = column_mapping, inplace = True)

    def _image_links(images):
        converted = []
        for image in images:
            converted.append({
                'type'  : 'image',
                'src'   : image['link'],
                'link'  : '',
                'title' : image['caption'],
            })
        return converted

    df['links'] = df['links'].apply(_image_links)

    return df[cols]

# Transform the weed damages source and append its rows to the combined dataframe.
df = damagesWeed()
finalDf = pd.concat(objs = [finalDf, df], axis = 0, ignore_index = True)
print('Final dataframe shape: ' + str(finalDf.shape))
print('------------------------------------------------')


def infoFruits():
    '''
    Load `fruitItems_new.json` and reshape it into the combined schema.

    columns in source:
    name url cultural_tips pests_and_disorders
    final schema:
    source url title description identification development damage management links

    description, identification, development, damage and management are
    filled with empty strings. Cultural tips and pests/disorders are merged,
    in that order, into one `links` list per row. A row whose
    `cultural_tips` or `pests_and_disorders` value is not a list contributes
    no links of that kind.

    Returns:
        DataFrame restricted to the module-level `cols`.
    '''
    # -------------------------------------------- Fruits information
    print('Merging fruits information...')
    df = pd.read_json(_PATH + 'fruitItems_new.json')

    df['source'] = 'infoFruits'
    for blank_column in ('description', 'identification', 'development', 'damage', 'management'):
        df[blank_column] = ''

    df.rename(columns = {
        'name'          : 'title',
        'cultural_tips' : 'links',
    }, inplace = True)

    def _as_list(value):
        # Anything that is not a list (e.g. NaN for a missing key) means "no entries".
        return value if isinstance(value, list) else []

    def _tip(item):
        return {
            'type'  : 'tips'        ,
            'src'   : item['link']  ,
            'link'  : ''            ,
            'title' : item['tip']   ,
        }

    def _problem(item):
        return {
            'type'  : 'problem'         ,
            'src'   : item['link']      ,
            'link'  : ''                ,
            'title' : item['problem']   ,
        }

    # Build every row's link list as a new object instead of extending lists
    # in place from inside DataFrame.apply, which pandas does not support.
    # `pests_and_disorders` now gets the same non-list guard the tips already had.
    df['links'] = [
        [_tip(t) for t in _as_list(tips)] + [_problem(p) for p in _as_list(problems)]
        for tips, problems in zip(df['links'], df['pests_and_disorders'])
    ]

    return df[cols]

# Transform the fruits information source and append its rows to the combined dataframe.
df = infoFruits()
finalDf = pd.concat(objs = [finalDf, df], axis = 0, ignore_index = True)
print('Final dataframe shape: ' + str(finalDf.shape))
print('------------------------------------------------')


def infoVeggies():
    '''
    Load `veggieItems_new.json` and reshape it into the combined schema.

    columns in source:
    name url description tips images pests_and_disorders
    final schema:
    source url title description identification development damage management links

    identification, development and damage are filled with empty strings;
    `tips` becomes `management`. Images and pests/disorders are merged, in
    that order, into one `links` list per row.

    Returns:
        DataFrame restricted to the module-level `cols`.
    '''
    # -------------------------------------------- Veggies information
    print('Merging veggies information...')
    df = pd.read_json(_PATH + 'veggieItems_new.json')

    df['source'] = 'infoVeggies'
    for blank_column in ('identification', 'development', 'damage'):
        df[blank_column] = ''

    df.rename(columns = {
        'name'  : 'title'       ,
        'tips'  : 'management'  ,
        'images': 'links'       ,
    }, inplace = True)

    def _image(item):
        return {
            # Singular 'image' (was 'images') so image links carry the same
            # type value as the image links built by the pest/damage transforms.
            'type'  : 'image'           ,
            'src'   : item['src']       ,
            'link'  : item['link']      ,
            'title' : item['caption']   ,
        }

    def _problem(item):
        return {
            'type'  : 'problem'         ,
            'src'   : item['link']      ,
            'link'  : ''                ,
            'title' : item['problem']   ,
        }

    # Build every row's link list as a new object instead of extending lists
    # in place from inside DataFrame.apply, which pandas does not support.
    df['links'] = [
        [_image(i) for i in images] + [_problem(p) for p in problems]
        for images, problems in zip(df['links'], df['pests_and_disorders'])
    ]

    return df[cols]

# Transform the veggies information source and append its rows to the combined dataframe.
df = infoVeggies()
finalDf = pd.concat(objs = [finalDf, df], axis = 0, ignore_index = True)
print('Final dataframe shape: ' + str(finalDf.shape))
print('------------------------------------------------')


def infoFlowers():
    '''
    Load `plantFlowerItems.json` and reshape it into the combined schema.

    columns in source:
    name url identification optimum_conditions pests_and_disorders images
    final schema:
    source url title description identification development damage management links

    description, development and damage are filled with empty strings;
    `optimum_conditions` becomes `management`. Images and pests/disorders
    are merged, in that order, into one `links` list per row.

    Returns:
        DataFrame restricted to the module-level `cols`.
    '''
    # -------------------------------------------- Flowers information
    print('Merging flowers information...')
    df = pd.read_json(_PATH + 'plantFlowerItems.json')

    df['source'] = 'infoFlowers'
    for blank_column in ('description', 'development', 'damage'):
        df[blank_column] = ''

    df.rename(columns = {
        'name'              : 'title'       ,
        'optimum_conditions': 'management'  ,
        'images'            : 'links'       ,
    }, inplace = True)

    def _image(item):
        return {
            # Singular 'image' (was 'images') so image links carry the same
            # type value as the image links built by the pest/damage transforms.
            'type'  : 'image'           ,
            'src'   : item['src']       ,
            'link'  : item['link']      ,
            'title' : item['caption']   ,
        }

    def _problem(item):
        return {
            'type'  : 'problem'         ,
            'src'   : item['link']      ,
            'link'  : ''                ,
            'title' : item['problem']   ,
        }

    # Build every row's link list as a new object instead of extending lists
    # in place from inside DataFrame.apply, which pandas does not support.
    df['links'] = [
        [_image(i) for i in images] + [_problem(p) for p in problems]
        for images, problems in zip(df['links'], df['pests_and_disorders'])
    ]

    return df[cols]

# Transform the flowers information source and append its rows to the combined dataframe.
df = infoFlowers()
finalDf = pd.concat(objs = [finalDf, df], axis = 0, ignore_index = True)
print('Final dataframe shape: ' + str(finalDf.shape))
print('------------------------------------------------')


def _clean(text):
    '''
    Reduce text to ASCII and collapse escape / redundant whitespace characters.

    Whitespace is normalised BEFORE non-ASCII characters are dropped, so
    Unicode whitespace such as a non-breaking space becomes a plain space
    instead of being deleted and gluing the neighbouring words together.

    Parameters:
        text: the cell value to clean. `None` and NaN are treated as missing
              text; any other non-string value is converted with `str()`.

    Returns:
        The cleaned string; '' for missing values.
    '''
    # `text != text` is only true for NaN, which marks a missing cell.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    # `\s` on a str pattern also matches Unicode whitespace (e.g. \xa0).
    text = re.sub(r'\s+', ' ', text)
    text = text.encode('ascii', 'ignore').decode('ascii')
    # Dropping a non-ASCII character that sat between two spaces leaves a
    # double space behind, so collapse once more.
    return re.sub(r' {2,}', ' ', text).strip()

# Free-text columns that go through `_clean`.
colsVector = [
    'title',
    'description',
    'identification',
    'development',
    'damage',
    'management',
]
for column in colsVector:
    cleaned = finalDf[column].apply(_clean)
    finalDf[column] = cleaned

print('Fix encodings and remove escape and redundant whitespace characters from text.')
print('------------------------------------------------')

print('Final dataframe shape: {}'.format(finalDf.shape))
print('FINISHED')

# Preview five randomly chosen rows of the combined dataframe.
finalDf.sample(n = 5)

# IPM data - April 2022 Scrape

In [None]:
# File names noted for this scrape directory:
#   FruitVegCulturalItems.json
#   GardenControlsPestItems.json
#   GardenControlsPesticideItems.json
#   WeedIdItems.json
_PATH = '../data/uc-ipm/updated-Apr2022/'
DATA_FILE_NAMES = os.listdir(_PATH)
DATA_FILE_NAMES.sort()

## Fruit and veggie cultural items

In [None]:
# Load the fruit and veggie cultural items and print the dataframe summary.
FILE_NAME = 'FruitVegCulturalItems.json'
df = pd.read_json(f'{_PATH}{FILE_NAME}')
df.info()

In [None]:
# List every column in which fewer than half of the entries have length > 0.
print('Field with less than 50% non-null entries:')
for c in df.columns:
    filled = int((df[c].str.len() > 0).sum())
    share = filled / df[c].shape[0]
    if share < 0.5:
        print(f'{c:<20} - {share * 100:.0f}%')

In [None]:
# Preview five randomly chosen rows.
df.sample(n = 5)

In [None]:
# One histogram per column showing the distribution of entry lengths.
text_columns = ['name', 'description', 'images', 'tips_table']
fig, axes = plt.subplots(nrows = 1, ncols = 4, figsize = (30, 5))
for ax, col in zip(axes, text_columns):
    lengths = df[col].apply(len)
    lengths.hist(ax = ax, bins = 30, figure = fig)
    ax.set_title(col, fontdict = {'fontsize': 20})
plt.show()

### IPM Data - `FruitVegCulturalItems.json`

| column      | type                                       |
|-------------|--------------------------------------------|
| name        | string                                     |
| url         | string                                     |
| description | string                                     |
| images      | [{src: " ", caption: " "}]                 |
| tips_table  | [{header: " ", row: " "(, row: " ", ...)}] |

Example of the single JSON data entry:
```json
{
    "name": "Planting cucurbits",
    "url": "http://ipm.ucanr.edu/PMG/GARDEN/VEGES/CULTURAL/cantaloupeplant.html?src=exchbt",
    "description": "Cucurbits can be seeded directly or transplanted into the garden. It is ...",
    "images": [
        {
            "src": "http://ipm.ucanr.edu/PMG/GARDEN/IMAGES/CULTURAL/transplant.jpg?src=exchbt",
            "caption": "Transplanting"
        }
    ],
    "tips_table": [
        {
            "header": "Planting tips"
        },
        {
            "row": "Distance in inches "
        },
        {
            "row": "Between plants in rows Between rows "
        },
        {
            "row": "Cucumbers 24 48 "
        },
        {
            "row": "Melons 12 72 "
        },
        {
            "row": "Pumpkins 48 72 "
        },
        {
            "row": "Squash 48 48-72"
        }
    ]
}
```
### Metadata on data source

Information on fruits and veggies for cultivation purposes.

## Garden control - pest related items

In [None]:
# Load the garden control pest items and print the dataframe summary.
FILE_NAME = 'GardenControlsPestItems.json'
df = pd.read_json(f'{_PATH}{FILE_NAME}')
df.info()

In [None]:
# List every column in which fewer than half of the entries have length > 0.
print('Field with less than 50% non-null entries:')
for c in df.columns:
    filled = int((df[c].str.len() > 0).sum())
    share = filled / df[c].shape[0]
    if share < 0.5:
        print(f'{c:<20} - {share * 100:.0f}%')

In [None]:
# Preview five randomly chosen rows.
df.sample(n = 5)

In [None]:
# One histogram per column showing the distribution of entry lengths.
text_columns = ['name', 'description', 'images']
fig, axes = plt.subplots(nrows = 1, ncols = 3, figsize = (30, 5))
for ax, col in zip(axes, text_columns):
    lengths = df[col].apply(len)
    lengths.hist(ax = ax, bins = 30, figure = fig)
    ax.set_title(col, fontdict = {'fontsize': 20})
plt.show()

### IPM Data - `GardenControlsPestItems.json`

| column      | type                                       |
|-------------|--------------------------------------------|
| name        | string                                     |
| url         | string                                     |
| description | string                                     |
| images      | [{link: " ", src: " ", caption: " "}]      |

Example of the single JSON data entry:
```json
{
    "name": "Parasites",
    "url": "http://ipm.ucanr.edu/PMG/GARDEN/CONTROLS/parasites.html?src=exchbt",
    "description": "Insect parasites (parasitoids) are smaller than their hosts and develop inside, or attached to the ...",
    "images": [
        {
            "link": "http://ipm.ucanr.edu/PMG/S/I-LP-SCON-AS.001.html?src=exchbt",
            "src": "http://ipm.ucanr.edu/PMG/IMAGES/S/I-LP-SCON-AS.001a.jpg?src=exchbt",
            "caption": "Apanteles cocoons"
        },
        {
            "link": "",
            "src": "http://ipm.ucanr.edu/PMG/IMAGES/PESTICIDES/parasitelifecycle.jpg?src=exchbt",
            "caption": "Life cycle of a Hyposoter parasite"
        },
        {
            "link": "http://ipm.ucanr.edu/PMG/S/I-LP-SCON-HF.002.html?src=exchbt",
            "src": "http://ipm.ucanr.edu/PMG/IMAGES/S/I-LP-SCON-HF.002a.jpg?src=exchbt",
            "caption": "Redhumped caterpillars parasitized by Hyposoter wasps"
        }
    ]
}
```
### Metadata on data source

Information on pest control remedies from pest perspective.

## Garden control - pesticide related items

In [None]:
# Load the garden control pesticide items and print the dataframe summary.
FILE_NAME = 'GardenControlsPesticideItems.json'
df = pd.read_json(f'{_PATH}{FILE_NAME}')
df.info()

In [None]:
# List every column in which fewer than half of the entries have length > 0.
print('Field with less than 50% non-null entries:')
for c in df.columns:
    filled = int((df[c].str.len() > 0).sum())
    share = filled / df[c].shape[0]
    if share < 0.5:
        print(f'{c:<20} - {share * 100:.0f}%')

In [None]:
# Preview five randomly chosen rows.
df.sample(n = 5)

In [None]:
# One histogram per column showing the distribution of entry lengths.
text_columns = ['active_ingredient', 'pesticide_type', 'information']
fig, axes = plt.subplots(nrows = 1, ncols = 3, figsize = (30, 5))
for ax, col in zip(axes, text_columns):
    lengths = df[col].apply(len)
    lengths.hist(ax = ax, bins = 30, figure = fig)
    ax.set_title(col, fontdict = {'fontsize': 20})
plt.show()

In [None]:
# Length distribution of each field inside the first `information` entry,
# drawn on a 2 x 3 grid filled row by row.
information_columns = [
    "acute_toxicity",
    "long_term_toxicity",
    "water_quality_rating",
    "impact_on_natural_enemies",
    "impact_on_honey_bees",
    "associated_pests",
]
fig, axes = plt.subplots(nrows = 2, ncols = 3, figsize = (30, 10))
for ax, col in zip(axes.flat, information_columns):
    field_values = df['information'].apply(lambda entries: entries[0][col])
    field_values.apply(len).hist(ax = ax, bins = 30, figure = fig)
    ax.set_title(col, fontdict = {'fontsize': 20})
plt.show()

### IPM Data - `GardenControlsPesticideItems.json`

| column            | type                                                                                                                                                          |
|-------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------|
| active_ingredient | string                                                                                                                                                        |
| url               | string                                                                                                                                                        |
| pesticide_type    | string                                                                                                                                                        |
| information       | [{acute_toxicity: " ", long_term_toxicity: " ", water_quality_rating: " ", impact_on_natural_enemies: " ", impact_on_honey_bees: " ", associated_pests: " "}] |

Example of the single JSON data entry:
```json
{
    "active_ingredient": "Potassium bicarbonate",
    "url": "http://ipm.ucanr.edu/TOOLS/PNAI/pnaishow.php?id=60?src=exchbt",
    "pesticide_type": "fungicide",
    "information": [
        {
            "acute_toxicity": "Toxicity rating: No information",
            "long_term_toxicity": "On US EPA list: Not listed; On CA Proposition list: Not listed",
            "water_quality_rating": "Overall runoff risk rating: No information Notes: No information available",
            "impact_on_natural_enemies": "Overall toxicity rating: No information",
            "impact_on_honey_bees": "Toxicity category: No information",
            "associated_pests": "Powdery Mildew on Ornamentals, Roses in the Garden and Landscape: Diseases and Abiotic Disorders, powdery mildew"
        }
    ]
}
```
### Metadata on data source

Information on pesticide control remedies with details.

## Weed Items

In [None]:
# Load the weed identification items and print the dataframe summary.
FILE_NAME = 'WeedIdItems.json'
df = pd.read_json(f'{_PATH}{FILE_NAME}')
df.info()

In [None]:
# List every column in which fewer than half of the entries have length > 0.
print('Field with less than 50% non-null entries:')
for c in df.columns:
    filled = int((df[c].str.len() > 0).sum())
    share = filled / df[c].shape[0]
    if share < 0.5:
        print(f'{c:<20} - {share * 100:.0f}%')

In [None]:
# Preview five randomly chosen rows.
df.sample(n = 5)

In [None]:
# One histogram per column showing the distribution of entry lengths.
text_columns = ['name', 'description', 'images']
fig, axes = plt.subplots(nrows = 1, ncols = 3, figsize = (30, 5))
for ax, col in zip(axes, text_columns):
    lengths = df[col].apply(len)
    lengths.hist(ax = ax, bins = 30, figure = fig)
    ax.set_title(col, fontdict = {'fontsize': 20})
plt.show()

### IPM Data - `WeedIdItems.json`

| column      | type                                       |
|-------------|--------------------------------------------|
| name        | string                                     |
| url         | string                                     |
| description | string                                     |
| images      | [{src: " ", caption: " "}]                 |

Example of a single JSON data entry:
```json
{
    "name": "Growth habit",
    "url": "http://ipm.ucanr.edu/PMG/WEEDS/ID/broadhabit.html?src=exchbt",
    "description": "Broadleaves may grow prostrate and form a mat or they may grow upright.",
    "images": [
        {
            "src": "http://ipm.ucanr.edu/TOOLS/TURF/IMAGES/PESTMANIM/prostrate_upright.jpg?src=exchbt",
            "caption": "A plant growing prostrate and upright"
        }
    ]
}
```
### Metadata on data source

Information on weed pests.

## Final Transformation



### Fruit and Veggie Cultural

```json
{
    "name": "Planting cucurbits",                                                                   # title
    "url": "http://ipm.ucanr.edu/PMG/GARDEN/VEGES/CULTURAL/cantaloupeplant.html?src=exchbt",        # url
    "description": "Cucurbits can be seeded directly or transplanted into the garden. It is ...",   # description
    "images": [                                                                                     # links (type - images)
        {
            "src": "http://ipm.ucanr.edu/PMG/GARDEN/IMAGES/CULTURAL/transplant.jpg?src=exchbt",
            "caption": "Transplanting"
        }
    ],
    "tips_table": [                                                                                 # omit for now
        {
            "header": "Planting tips"
        },
        {
            "row": "Distance in inches "
        },
        {
            "row": "Between plants in rows Between rows "
        },
        {
            "row": "Cucumbers 24 48 "
        },
        {
            "row": "Melons 12 72 "
        },
        {
            "row": "Pumpkins 48 72 "
        },
        {
            "row": "Squash 48 48-72"
        }
    ]
}
```
### Garden Pest Control
```json
{
    "name": "Parasites",                                                                                                        # title
    "url": "http://ipm.ucanr.edu/PMG/GARDEN/CONTROLS/parasites.html?src=exchbt",                                                # url
    "description": "Insect parasites (parasitoids) are smaller than their hosts and develop inside, or attached to the ...",    # description
    "images": [                                                                                                                 # links (type - images)
        {
            "link": "http://ipm.ucanr.edu/PMG/S/I-LP-SCON-AS.001.html?src=exchbt",
            "src": "http://ipm.ucanr.edu/PMG/IMAGES/S/I-LP-SCON-AS.001a.jpg?src=exchbt",
            "caption": "Apanteles cocoons"
        },
        {
            "link": "",
            "src": "http://ipm.ucanr.edu/PMG/IMAGES/PESTICIDES/parasitelifecycle.jpg?src=exchbt",
            "caption": "Life cycle of a Hyposoter parasite"
        },
        {
            "link": "http://ipm.ucanr.edu/PMG/S/I-LP-SCON-HF.002.html?src=exchbt",
            "src": "http://ipm.ucanr.edu/PMG/IMAGES/S/I-LP-SCON-HF.002a.jpg?src=exchbt",
            "caption": "Redhumped caterpillars parasitized by Hyposoter wasps"
        }
    ]
}
```
### Garden Pesticide Control
```json
{
    "active_ingredient": "Potassium bicarbonate",                                                                       # title
    "url": "http://ipm.ucanr.edu/TOOLS/PNAI/pnaishow.php?id=60?src=exchbt",                                             # url
    "pesticide_type": "fungicide",                                                                                      # merge with title
    "information": [                                                                                                    # for now simply take `associated_pests` as description
        {
            "acute_toxicity": "Toxicity rating: No information",
            "long_term_toxicity": "On US EPA list: Not listed; On CA Proposition list: Not listed",
            "water_quality_rating": "Overall runoff risk rating: No information Notes: No information available",
            "impact_on_natural_enemies": "Overall toxicity rating: No information",
            "impact_on_honey_bees": "Toxicity category: No information",
            "associated_pests": "Powdery Mildew on Ornamentals, Roses in the Garden and Landscape: Diseases and ..."    # description
        }
    ]
}
```
### Weed Pests
```json
{
    "name": "Growth habit",                                                                                 # title
    "url": "http://ipm.ucanr.edu/PMG/WEEDS/ID/broadhabit.html?src=exchbt",                                  # url
    "description": "Broadleaves may grow prostrate and form a mat or they may grow upright.",               # description
    "images": [                                                                                             # links (type - images)
        {
            "src": "http://ipm.ucanr.edu/TOOLS/TURF/IMAGES/PESTMANIM/prostrate_upright.jpg?src=exchbt",
            "caption": "A plant growing prostrate and upright"
        }
    ]
}
```


### Final mapping

`Combined` index:
```json
{
    "source"        : "infoFruitVegCultural/infoPestControl/infoPesticideControl/pestsWeed",
    "name"          : "text",
    "url"           : "url",
    "description"   : "text",
    "links": [
        {
            "type"      : "images",
            "title"     : "...",
            "src"       : "urlSource",
            "link"      : "urlAdditional"
        },
        ...
    ]
    # columns from IPM Dec2021
    # identification
    # development
    # damage
    # management

    # columns from AskExtension
    # source    : askextension
    # title     : name
    # question  : description
    # tags      : links (type - tag)
    # answers   : links (type - answer)
}
```

## ETL of data

In [None]:
# Column order of the final (combined) schema; each loader below returns exactly these columns.
cols = ['source', 'url', 'title', 'description', 'identification', 'development', 'damage', 'management', 'links']

def infoFruitVegCultural():
    '''
    Load the fruit and veggie cultural tips and reshape them to the final schema.

    columns in source:
    name url description images tips_table
    final schema:
    source url title description identification development damage management links

    `name` becomes `title`, `images` become `links` entries of type `image`,
    the four section columns are filled with empty strings and `tips_table`
    is left out by the final column selection.

    Reads `_PATH` and `cols` from the notebook scope.

    Returns:
        DataFrame holding exactly the columns listed in `cols`.
    '''
    # -------------------------------------------- Fruit and veggie cultural tips
    print(f'Merging fruit and veggie cultural tips..')
    FILE_NAME = 'FruitVegCulturalItems.json'
    df = pd.read_json(_PATH + FILE_NAME)

    df['source'] = 'infoFruitVegCultural'

    df.rename(columns = {
        'name'  : 'title',
        'images': 'links'
    }, inplace = True)

    # A record scraped without images does not hold a list (e.g. a missing
    # value); treat it as "no links" instead of failing while iterating it.
    # Same guard as in infoPestControl.
    df['links'] = df['links'].apply(lambda d: d if isinstance(d, list) else [])

    # Missing `src` / `caption` keys fall back to an empty string so that a
    # single incomplete image entry does not abort the whole load.
    df['links'] = df['links'].apply(lambda l: [
        {
            'type'  : 'image'               ,
            'src'   : i.get('src'       , ''),
            'link'  : ''                    ,
            'title' : i.get('caption'   , '')
        } for i in l])

    # These sections do not exist for this source
    df['identification' ] = ''
    df['development'    ] = ''
    df['damage'         ] = ''
    df['management'     ] = ''

    df = df[cols]

    return df

# Append the cultural tips to the combined dataframe.
# `finalDf` is not created in this cell, so it must already exist when this runs.
df      = infoFruitVegCultural()
finalDf = pd.concat([finalDf, df], ignore_index = True, axis = 0)
print(f'Final dataframe shape: {finalDf.shape}')
print(f'------------------------------------------------')


def infoPestControl():
    '''
    Load the garden pest control items and reshape them to the final schema.

    columns in source:
    name url description images
    final schema:
    source url title description identification development damage management links

    Reads `_PATH` and `cols` from the notebook scope and returns a DataFrame
    holding exactly the columns listed in `cols`.
    '''
    # -------------------------------------------- Garden pest control
    print(f'Merging garden pest control information...')
    frame = pd.read_json(_PATH + 'GardenControlsPestItems.json')

    frame['source'] = 'infoPestControl'
    frame = frame.rename(columns = {'name': 'title', 'images': 'links'})

    def _as_links(images):
        # Anything that is not a list (e.g. a missing value) yields no links
        if not isinstance(images, list):
            images = []
        return [
            {
                'type'  : 'image',
                'src'   : image.get('src', ''),
                'link'  : image.get('link', ''),
                'title' : image.get('caption', ''),
            }
            for image in images
        ]

    frame['links'] = frame['links'].apply(_as_links)

    # Sections that this source does not provide
    for blank_column in ('identification', 'development', 'damage', 'management'):
        frame[blank_column] = ''

    return frame[cols]

# Append the garden pest control rows to the combined dataframe (`finalDf` must already exist).
df      = infoPestControl()
finalDf = pd.concat([finalDf, df], ignore_index = True, axis = 0)
print(f'Final dataframe shape: {finalDf.shape}')
print(f'------------------------------------------------')


def infoPesticideControl():
    '''
    Load the garden pesticide control items and reshape them to the final schema.

    columns in source:
    active_ingredient url pesticide_type information
    final schema:
    source url title description identification development damage management links

    The title is "<active_ingredient> - <pesticide_type>", the description is
    the `associated_pests` text of the first `information` entry, and every
    row gets an empty `links` list.

    Reads `_PATH` and `cols` from the notebook scope and returns a DataFrame
    holding exactly the columns listed in `cols`.
    '''
    # -------------------------------------------- Garden pesticide control
    print(f'Merging garden pesticide control information...')
    frame = pd.read_json(_PATH + 'GardenControlsPesticideItems.json')

    def _associated_pests(first_info):
        return first_info['associated_pests']

    first_information = frame['information'].str[0]
    title_parts = frame[['active_ingredient', 'pesticide_type']]

    frame['source'] = 'infoPesticideControl'
    frame['title'] = title_parts.agg(' - '.join, axis=1)
    frame['description'] = first_information.apply(_associated_pests)
    # One independent empty list per row
    frame['links'] = [[] for _ in range(len(frame))]

    # Sections that this source does not provide
    for blank_column in ('identification', 'development', 'damage', 'management'):
        frame[blank_column] = ''

    return frame[cols]

# Append the pesticide control rows to the combined dataframe (`finalDf` must already exist).
df      = infoPesticideControl()
finalDf = pd.concat([finalDf, df], ignore_index = True, axis = 0)
print(f'Final dataframe shape: {finalDf.shape}')
print(f'------------------------------------------------')


def pestsWeed():
    '''
    Load the weed identification items and reshape them to the final schema.

    columns in source:
    name url description images
    final schema:
    source url title description identification development damage management links

    `name` becomes `title`, `images` become `links` entries of type `image`
    and the four section columns are filled with empty strings.

    Reads `_PATH` and `cols` from the notebook scope.

    Returns:
        DataFrame holding exactly the columns listed in `cols`.
    '''
    # -------------------------------------------- Weed related pests
    print(f'Merging weed related pests...')
    FILE_NAME = 'WeedIdItems.json'
    df = pd.read_json(_PATH + FILE_NAME)

    df['source'] = 'pestsWeed'

    df.rename(columns = {
        'name'  : 'title',
        'images': 'links'
    }, inplace = True)

    # A record scraped without images does not hold a list (e.g. a missing
    # value); treat it as "no links" instead of failing while iterating it.
    # Same guard as in infoPestControl.
    df['links'] = df['links'].apply(lambda d: d if isinstance(d, list) else [])

    # Missing `src` / `caption` keys fall back to an empty string so that a
    # single incomplete image entry does not abort the whole load.
    df['links'] = df['links'].apply(lambda l: [
        {
            'type'  : 'image'               ,
            'src'   : i.get('src'       , ''),
            'link'  : ''                    ,
            'title' : i.get('caption'   , '')
        } for i in l])

    # These sections do not exist for this source
    df['identification' ] = ''
    df['development'    ] = ''
    df['damage'         ] = ''
    df['management'     ] = ''

    df = df[cols]

    return df

# Append the weed rows to the combined dataframe (`finalDf` must already exist).
df      = pestsWeed()
finalDf = pd.concat([finalDf, df], ignore_index = True, axis = 0)
print(f'Final dataframe shape: {finalDf.shape}')
print(f'------------------------------------------------')

def _clean(text):
    '''
    Fix encodings and remove escape and redundant whitespace characters from text.
    '''
    text = text.encode('ascii', 'ignore').decode()
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Normalise the free-text columns of everything merged into `finalDf` so far.
colsVector = ['title', 'description']
for c in colsVector:
    finalDf[c] = finalDf[c].apply(_clean)

print(f'Fix encodings and remove escape and redundant whitespace characters from text.')
print(f'------------------------------------------------')

print(f'Final dataframe shape: {finalDf.shape    }')
print(f'FINISHED')

# Last expression of the cell: five random rows for a visual check.
finalDf.sample(5)

# UC IPM YouTube data

In [None]:
# Load the UC IPM YouTube video list and look at five random rows.
PATH = '../data/uc-ipm/youtube/videos.json'
df = pd.read_json(PATH)
df.sample(5)

## Final Transformation

```json
[
  {
    "title": "Urban pesticide mitigation",                                                                  # title
    "url": "https://www.youtube.com/watch?v=YchTkICHVjY&list=PLo3rG4iqv4gHjjR2KRoesoN7yRy94YlbM&index=6",   # url
    "description": "Non-toxic environmentally friendly alternatives are available for managing most ..."    # description
  },
  ...
]
```

## ETL of data

In [None]:
# Column order of the final (combined) schema; `getYoutubeData` returns exactly these columns.
cols = ['source', 'url', 'title', 'description', 'identification', 'development', 'damage', 'management', 'links']

def getYoutubeData():
    '''
    Read the UC IPM YouTube video list and shape it to the final schema.

    The four section columns are set to empty strings and every row gets an
    empty `links` list. `url`, `title` and `description` are taken from the
    file as they are, so they have to be present there for the final column
    selection to succeed.

    Reads `cols` from the notebook scope and returns a DataFrame holding
    exactly the columns listed in `cols`.
    '''
    print(f'Merging UC IPM YouTube data...')

    videos = pd.read_json('../data/uc-ipm/youtube/videos.json')

    videos['source'] = 'youtube'

    # Sections that this source does not provide
    for blank_column in ('identification', 'development', 'damage', 'management'):
        videos[blank_column] = ''

    # One independent empty list per row
    videos['links'] = [[] for _ in range(len(videos))]

    return videos[cols]

# Append the YouTube rows to the combined dataframe (`finalDf` must already exist).
df      = getYoutubeData()
finalDf = pd.concat([finalDf, df], ignore_index = True, axis = 0)
print(f'Final dataframe shape: {finalDf.shape    }')

# Last expression of the cell: five random rows for a visual check.
finalDf.sample(5)

# AskExtension Data

In [None]:
import json

PATH = '../data/askextension/2020-08-20/'
# Every entry of the directory, in sorted order, prefixed with the directory path
FILE_NAMES = [PATH + f for f in sorted(os.listdir(PATH))]

# Pretty-print the first record of the first file to inspect the raw ticket
# structure. The encoding is given explicitly so that reading does not depend
# on the platform default, and the parsed data gets its own name instead of
# rebinding the file handle inside the `with` block.
with open(FILE_NAMES[0], encoding = 'utf-8') as fp:
    tickets = json.load(fp)
print(json.dumps(tickets[0], indent = 2))

## Data

The data consists of 4 files:
- 2012-2014.json
- 2014-2016.json
- 2016-2018.json
- 2018-2020.json

Each file is a list of dictionary objects with the following fields:
- `faq-id` - ID of the ticket
- `title` - title of the ticket along with the ID of the ticket (__other__ ID)
- `created` - ticket creation date
- `updated` - ticket last update date
- `tags` - list of tags
- `state` - state ticket was created in
- `county` - county ticket was created in
- `question` - question that has been posted
- `answer` - response lists presented in numbered dictionary data type

## Final mapping

`Merged` index:
```json
{
    "source"        : "askExtension",
    "name"          : "text",           # title
    "url"           : "url",            # url
    "description"   : "text",           # title_question
    "links": [
        {
            "type"      : "tag/answer",
            "title"     : "...",
            "src"       : "urlSource",
            "link"      : "urlAdditional"
        },
        ...
    ]
    # columns from AskExtension
    # identification
    # development
    # damage
    # management
}
```

## ETL

In [None]:
import sys
import re

from string import punctuation as pn

# Modify STATE_FILTER and MIN_WORD_COUNT variables accordingly
STATE_FILTER    = ['California']
MIN_WORD_COUNT  = 3

ASKEXTENSION_QUESTION_URL = 'https://ask2.extension.org/kb/faq.php?id='

# Combine the data files into one dataframe with a single concat call;
# concatenating inside the loop would re-copy the accumulated frame per file.
df = pd.concat([pd.read_json(f) for f in FILE_NAMES], ignore_index = True, axis = 0)

df['source'] = 'askExtension'

# Convert 'faq-id' to str type
df['faq-id'] = df['faq-id'].astype(str)

# Keep only the tickets from the selected states. The explicit copy makes the
# column assignments below write to an independent frame rather than to a
# slice of the unfiltered one.
df = df[df['state'].isin(STATE_FILTER)].copy()

# The ticket number is whatever follows the last '#' in the title; only
# 6-character values are accepted, anything else is left blank.
ticket_numbers = [
    ticket_no if len(ticket_no) == 6 else ""
    for ticket_no in df['title'].str.split('#').str[-1]
]

# Add the URL and leave blank URL for questions with no ID
df['url'] = [
    f"{ASKEXTENSION_QUESTION_URL}{ticket_no}" if ticket_no else ""
    for ticket_no in ticket_numbers
]

# Add the ticket number from title and leave blank for questions without
df['ticket-no'] = ticket_numbers

df.rename(columns = {'faq-id': 'faq_id', 'ticket-no': 'ticket_no'}, inplace = True)

def _clean(text):
    '''
    Fix encodings and remove escape and redundant whitespace characters from text.

    Examples with non-ascii characters - 110358, 147160
    Examples with redundant whitespace - 117069, 127760

    See: https://stackoverflow.com/a/53821967/5480536
    '''
    text = text.encode('ascii', 'ignore').decode()
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def _transform_answer(answer_dict):
    '''
    Convert answer field from a numbered dictionary to an ordered list of links.

    Args:
        answer_dict: mapping whose keys are the answer numbers (anything
            `int()` accepts) and whose values are dicts with a `response` text.

    Returns:
        One `{'type': 'answer', 'src': '', 'link': '', 'title': <cleaned
        response>}` dict per answer, ordered by answer number.

    The answers are ordered by sorting on the numeric key instead of writing
    to position `int(key) - 1` of a pre-sized list: positional writes raise
    IndexError when the numbering has gaps and silently misplace an answer
    when the numbering starts at 0.
    '''
    ordered = sorted(answer_dict.items(), key = lambda item: int(item[0]))

    return [
        {
            'type'  : 'answer'  ,
            'src'   : ''        ,
            'link'  : ''        ,
            # clean the response up
            'title' : _clean(answer['response']),
        }
        for _, answer in ordered
    ]

# Turn the numbered `answer` dictionaries into `links` entries of type
# `answer`, the same link structure the IPM loaders produce
df['links'] = df['answer'].apply(_transform_answer)

# Drop non-ascii characters and collapse redundant whitespace in the text fields
for column in ['state', 'title', 'question']:
    df[column] = df[column].apply(_clean)

def _transform_title(title):
    '''
    Remove question ID from title, and append '.' in the end
    if no punctuation was detected.

    Example with '#' - 437259
    Example with '...' - 437264
    '''
    title = ''.join(title.split('#')[:-1]).strip().strip('...')
    
    # add a '.' if it does not yet end with a punctuation
    title = title if (title and title[-1] in pn) else title + '.'
    
    return title

# Remove the ticket ID and surrounding '...' from each title, and make sure
# the title ends with a punctuation character
df['title'] = df['title'].apply(_transform_title)

def _merge_title_question(df):
    '''
    Create new column from questions and title,
    but only if it is not already exactly in the question.
    '''
    titles      = df['title'    ].tolist()
    questions   = df['question' ].tolist()
    
    tqs = [
        question
        if (title and question.startswith(title[:-1]))
        else title + " " + question
        for (title, question) in zip(titles, questions)
    ]

    return tqs

# Create new column from `title` and `question`, or only question
# if the question already starts with the title
df['description'] = _merge_title_question(df)
    
# Keep only the rows whose description has more than MIN_WORD_COUNT
# whitespace-separated words (skipped when MIN_WORD_COUNT is 0)
if MIN_WORD_COUNT:
    df = df[df['description'].str.split().str.len() > MIN_WORD_COUNT]

# Keep only the columns that map onto the final schema
df = df.loc[:, ['source', 'url', 'title', 'description', 'links']]
# NOTE(review): the result of this expression is not assigned or printed
df.sample(5)

# Column order of the final (combined) schema
cols = ['source', 'url', 'title', 'description', 'identification', 'development', 'damage', 'management', 'links']
'''
columns in source:
source url name description links
final schema:
source url title description identification development damage management links    
'''
# These sections do not exist for AskExtension tickets
df['identification' ] = ''
df['development'    ] = ''
df['damage'         ] = ''
df['management'     ] = ''
df = df[cols]

# Append the tickets to the combined dataframe (`finalDf` must already exist)
finalDf = pd.concat([finalDf, df], ignore_index = True, axis = 0)
print(f'Final dataframe shape: {finalDf.shape}')
print(f'------------------------------------------------')

# Last expression of the cell: five random rows for a visual check.
finalDf.sample(5)

# Embedding text fields into vectors and stripping text fields for saving into ES

In [None]:
import sys

# Put the parent directory on the import path; presumably this is where the
# `config` module imported right after this block lives - TODO confirm
sys.path.insert(1, os.path.realpath(os.path.pardir))

# Environment variables are set before `config` is imported.
# NOTE(review): the Elasticsearch credentials are hard-coded here and
# overwrite whatever is already set in the environment - consider supplying
# them from outside the notebook.
os.environ['STAGE'          ] = 'dev'
os.environ['ES_USERNAME'    ] = 'elastic'
os.environ['ES_PASSWORD'    ] = 'changeme'
os.environ['TF_CACHE_DIR'   ] = '/var/tmp/models'
## select the environment for population (exactly one ES_HOST line should be active)
# os.environ['ES_HOST'    ] = 'http://localhost:9200/'
# os.environ['ES_HOST'    ] = 'https://dev.es.chat.ask.eduworks.com/'
os.environ['ES_HOST'    ] = 'https://qa.es.chat.ask.eduworks.com/'

import config

In [None]:
# Re-execute the already imported `config` module, e.g. after the environment
# variables above were changed.
import importlib
importlib.reload(config)

### Embedding by splitting text to `CHUNK_SIZE` sentences chunks

In [None]:
# ---------------------------------------- Dataframe embeddings - text fields
# Every text column is split into sentences, the sentences are grouped into
# overlapping windows and each window is embedded. The titles of a row's links
# are embedded as well. The result is stored per row in `finalDf['vectors']`.
CHUNK_SIZE      = 1         # step, in sentences, between the starts of two consecutive windows
ROLLING_SIZE    = 3         # a window spans CHUNK_SIZE + (ROLLING_SIZE - 1) sentences
BATCH_SIZE      = 32768     # batch size handed to the encoder

colsVector = ['title', 'description', 'identification', 'development', 'damage', 'management']
print(f'Final DF: Embedding columns - {colsVector} and links titles.')

from spacy.lang.en import English

# Blank English pipeline with only the sentence splitter added
nlp = English()
nlp.add_pipe('sentencizer')
# Not used below
raw_text = finalDf.iloc[0]['description']


def _sentence_windows(text):
    '''
    Split `text` into sentences and group them into overlapping windows.

    Returns a list of {'text', 'start', 'end'} dicts, where `start` / `end`
    are the character offsets of the window inside `text`. Text without any
    sentence yields an empty list.
    '''
    sentences = list(nlp(text).sents)
    if not sentences:
        return []

    window_length = CHUNK_SIZE + (ROLLING_SIZE - 1)
    # At least one window: a text with fewer sentences than a full window
    # (e.g. a one-sentence title) is embedded as one shorter window instead
    # of being left without any vector.
    start_limit = max(len(sentences) - (ROLLING_SIZE - 1), 1)
    windows = [
        sentences[first:first + window_length]
        for first in range(0, start_limit, CHUNK_SIZE)
    ]

    return [
        {
            'text'  : ' '.join(sentence.text for sentence in window),
            'start' : window[0].start_char,
            'end'   : window[-1].end_char,
        }
        for window in windows
    ]


def _embed(texts):
    '''
    Encode a list of strings and return one vector (list of floats) per string.
    '''
    if not texts:
        return []

    # TF HUB model - USE
    # return config.embed(texts).numpy().tolist()

    # Sentence Encoder model - paraphrase-MiniLM-L6-v2
    return config.embed.encode(
        sentences           = texts     ,
        batch_size          = BATCH_SIZE,
        show_progress_bar   = False
    ).tolist()


# The vectors are collected in a list and assigned to the dataframe once after
# the loop. Writing to the row object yielded by iterrows() is not reliable:
# pandas documents that the row may be a copy, in which case the write never
# reaches the dataframe.
all_vectors = []

for i, r in finalDf.iterrows():
    r_vectors = []

    for c in colsVector:
        windows = _sentence_windows(r[c])
        vectors = _embed([window['text'] for window in windows])
        for position, (window, vector) in enumerate(zip(windows, vectors)):
            r_vectors.append({
                'vector': vector,
                'name'  : c + '_' + str(position),
                'start' : window['start'],
                'end'   : window['end']
            })

    # Link titles are embedded together with the row title for context;
    # they have no position inside a text field, hence start 0 / end -1
    link_titles = [r['title'] + ' - ' + link['title'] for link in r['links']]
    for position, vector in enumerate(_embed(link_titles)):
        r_vectors.append({'vector': vector, 'name': 'links_' + str(position), 'start': 0, 'end': -1})

    all_vectors.append(r_vectors)

    if (i+1) % 500 == 0:
        print(f'Finished embedding of {i+1} rows of dataframe')

# dtype=object keeps each row's list of vector dicts as a single cell
finalDf['vectors'] = pd.Series(all_vectors, index = finalDf.index, dtype = object)

print(f'The number of vectors to be ingested: {sum(len(row_vectors) for row_vectors in finalDf["vectors"])}')
finalDf.sample(5)

#### Sentence chunks

Sentence chunk size and the resulting number of vectors:

| Sentence Chunk Size (for embedding) | Number of vectors |
|-------------------------------------|-------------------|
| 1                                   | 66444             |
| 3                                   | 34724             |
| 1 (rolling 2 sentences)             | 57262             |
| 1 (rolling 3 sentences)             | 52092             |



## Ingesting data into ES

In [None]:
# embedding size for model USE
# VECTOR_SIZE = 512

# embedding size for model paraphrase-MiniLM-L6-v2 
# (must equal the length of the vectors stored in `finalDf['vectors']`)
VECTOR_SIZE = 384

# Index definition handed to Elasticsearch when the combined index is created.
# Only `source` is declared with "index": "true"; every other keyword field is
# declared with "index": "false".
mapping  = {
    "settings": {"number_of_shards": 2, "number_of_replicas": 1},
    "mappings": {
        "dynamic"   : "false",
        "_source"   : {"enabled": "true"},
        "properties": {
            "source"        : {"type": "keyword", "index": "true" , "ignore_above": 32766},
            "url"           : {"type": "keyword", "index": "false", "ignore_above": 32766},

            # Text columns of the final schema
            "title"         : {"type": "keyword", "index": "false", "ignore_above": 32766},
            "description"   : {"type": "keyword", "index": "false", "ignore_above": 32766},
            "identification": {"type": "keyword", "index": "false", "ignore_above": 32766},
            "development"   : {"type": "keyword", "index": "false", "ignore_above": 32766},
            "damage"        : {"type": "keyword", "index": "false", "ignore_above": 32766},
            "management"    : {"type": "keyword", "index": "false", "ignore_above": 32766},
            # One nested entry per embedded text window / link title:
            # the vector itself, its name and its start / end offsets
            "vectors"       : {
                "type"      : "nested",
                "properties": {
                    "vector": {
                        "type": "dense_vector", 
                        "dims": VECTOR_SIZE
                    },
                    "name"  : {"type": "keyword", "index": "false", "ignore_above": 32766},
                    "start" : {"type": "integer"                                         },
                    "end"   : {"type": "integer"                                         },
                }
            },
            
            # One nested entry per link (type / src / link / title)
            "links"         : {
                "type"      : "nested",
                "properties": {
                    "type"  : {"type": "keyword", "index": "false", "ignore_above": 32766},
                    "src"   : {"type": "keyword", "index": "false", "ignore_above": 32766},
                    "link"  : {"type": "keyword", "index": "false", "ignore_above": 32766},
                    "title" : {"type": "keyword", "index": "false", "ignore_above": 32766}
                }
            }
        }
    }
}

# One dict per dataframe row, keyed by column name; these are the documents to ingest
final_json = finalDf.to_dict('records')

In [None]:
from elasticsearch import Elasticsearch
from elasticsearch.helpers import parallel_bulk

from collections import deque

# Connect with the host and credentials exposed by `config`
# increase the timeout if necessary
es_client = Elasticsearch([config.es_host], http_auth=(config.es_username, config.es_password), timeout = 20)

# Drop the combined index (a 404 for a missing index is ignored) and recreate
# it from `mapping`, so every run repopulates the index from scratch.
# NOTE(review): this is destructive for whichever ES_HOST is currently selected.
es_client.indices.delete(
    index   = config.es_combined_index, 
    ignore  = 404)
es_client.indices.create(
    index       = config.es_combined_index  , 
    settings    = mapping['settings']       , 
    mappings    = mapping['mappings']       )
# play with chunk size parameter for timed out problem
# deque(..., maxlen = 0) consumes everything parallel_bulk yields without keeping any of it
deque(parallel_bulk(es_client, actions = final_json, index = config.es_combined_index, max_chunk_bytes = 5 * 1024 * 1024), maxlen = 0)

# NOTE(review): refresh() is called without an index argument
es_client.indices.refresh()