# EDA of data

In [None]:
import json

import pandas as pd
import matplotlib.pyplot as plt

from pathlib import Path

# Draw all figure text (titles, axis labels, tick labels) in a single colour;
# white is presumably chosen for a dark notebook theme - confirm.
COLOR = 'white'
plt.rcParams.update({
    'text.color'     : COLOR,
    'axes.labelcolor': COLOR,
    'xtick.color'    : COLOR,
    'ytick.color'    : COLOR,
})

## UC IPM data - December 2021 Scrape

In [None]:
# Directory that holds the UC IPM JSON files explored below.
_PATH = Path('../data/uc-ipm/scrape_cleaned_Dec2021/')
# Despite the name, these are the directory entries as Path objects, in sorted order.
DATA_FILE_NAMES = list(_PATH.iterdir())
DATA_FILE_NAMES.sort()
# Show just the bare file names as the cell output.
[entry.name for entry in DATA_FILE_NAMES]

The list of files should be as follows:
```python
['exoticPests.json',
 'fruitItems_new.json',
 'fruitVeggieEnvironItems_new.json',
 'pestDiseaseItems_new.json',
 'plantFlowerItems.json',
 'turfPests.json',
 'veggieItems_new.json',
 'weedItems.json']
```

The corresponding EDA for these sources (links):
* [`exoticPests.json`](#exoticpestsjson)
* [`fruitItems_new.json`](#fruititems_newjson)
* [`fruitVeggieEnvironItems_new.json`](#fruitveggieenvironitems_newjson)
* [`pestDiseaseItems_new.json`](#pestdiseaseitems_newjson)
* [`plantFlowerItems.json`](#plantfloweritemsjson)
* [`turfPests.json`](#turfpestsjson)
* [`veggieItems_new.json`](#veggieitems_newjson)
* [`weedItems.json`](#weeditemsjson)

### Pests - exotic types
<a id='exoticpestsjson'></a>

In [None]:
# Read `exoticPests.json` into `df` and print the per-column summary.
FILE_NAME = 'exoticPests.json'
df = pd.read_json(_PATH / FILE_NAME)
df.info()

In [None]:
# Report every column in which fewer than half of the rows hold a value of length > 0.
print('Field with less than 50% non-null entries:')
for col in df.columns:
    # Fraction of rows whose entry is non-empty; computed once and reused below.
    filled_ratio = (df[col].str.len() > 0).sum() / len(df)
    if filled_ratio < 0.5:
        print(f'{col:<20} - {filled_ratio * 100:.0f}%')

In [None]:
# Show five random rows; the fixed seed keeps the displayed sample stable across re-runs.
df.sample(n=5, random_state=0)

In [None]:
# Histogram of entry lengths (via `len`) for each text column, one panel per column.
text_columns = ['name', 'description', 'damage', 'identification', 'life_cycle', 'monitoring', 'management']
fig, axes = plt.subplots(3, 3, figsize = (30, 15))
panels = axes.ravel()  # row-major order, same layout as indexing by (i // 3, i % 3)
for ax, col in zip(panels, text_columns):
    df[col].apply(len).hist(figure = fig, bins = 20, ax = ax)
    ax.set_title(col, fontdict = {'fontsize': 20})
# The 3x3 grid has more panels than there are columns to plot; hide the empty ones.
for unused_ax in panels[len(text_columns):]:
    unused_ax.set_visible(False)
plt.show()

__Exotic pests - `exoticPests.json`__

| column         | type                                  |
|----------------|---------------------------------------|
| name           | string                                |
| url            | string                                |
| description    | string                                |
| damage         | string                                |
| identification | string                                |
| life_cycle     | string                                |
| monitoring     | string                                |
| management     | string                                |
| related_links  | [{text: " ", link: " "}]              |
| images         | [{link: " ", src: " ", caption: " "}] |

Example of the single JSON data entry:
```json
{
    "name": "European Grapevine Moth",
    "url": "https://www2.ipm.ucanr.edu/Invasive-and-Exotic-Pests/European-grapevine-moth/?src=exchbt",
    "description": "Lobesia botrana, the European grapevine moth, was first reported in the United ...",
    "damage": "In May and June, first-generation larvae web and feed on the flower clusters. Secon...",
    "identification": "The adult moth is approximately 0.24 to 0.3 inch (6-8 mm) long, with a wing...",
    "life_cycle": "European grapevine moth has two generations in its life cycle in northern Europ...",
    "monitoring": "Sex pheromone attracts males and is used to monitor male flights. Before bud br...",
    "management": "In countries where L. botrana is established, control measures are targeted at ...",
    "related_links": [
        {
        "text": "Grape pest management guidelines",
        "link": "http://ipm.ucanr.edu/PMG/selectnewpest.grapes.html?src=exchbt"
        },
        {
        "text": "Video presentation",
        "link": "http://stream.ucanr.org/ipm_ag_urban/evgm2011/?src=exchbt"
        }
    ],
    "images": [
        {
        "link": "http://ipm.ucanr.edu/PMG/L/I-LP-LBOT-AD.002.html?src=exchbt",
        "src": "http://ipm.ucanr.edu/PMG/IMAGES/L/I-LP-LBOT-AD.002h.jpg?src=exchbt",
        "caption": "Adult female European grapevine moth."
        },
        {
        "link": "http://ipm.ucanr.edu/PMG/L/I-LP-LBOT-CD.004.html?src=exchbt",
        "src": "http://ipm.ucanr.edu/PMG/IMAGES/L/I-LP-LBOT-CD.004h.jpg?src=exchbt",
        "caption": "Grape bunches with webbing, frass, and fungal infections."
        },
        ...
    ]
}
```

__Metadata on data source__

Information on exotic pests.

Notes:
* `description`, `damage`, `identification`, `life_cycle`, `monitoring`, `management` - main fields.
* `related_links/text`, `images/caption` - additional fields.
    
Out of the 15-20 sources, only 2 have the `description` field filled.

### Information - fruits
<a id='fruititems_newjson'></a>

In [None]:
# Read `fruitItems_new.json` into `df` and print the per-column summary.
FILE_NAME = 'fruitItems_new.json'
df = pd.read_json(_PATH / FILE_NAME)
df.info()

In [None]:
# Report every column in which fewer than half of the rows hold a value of length > 0.
print('Field with less than 50% non-null entries:')
for col in df.columns:
    # Fraction of rows whose entry is non-empty; computed once and reused below.
    filled_ratio = (df[col].str.len() > 0).sum() / len(df)
    if filled_ratio < 0.5:
        print(f'{col:<20} - {filled_ratio * 100:.0f}%')

In [None]:
# Show five random rows; the fixed seed keeps the displayed sample stable across re-runs.
df.sample(n=5, random_state=0)

__Fruits - `fruitItems_new.json`__

| column              | type                      |
|---------------------|---------------------------|
| name                | string                    |
| url                 | string                    |
| cultural_tips       | [{tip: "", link: ""}]     |
| pests_and_disorders | [{problem: "", link: ""}] |

Example of the single JSON data entry:
```json
{
    "name": "Figs",
    "url": "http://ipm.ucanr.edu/PMG/GARDEN/FRUIT/figs.html?src=exchbt",
    "cultural_tips": [
        {
            "tip": "Fertilizing",
            "link": "http://ipm.ucanr.edu/PMG/GARDEN/FRUIT/CULTURAL/fgfertilizing.html?src=exchbt"
        },
        {
            "tip": "First-year pruning",
            "link": "http://ipm.ucanr.edu/PMG/GARDEN/FRUIT/CULTURAL/almondfirst.html?src=exchbt"
        },
        ...
    ],
    "pests_and_disorders": [
        {
            "problem": "Ants",
            "link": "http://ipm.ucanr.edu/PMG/PESTNOTES/pn7411.html?src=exchbt"
        },
        {
            "problem": "Carpenterworm",
            "link": "http://ipm.ucanr.edu/PMG/PESTNOTES/pn74105.html?src=exchbt"
        },
        ...
    ]
}
```
__Metadata on data source__

Information on fruits. The name combined with cultural tips and problems can be used to search for the queries. The following slots can be used - `plant_name`, `problem`.

Notes:
* Concatenate `name` and `cultural_tips/tip` - additional field.
* Concatenate `name` and `pests_and_disorders/problem` - additional field.

### Environmental damages - fruits and veggies
<a id='fruitveggieenvironitems_newjson'></a>

In [None]:
# Read `fruitVeggieEnvironItems_new.json` into `df` and print the per-column summary.
FILE_NAME = 'fruitVeggieEnvironItems_new.json'
df = pd.read_json(_PATH / FILE_NAME)
df.info()

In [None]:
# Report every column in which fewer than half of the rows hold a value of length > 0.
print('Field with less than 50% non-null entries:')
for col in df.columns:
    # Fraction of rows whose entry is non-empty; computed once and reused below.
    filled_ratio = (df[col].str.len() > 0).sum() / len(df)
    if filled_ratio < 0.5:
        print(f'{col:<20} - {filled_ratio * 100:.0f}%')

In [None]:
# Show five random rows; the fixed seed keeps the displayed sample stable across re-runs.
df.sample(n=5, random_state=0)

In [None]:
# One histogram of entry lengths per text column, laid out on a 2x3 grid.
text_columns = ['name', 'description', 'identification', 'damage', 'disorder_development', 'solutions']
fig, axes = plt.subplots(2, 3, figsize = (30, 10))
# `axes.flat` walks the grid row by row, matching the (i // 3, i % 3) placement.
for panel, column in zip(axes.flat, text_columns):
    lengths = df[column].apply(len)
    lengths.hist(figure = fig, bins = 20, ax = panel)
    panel.set_title(column, fontdict = {'fontsize': 20})
plt.show()

__Environment Fruit and Veggie - `fruitVeggieEnvironItems_new.json`__

| column               | type                                  |
|----------------------|---------------------------------------|
| name                 | string                                |
| url                  | string                                |
| description          | string                                |
| identification       | string                                |
| damage               | string                                |
| disorder_development | string                                |
| solutions            | string                                |
| images               | [{link: " ", src: " ", caption: " "}] |

Example of the single JSON data entry:
```json
{
    "name": "Wind",
    "url": "http://ipm.ucanr.edu/PMG/GARDEN/ENVIRON/wind.html?src=exchbt",
    "description": "Wind can damage bark, flowers, foliage, fruit, and limbs of most any...",
    "identification": "Plants growing at windy sites often have smaller-than-normal leaves...",
    "damage": "Wind-damaged leaves become necrotic along the margins and tips and drop prema...",
    "disorder_development": "Wind commonly causes water deficit. If soil moisture is low, or w...",
    "solutions": "Provide plants with proper cultural care, especially appropriate irrigation...",
    "images": [
        {
            "link": "http://ipm.ucanr.edu/PMG/W/A-WO-WEAT-FS.002.html?src=exchbt",
            "src": "http://ipm.ucanr.edu/PMG/IMAGES/W/A-WO-WEAT-FS.002a.jpg?src=exchbt",
            "caption": "Wind-sculptured cypress"
        },
        {
            "link": "http://ipm.ucanr.edu/PMG/W/A-WO-WEAT-FO.041.html?src=exchbt",
            "src": "http://ipm.ucanr.edu/PMG/IMAGES/W/A-WO-WEAT-FO.041a.jpg?src=exchbt",
            "caption": "Necrotic, wind-tattered leaves"
        },
        ...
    ]
}
```

__Metadata on data source__

Environment-caused problems on veggies and fruits. `problem_description` can be used to find matches.

Notes:
* `name`, `description`, `identification`, `damage`, `disorder_development`, `solutions` - main fields
* `images/caption` - additional fields

### Pests - causing diseases
<a id='pestdiseaseitems_newjson'></a>

In [None]:
# Read `pestDiseaseItems_new.json` into `df` and print the per-column summary.
FILE_NAME = 'pestDiseaseItems_new.json'
df = pd.read_json(_PATH / FILE_NAME)
df.info()

In [None]:
# Report every column in which fewer than half of the rows hold a value of length > 0.
print('Field with less than 50% non-null entries:')
for col in df.columns:
    # Fraction of rows whose entry is non-empty; computed once and reused below.
    filled_ratio = (df[col].str.len() > 0).sum() / len(df)
    if filled_ratio < 0.5:
        print(f'{col:<20} - {filled_ratio * 100:.0f}%')

In [None]:
# Show five random rows; the fixed seed keeps the displayed sample stable across re-runs.
df.sample(n=5, random_state=0)

In [None]:
# One histogram of entry lengths per text column, laid out on a 2x3 grid.
text_columns = ['name', 'description', 'identification', 'life_cycle', 'damage', 'solutions']
fig, axes = plt.subplots(2, 3, figsize = (30, 10))
# `axes.flat` walks the grid row by row, matching the (i // 3, i % 3) placement.
for panel, column in zip(axes.flat, text_columns):
    lengths = df[column].apply(len)
    lengths.hist(figure = fig, bins = 20, ax = panel)
    panel.set_title(column, fontdict = {'fontsize': 20})
plt.show()

__Pest Diseases - `pestDiseaseItems_new.json`__

| column         | type                                  |
|----------------|---------------------------------------|
| name           | string                                |
| url            | string                                |
| description    | string                                |
| identification | string                                |
| life_cycle     | string                                |
| damage         | string                                |
| solutions      | string                                |
| images         | [{link: " ", src: " ", caption: " "}] |


Example of the single JSON data entry:

```json
{
    "name": "Stink bugs",
    "url": "http://ipm.ucanr.edu/PMG/GARDEN/VEGES/PESTS/stinkbug.html?src=exchbt",
    "description": "These sucking insects (family Pentatomidae) are shield shaped. Their common ...",
    "identification": "Adult stink bugs are distinguished from other insects by the large ...",
    "life_cycle": "Stink bugs develop through three life stages: egg, nymph, and adult. ...",
    "damage": "Adults suck and feed on plants with their strawlike mouthparts. Stink bugs ...",
    "solutions": "Handpick the bugs and their eggs from small plants. Eliminate groundcovers...",
    "images": [
        {
            "link": "http://ipm.ucanr.edu/PMG/E/I-HM-ECON-AD.015.html?src=exchbt",
            "src": "http://ipm.ucanr.edu/PMG/IMAGES/E/I-HM-ECON-AD.015a.jpg?src=exchbt",
            "caption": "Adults of southern green stink bug (left), redshouldered stink bug, (upper right) and consperse stink bug."
        },
        {
            "link": "http://ipm.ucanr.edu/PMG/E/I-HM-ECON-EG.001.html?src=exchbt",
            "src": "http://ipm.ucanr.edu/PMG/IMAGES/E/I-HM-ECON-EG.001b.jpg?src=exchbt",
            "caption": "First instars of consperse stink bug and their empty egg cases."
        },
        ...
    ]
},
```

__Metadata on data source__

Information on pests.

### Information - flowers
<a id='plantfloweritemsjson'></a>

In [None]:
# Read `plantFlowerItems.json` into `df` and print the per-column summary.
FILE_NAME = 'plantFlowerItems.json'
df = pd.read_json(_PATH / FILE_NAME)
df.info()

In [None]:
# Report every column in which fewer than half of the rows hold a value of length > 0.
print('Field with less than 50% non-null entries:')
for col in df.columns:
    # Fraction of rows whose entry is non-empty; computed once and reused below.
    filled_ratio = (df[col].str.len() > 0).sum() / len(df)
    if filled_ratio < 0.5:
        print(f'{col:<20} - {filled_ratio * 100:.0f}%')

In [None]:
# Show five random rows; the fixed seed keeps the displayed sample stable across re-runs.
df.sample(n=5, random_state=0)

In [None]:
# One histogram of entry lengths per text column, side by side in a single row.
text_columns = ['name', 'identification', 'optimum_conditions']
fig, axes = plt.subplots(1, 3, figsize = (30, 5))
for panel, column in zip(axes, text_columns):
    lengths = df[column].apply(len)
    lengths.hist(figure = fig, bins = 20, ax = panel)
    panel.set_title(column, fontdict = {'fontsize': 20})
plt.show()

__Flowers - `plantFlowerItems.json`__

| column              | type                                  |
|---------------------|---------------------------------------|
| name                | string                                |
| url                 | string                                |
| identification      | string                                |
| optimum_conditions  | string                                |
| pests_and_disorders | [{problem: " ", link: " "}]           |
| images              | [{link: " ", src: " ", caption: " "}] |

Example of the single JSON data entry:
```json
{
    "name": "Abelia",
    "url": "http://ipm.ucanr.edu/PMG/GARDEN/PLANTS/abelia.html?src=exchbt",
    "identification": "Abelias are evergreen or semievergreen deciduous shrubs. Leaves are ...",
    "optimum_conditions": "Abelias can be planted as borders or barriers. Varieties that ...",
    "pests_and_disorders": [
        {
            "problem": "Root knot nematodes",
            "link": "http://ipm.ucanr.edu/PMG/PESTNOTES/pn7489.html?src=exchbt"
        },
        {
            "problem": "Mineral deficiencies",
            "link": "http://ipm.ucanr.edu/PMG/GARDEN/ENVIRON/mineraldef.html?src=exchbt"
        },
        ...
    ],
    "images": [
        {
            "link": "http://ipm.ucanr.edu/PMG/C/S-WO-CAPR-FL.017.html?src=exchbt",
            "src": "http://ipm.ucanr.edu/PMG/IMAGES/C/S-WO-CAPR-FL.017a.jpg?src=exchbt",
            "caption": "Abelia flowers"
        },
        {
            "link": "http://ipm.ucanr.edu/PMG/C/S-WO-CAPR-FO.011.html?src=exchbt",
            "src": "http://ipm.ucanr.edu/PMG/IMAGES/C/S-WO-CAPR-FO.011a.jpg?src=exchbt",
            "caption": "LEaves of Abelia"
        }
    ]
}
```
__Metadata on data source__

Description of the flowers.

* `name` and `pests_and_disorders` - main fields
* `images/caption` - additional fields


### Pests - turf (grass)
<a id='turfpestsjson'></a>

In [None]:
# Read `turfPests.json` into `df` and print the per-column summary.
FILE_NAME = 'turfPests.json'
df = pd.read_json(_PATH / FILE_NAME)
df.info()

In [None]:
# Report every column in which fewer than half of the rows hold a value of length > 0.
print('Field with less than 50% non-null entries:')
for col in df.columns:
    # Fraction of rows whose entry is non-empty; computed once and reused below.
    filled_ratio = (df[col].str.len() > 0).sum() / len(df)
    if filled_ratio < 0.5:
        print(f'{col:<20} - {filled_ratio * 100:.0f}%')

In [None]:
# Show five random rows; the fixed seed keeps the displayed sample stable across re-runs.
df.sample(n=5, random_state=0)

In [None]:
# One histogram of entry lengths per text column, side by side in a single row.
text_columns = ['name', 'text']
fig, axes = plt.subplots(1, 2, figsize = (30, 5))
for panel, column in zip(axes, text_columns):
    lengths = df[column].apply(len)
    lengths.hist(figure = fig, bins = 20, ax = panel)
    panel.set_title(column, fontdict = {'fontsize': 20})
plt.show()

__Turf Pests - `turfPests.json`__

| column | type                                  |
|--------|---------------------------------------|
| name   | string                                |
| url    | string                                |
| text   | string                                |
| images | [{link: " ", src: " ", caption: " "}] |

Example of the single JSON data entry:

```json
{
    "name": "Fiery skipper",
    "url": "http://ipm.ucanr.edu/TOOLS/TURF/PESTS/inskipper.html?src=exchbt",
    "text": "Identification Fiery skipper adults resemble butterflies and are 1 inch...",
    "images": [
        {
            "link": "http://ipm.ucanr.edu/PMG/H/I-LP-HPHY-AD.003.html?src=exchbt",
            "src": "http://ipm.ucanr.edu/TOOLS/TURF/IMAGES/PESTMANIM/infieryad.jpg?src=exchbt",
            "caption": "Fiery skipper adult"
        },
        {
            "link": "http://ipm.ucanr.edu/PMG/H/I-LP-HPHY-LV.009.html?src=exchbt",
            "src": "http://ipm.ucanr.edu/TOOLS/TURF/IMAGES/PESTMANIM/inskiplv.jpg?src=exchbt",
            "caption": "Skipper larva with dark head and thoracic shield"
        },
        ...
    ]
},
```
__Metadata on data source__

Information on turf pests (grass).

### Information - veggies
<a id='veggieitems_newjson'></a>

In [None]:
# Read `veggieItems_new.json` into `df` and print the per-column summary.
FILE_NAME = 'veggieItems_new.json'
df = pd.read_json(_PATH / FILE_NAME)
df.info()

In [None]:
# Report every column in which fewer than half of the rows hold a value of length > 0.
print('Field with less than 50% non-null entries:')
for col in df.columns:
    # Fraction of rows whose entry is non-empty; computed once and reused below.
    filled_ratio = (df[col].str.len() > 0).sum() / len(df)
    if filled_ratio < 0.5:
        print(f'{col:<20} - {filled_ratio * 100:.0f}%')

In [None]:
# Show five random rows; the fixed seed keeps the displayed sample stable across re-runs.
df.sample(n=5, random_state=0)

In [None]:
# One histogram of entry lengths per text column, side by side in a single row.
text_columns = ['name', 'description', 'tips']
fig, axes = plt.subplots(1, 3, figsize = (30, 5))
for panel, column in zip(axes, text_columns):
    lengths = df[column].apply(len)
    lengths.hist(figure = fig, bins = 20, ax = panel)
    panel.set_title(column, fontdict = {'fontsize': 20})
plt.show()

__Veggie - `veggieItems_new.json`__

| column              | type                                  |
|---------------------|---------------------------------------|
| name                | string                                |
| url                 | string                                |
| description         | string                                |
| tips                | string                                |
| images              | [{link: " ", src: " ", caption: " "}] |
| pests_and_disorders | [{problem: "", link: ""}]             |

Example of the single JSON data entry:
```json
{
    "name": "Carrot ",
    "url": "http://ipm.ucanr.edu/home-and-landscape/carrot/index.html?src=exchbt",
    "description": "Carrots can be grown nearly year-round throughout California. A number of vari...",
    "tips": "Cultural practices such as proper site selection, soil preparation, planting, and wat...",
    "images": [
        {
            "link": "http://ipm.ucanr.edu/PMG/P/D-CA-PVIO-RO.002.html?src=exchbt",
            "src": "http://ipm.ucanr.edu/PMG/IMAGES/P/D-CA-PVIO-RO.002.jpg?src=exchbt",
            "caption": "Cavity spot-induced, irregularly shaped lesions across mature carrot tap roots."
        },
        ...
    ],
    "pests_and_disorders": [
        {
            "problem": "Aphids",
            "link": "http://ipm.ucanr.edu/PMG/PESTNOTES/pn7404.html?src=exchbt"
        },
        {
            "problem": "Carrot rust fly",
            "link": "http://ipm.ucanr.edu/PMG/GARDEN/VEGES/PESTS/carrotrustfly.html?src=exchbt"
        },
        ...
    ]
}
```

__Metadata on data source__

Information on vegetables. `plant_name`, and `problem` can be used to search this data source.

Notes:
* `name`, `description`, `tips` - main fields.
* `images/caption` - additional field.
* concatenate `name` + `pests_and_disorders` -  additional field.


### Environmental damages - weeds
<a id='weeditemsjson'></a>

In [None]:
# Read `weedItems.json` into `df` and print the per-column summary.
FILE_NAME = 'weedItems.json'
df = pd.read_json(_PATH / FILE_NAME)
df.info()

In [None]:
# Report every column in which fewer than half of the rows hold a value of length > 0.
print('Field with less than 50% non-null entries:')
for col in df.columns:
    # Fraction of rows whose entry is non-empty; computed once and reused below.
    filled_ratio = (df[col].str.len() > 0).sum() / len(df)
    if filled_ratio < 0.5:
        print(f'{col:<20} - {filled_ratio * 100:.0f}%')

In [None]:
# Show five random rows; the fixed seed keeps the displayed sample stable across re-runs.
df.sample(n=5, random_state=0)

In [None]:
# One histogram of entry lengths per text column, side by side in a single row.
text_columns = ['name', 'description']
fig, axes = plt.subplots(1, 2, figsize = (30, 5))
for panel, column in zip(axes, text_columns):
    lengths = df[column].apply(len)
    lengths.hist(figure = fig, bins = 20, ax = panel)
    panel.set_title(column, fontdict = {'fontsize': 20})
plt.show()

__Weed - `weedItems.json`__

| column      | type                        |
|-------------|-----------------------------|
| name        | string                      |
| url         | string                      |
| description | string                      |
| images      | [{link: " ", caption: " "}] |

Example of the single JSON data entry:
```json
{
    "name": "Gregg arrowhead",
    "url": "http://ipm.ucanr.edu/PMG/WEEDS/gregg_arrowhead.html?src=exchbt",
    "description": "Gregg arrowhead is a native aquatic perennial that occurs in the...",
    "images": [
        {
            "link": "http://ipm.ucanr.edu/PMG/S/W-AL-SMON-MP.003.html?src=exchbt",
            "caption": "top picture"
        },
        {
            "link": "http://ipm.ucanr.edu/PMG/S/W-AL-SLON-SG.001.html?src=exchbt",
            "caption": "bottom left picture"
        },
        ...
    ]
}
```

__Metadata on data source__

Description of the weed (problematic). Weed is any plant growing in cultivated ground to the injury of the crop or desired vegetation.

### Final transformations

Each data source will be mapped to the following structure:
```json
{
    # mandatory fields
    "url"       : "url",                            # Main URL
    "source"    : "ucipm|aekb|okstate|orstate",     # Source Dataset
    "title"     : "title",                          # Title of data item
    ...
    # other fields
    ...
    "vectors"   : {
        "name"  : "field_name_and_index",           # Name of the field
        "start" : "number",                         # Start index within text
        "end"   : "number",                         # End index within text
        "vector": "dense_vector",                   # Embedding vector
    }
}
```

The title itself will also be encoded into a vector field.

There is going to be single merged index consisting of the following:

__`exoticPests.json`__
```json
{
    "source": "ucipm",
    "name": "European Grapevine Moth",                                                                          # title field
    "url": "https://www2.ipm.ucanr.edu/Invasive-and-Exotic-Pests/European-grapevine-moth/?src=exchbt",          # url field
    "description": "Lobesia botrana, the European grapevine moth, was first reported in the United...",         # text field name - description
    "damage": "In May and June, first-generation larvae web and feed on the flower clusters. Secon...",         # text field name - damage
    "identification": "The adult moth is approximately 0.24 to 0.3 inch (6-8 mm) long, with a wing...",         # text field name - identification
    "life_cycle": "European grapevine moth has two generations in its life cycle in northern Europ...",         # text field name - life_cycle
    "monitoring": "Sex pheromone attracts males and is used to monitor male flights. Before bud br...",         # text field name - monitoring
    "management": "In countries where L. botrana is established, control measures are targeted at ...",         # text field name - management
    "related_links": [                                                                                          # text field name - related_links.title, vector - title + related_links.title
        {
        "text": "Grape pest management guidelines",                                                             # rename to title
        "link": "http://ipm.ucanr.edu/PMG/selectnewpest.grapes.html?src=exchbt"
        },
        {
        "text": "Video presentation",
        "link": "http://stream.ucanr.org/ipm_ag_urban/evgm2011/?src=exchbt"
        },
        ...
        ],
    "images": [                                                                                                 # text field name - images.title, vector - title + images.title
        {
        "link": "http://ipm.ucanr.edu/PMG/L/I-LP-LBOT-AD.002.html?src=exchbt",
        "src": "http://ipm.ucanr.edu/PMG/IMAGES/L/I-LP-LBOT-AD.002h.jpg?src=exchbt",
        "caption": "Adult female European grapevine moth."                                                      # rename to title
        },
        {
        "link": "http://ipm.ucanr.edu/PMG/L/I-LP-LBOT-CD.004.html?src=exchbt",
        "src": "http://ipm.ucanr.edu/PMG/IMAGES/L/I-LP-LBOT-CD.004h.jpg?src=exchbt",
        "caption": "Grape bunches with webbing, frass, and fungal infections."
        },
        ...
    ]
}
```

__`fruitItems_new.json`__
```json
{
    "source": "ucipm",
    "name": "Figs",                                                                                             # title field - title
    "url": "http://ipm.ucanr.edu/PMG/GARDEN/FRUIT/figs.html?src=exchbt",                                        # url field - url
    "cultural_tips": [                                                                                          # text field name - cultural_tips.title, vector - title + cultural_tips.title
        {
            "tip": "Fertilizing",                                                                               # rename to title
            "link": "http://ipm.ucanr.edu/PMG/GARDEN/FRUIT/CULTURAL/fgfertilizing.html?src=exchbt"
        },
        {
            "tip": "First-year pruning",
            "link": "http://ipm.ucanr.edu/PMG/GARDEN/FRUIT/CULTURAL/almondfirst.html?src=exchbt"
        },
        ...
    ],
    "pests_and_disorders": [                                                                                    # text field name - pests_and_disorders.title, vector - title + pests_and_disorders.title
        {
            "problem": "Ants",                                                                                  # rename to title
            "link": "http://ipm.ucanr.edu/PMG/PESTNOTES/pn7411.html?src=exchbt"
        },
        {
            "problem": "Carpenterworm",
            "link": "http://ipm.ucanr.edu/PMG/PESTNOTES/pn74105.html?src=exchbt"
        },
        ...
    ]
}
```

__`fruitVeggieEnvironItems_new.json`__
```json
{
    "source": "ucipm",
    "name": "Wind",                                                                                             # title field
    "url": "http://ipm.ucanr.edu/PMG/GARDEN/ENVIRON/wind.html?src=exchbt",                                      # url field
    "description": "Wind can damage bark, flowers, foliage, fruit, and limbs of most any...",                   # text field name - description
    "identification": "Plants growing at windy sites often have smaller-than-normal leav...",                   # text field name - identification
    "damage": "Wind-damaged leaves become necrotic along the margins and tips and drop p...",                   # text field name - damage
    "disorder_development": "Wind commonly causes water deficit. If soil moisture is low...",                   # text field name - disorder_development
    "solutions": "Provide plants with proper cultural care, especially appropriate irrig...",                   # text field name - solutions
    "images": [                                                                                                 # text field name - images.title, vector - title + images.title
        {
            "link": "http://ipm.ucanr.edu/PMG/W/A-WO-WEAT-FS.002.html?src=exchbt",
            "src": "http://ipm.ucanr.edu/PMG/IMAGES/W/A-WO-WEAT-FS.002a.jpg?src=exchbt",
            "caption": "Wind-sculptured cypress"                                                                # rename to title
        },
        {
            "link": "http://ipm.ucanr.edu/PMG/W/A-WO-WEAT-FO.041.html?src=exchbt",
            "src": "http://ipm.ucanr.edu/PMG/IMAGES/W/A-WO-WEAT-FO.041a.jpg?src=exchbt",
            "caption": "Necrotic, wind-tattered leaves"
        },
        ...
    ]
},
```

__`pestDiseaseItems_new.json`__
```json
{
    "source": "ucipm",
    "name": "Stink bugs",                                                                                       # title field
    "url": "http://ipm.ucanr.edu/PMG/GARDEN/VEGES/PESTS/stinkbug.html?src=exchbt",                              # url field
    "description": "These sucking insects (family Pentatomidae) are shield shaped. The...",                     # text field name - description
    "identification": "Adult stink bugs are distinguished from other insects by the la...",                     # text field name - identification
    "life_cycle": "Stink bugs develop through three life stages: egg, nymph, and adult...",                     # text field name - life_cycle
    "damage": "Adults suck and feed on plants with their strawlike mouthparts. Stink b...",                     # text field name - damage
    "solutions": "Handpick the bugs and their eggs from small plants. Eliminate ground...",                     # text field name - solutions
    "images": [                                                                                                 # text field name - images.title, vector - title + images.title
        {
            "link": "http://ipm.ucanr.edu/PMG/E/I-HM-ECON-AD.015.html?src=exchbt",
            "src": "http://ipm.ucanr.edu/PMG/IMAGES/E/I-HM-ECON-AD.015a.jpg?src=exchbt",
            "caption": "Adults of southern green stink bug (left), redshouldered stink..."                      # rename to title
        },
        {
            "link": "http://ipm.ucanr.edu/PMG/E/I-HM-ECON-EG.001.html?src=exchbt",
            "src": "http://ipm.ucanr.edu/PMG/IMAGES/E/I-HM-ECON-EG.001b.jpg?src=exchbt",
            "caption": "First instars of consperse stink bug and their empty egg cases."
        },
        ...
    ]
}
```

__`plantFlowerItems.json`__
```json
{
    "source": "ucipm",
    "name": "Abelia",                                                                                           # title field
    "url": "http://ipm.ucanr.edu/PMG/GARDEN/PLANTS/abelia.html?src=exchbt",                                     # url field
    "identification": "Abelias are evergreen or semievergreen deciduous shrubs. Leaves are ...",                # text field name - identification
    "optimum_conditions": "Abelias can be planted as borders or barriers. Varieties that ...",                  # text field name - optimum_conditions
    "pests_and_disorders": [                                                                                    # text field name - pests_and_disorders.title, vector - title + pests_and_disorders.title
        {
            "problem": "Root knot nematodes",                                                                   # rename to title
            "link": "http://ipm.ucanr.edu/PMG/PESTNOTES/pn7489.html?src=exchbt"
        },
        {
            "problem": "Mineral deficiencies",
            "link": "http://ipm.ucanr.edu/PMG/GARDEN/ENVIRON/mineraldef.html?src=exchbt"
        },
        ...
    ],
    "images": [                                                                                                 # text field name - images.title, vector - title + images.title
        {
            "link": "http://ipm.ucanr.edu/PMG/C/S-WO-CAPR-FL.017.html?src=exchbt",
            "src": "http://ipm.ucanr.edu/PMG/IMAGES/C/S-WO-CAPR-FL.017a.jpg?src=exchbt",
            "caption": "Abelia flowers"                                                                         # rename to title
        },
        {
            "link": "http://ipm.ucanr.edu/PMG/C/S-WO-CAPR-FO.011.html?src=exchbt",
            "src": "http://ipm.ucanr.edu/PMG/IMAGES/C/S-WO-CAPR-FO.011a.jpg?src=exchbt",
            "caption": "LEaves of Abelia"
        }
    ]
}
```


__`turfPests.json`__
```json
{
    "source": "ucipm",
    "name": "Fiery skipper",                                                                                    # title field
    "url": "http://ipm.ucanr.edu/TOOLS/TURF/PESTS/inskipper.html?src=exchbt",                                   # url field
    "text": "Identification Fiery skipper adults resemble butterflies and are 1 inch...",                       # text field name - description
    "images": [                                                                                                 # text field name - images.title, vector - title + images.title
        {
            "link": "http://ipm.ucanr.edu/PMG/H/I-LP-HPHY-AD.003.html?src=exchbt",
            "src": "http://ipm.ucanr.edu/TOOLS/TURF/IMAGES/PESTMANIM/infieryad.jpg?src=exchbt",
            "caption": "Fiery skipper adult"                                                                    # rename to title
        },
        {
            "link": "http://ipm.ucanr.edu/PMG/H/I-LP-HPHY-LV.009.html?src=exchbt",
            "src": "http://ipm.ucanr.edu/TOOLS/TURF/IMAGES/PESTMANIM/inskiplv.jpg?src=exchbt",
            "caption": "Skipper larva with dark head and thoracic shield"
        },
        ...
    ]
}
```


__`veggieItems_new.json`__
```json
{
    "source": "ucipm",
    "name": "Carrot ",                                                                                          # title field
    "url": "http://ipm.ucanr.edu/home-and-landscape/carrot/index.html?src=exchbt",                              # url field
    "description": "Carrots can be grown nearly year-round throughout California. A number of vari...",         # text field name - description
    "tips": "Cultural practices such as proper site selection, soil preparation, planting, and wat...",         # text field name - tips
    "images": [                                                                                                 # text field name - images.title, vector - title + images.title
        {
            "link": "http://ipm.ucanr.edu/PMG/P/D-CA-PVIO-RO.002.html?src=exchbt",
            "src": "http://ipm.ucanr.edu/PMG/IMAGES/P/D-CA-PVIO-RO.002.jpg?src=exchbt",
            "caption": "Cavity spot-induced, irregularly shaped lesions across mature carrot tap r..."          # rename to title
        },
        ...
    ],
    "pests_and_disorders": [                                                                                    # text field name - pests_and_disorders.title, vector - title + pests_and_disorders.title
        {
            "problem": "Aphids",                                                                                # rename to title
            "link": "http://ipm.ucanr.edu/PMG/PESTNOTES/pn7404.html?src=exchbt"
        },
        {
            "problem": "Carrot rust fly",
            "link": "http://ipm.ucanr.edu/PMG/GARDEN/VEGES/PESTS/carrotrustfly.html?src=exchbt"
        },
        ...
    ]
}
```


__`weedItems.json`__
```json
{
    "source": "ucipm",
    "name": "Gregg arrowhead",                                                                                  # title field
    "url": "http://ipm.ucanr.edu/PMG/WEEDS/gregg_arrowhead.html?src=exchbt",                                    # url field
    "description": "Gregg arrowhead is a native aquatic perennial that occurs in the...",                       # text field name - description
    "images": [                                                                                                 # text field name - images.title, vector title + images.title
        {
            "link": "http://ipm.ucanr.edu/PMG/S/W-AL-SMON-MP.003.html?src=exchbt",
            "caption": "top picture"                                                                            # rename to title
        },
        {
            "link": "http://ipm.ucanr.edu/PMG/S/W-AL-SLON-SG.001.html?src=exchbt",
            "caption": "bottom left picture"
        },
        ...
    ]
}
```

__Final mapping__
```json
{
    # mandatory fields
    "url"       : "url",                            # Main URL
    "source"    : "ucipm|aekb|okstate|orstate",     # Source Dataset
    "title"     : "title",                          # Title of data item
    ...
    # other fields
    ...
    "vectors"   : {
        "name"  : "field_name_and_index",           # Name of the field
        "start" : "number",                         # Start index within text
        "end"   : "number",                         # End index within text
        "vector": "dense_vector",                   # Embedding vector
    }
}
```

## UC IPM data - April 2022 Scrape

In [None]:
# Point the notebook at the April 2022 scrape directory and collect its
# entries in sorted order; the last expression displays their file names.
_PATH = Path('../data/uc-ipm/scrape_cleaned_Apr2022/')
DATA_FILE_NAMES = list(_PATH.iterdir())
DATA_FILE_NAMES.sort()
[entry.name for entry in DATA_FILE_NAMES]

The list of files should be as follows:
```python
['FruitVegCulturalItems.json',
 'GardenControlsPestItems.json',
 'GardenControlsPesticideItems.json',
 'PestNotes.json',
 'QuickTips.json',
 'Videos.json',
 'WeedIdItems.json']
```

The corresponding EDA for these sources (links):
* [`FruitVegCulturalItems.json`](#fruitvegculturalitemsjson)
* [`GardenControlsPestItems.json`](#gardercontolspestitemsjson)
* [`GardenControlsPesticideItems.json`](#gardencontrolspesticideitemsjson)
* [`PestNotes.json`](#pestnotesjson)
* [`QuickTips.json`](#quicktipsjson)
* [`Videos.json`](#videosjson)
* [`WeedIdItems.json`](#weediditemsjson)

### Fruit and veggie cultural items
<a id='fruitvegculturalitemsjson'></a>

In [None]:
# Read `FruitVegCulturalItems.json` from the current scrape directory into
# `df` and print the per-column summary.
FILE_NAME = 'FruitVegCulturalItems.json'
df = pd.read_json(_PATH / FILE_NAME)
df.info()

In [None]:
# Report columns where fewer than half of the rows hold a value whose
# length is greater than zero.
print('Field with less than 50% non-null entries:')
for column in df.columns:
    filled_rows = df[df[column].str.len() > 0]
    filled_share = filled_rows.shape[0] / df[column].shape[0]
    if filled_share < 0.5:
        print(f'{column:<20} - {filled_share * 100:.0f}%')

In [None]:
# Display five randomly drawn rows; no random_state is passed, so the rows differ between runs.
df.sample(5)

In [None]:
# One histogram per column of `len()` applied to every entry of that column.
text_columns = ['name','description', 'images', 'tips_table']
fig, axes = plt.subplots(1, 4, figsize = (30, 5))
for ax, col in zip(axes, text_columns):
    entry_lengths = df[col].apply(len)
    entry_lengths.hist(figure = fig, bins = 30, ax = ax)
    ax.set_title(col, fontdict = {'fontsize': 20})
plt.show()

__IPM Data - `FruitVegCulturalItems.json`__

| column      | type                                       |
|-------------|--------------------------------------------|
| name        | string                                     |
| url         | string                                     |
| description | string                                     |
| images      | [{src: " ", caption: " "}]                 |
| tips_table  | [{header: " ", row: " "(, row: " ", ...)}] |

Example of the single JSON data entry:
```json
{
    "name": "Planting cucurbits",
    "url": "http://ipm.ucanr.edu/PMG/GARDEN/VEGES/CULTURAL/cantaloupeplant.html?src=exchbt",
    "description": "Cucurbits can be seeded directly or transplanted into the garden. It is ...",
    "images": [
        {
            "src": "http://ipm.ucanr.edu/PMG/GARDEN/IMAGES/CULTURAL/transplant.jpg?src=exchbt",
            "caption": "Transplanting"
        }
    ],
    "tips_table": [
        {
            "header": "Planting tips"
        },
        {
            "row": "Distance in inches "
        },
        {
            "row": "Between plants in rows Between rows "
        },
        {
            "row": "Cucumbers 24 48 "
        },
        {
            "row": "Melons 12 72 "
        },
        {
            "row": "Pumpkins 48 72 "
        },
        {
            "row": "Squash 48 48-72"
        }
    ]
}
```
__Metadata on data source__

Information on fruits and veggies for cultivation purposes.

### Garden control - pest related items
<a id='gardercontolspestitemsjson'></a>

In [None]:
# Read `GardenControlsPestItems.json` from the current scrape directory into
# `df` and print the per-column summary.
FILE_NAME = 'GardenControlsPestItems.json'
df = pd.read_json(_PATH / FILE_NAME)
df.info()

In [None]:
# Report columns where fewer than half of the rows hold a value whose
# length is greater than zero.
print('Field with less than 50% non-null entries:')
for column in df.columns:
    filled_rows = df[df[column].str.len() > 0]
    filled_share = filled_rows.shape[0] / df[column].shape[0]
    if filled_share < 0.5:
        print(f'{column:<20} - {filled_share * 100:.0f}%')

In [None]:
# Display five randomly drawn rows; no random_state is passed, so the rows differ between runs.
df.sample(5)

In [None]:
# One histogram per column of `len()` applied to every entry of that column.
text_columns = ['name','description', 'images']
fig, axes = plt.subplots(1, 3, figsize = (30, 5))
for ax, col in zip(axes, text_columns):
    entry_lengths = df[col].apply(len)
    entry_lengths.hist(figure = fig, bins = 30, ax = ax)
    ax.set_title(col, fontdict = {'fontsize': 20})
plt.show()

__IPM Data - `GardenControlsPestItems.json`__

| column      | type                                       |
|-------------|--------------------------------------------|
| name        | string                                     |
| url         | string                                     |
| description | string                                     |
| images      | [{link: " ", src: " ", caption: " "}]      |

Example of the single JSON data entry:
```json
{
    "name": "Parasites",
    "url": "http://ipm.ucanr.edu/PMG/GARDEN/CONTROLS/parasites.html?src=exchbt",
    "description": "Insect parasites (parasitoids) are smaller than their hosts and develop inside, or attached to the ...",
    "images": [
        {
            "link": "http://ipm.ucanr.edu/PMG/S/I-LP-SCON-AS.001.html?src=exchbt",
            "src": "http://ipm.ucanr.edu/PMG/IMAGES/S/I-LP-SCON-AS.001a.jpg?src=exchbt",
            "caption": "Apanteles cocoons"
        },
        {
            "link": "",
            "src": "http://ipm.ucanr.edu/PMG/IMAGES/PESTICIDES/parasitelifecycle.jpg?src=exchbt",
            "caption": "Life cycle of a Hyposoter parasite"
        },
        {
            "link": "http://ipm.ucanr.edu/PMG/S/I-LP-SCON-HF.002.html?src=exchbt",
            "src": "http://ipm.ucanr.edu/PMG/IMAGES/S/I-LP-SCON-HF.002a.jpg?src=exchbt",
            "caption": "Redhumped caterpillars parasitized by Hyposoter wasps"
        }
    ]
}
```
__Metadata on data source__

Information on pest control remedies from pest perspective.

### Garden control - pesticide related items
<a id='gardencontrolspesticideitemsjson'></a>

In [None]:
# Read `GardenControlsPesticideItems.json` from the current scrape directory
# into `df` and print the per-column summary.
FILE_NAME = 'GardenControlsPesticideItems.json'
df = pd.read_json(_PATH / FILE_NAME)
df.info()

In [None]:
# Report columns where fewer than half of the rows hold a value whose
# length is greater than zero.
print('Field with less than 50% non-null entries:')
for column in df.columns:
    filled_rows = df[df[column].str.len() > 0]
    filled_share = filled_rows.shape[0] / df[column].shape[0]
    if filled_share < 0.5:
        print(f'{column:<20} - {filled_share * 100:.0f}%')

In [None]:
# Display five randomly drawn rows; no random_state is passed, so the rows differ between runs.
df.sample(5)

In [None]:
# One histogram per column of `len()` applied to every entry of that column.
text_columns = ['active_ingredient','pesticide_type', 'information']
fig, axes = plt.subplots(1, 3, figsize = (30, 5))
for ax, col in zip(axes, text_columns):
    entry_lengths = df[col].apply(len)
    entry_lengths.hist(figure = fig, bins = 30, ax = ax)
    ax.set_title(col, fontdict = {'fontsize': 20})
plt.show()

In [None]:
# Length histograms for the sub-fields of the first record stored in each
# row's `information` list, laid out on a 2x3 grid (filled row by row).
information_columns = ["acute_toxicity", "long_term_toxicity", "water_quality_rating", "impact_on_natural_enemies", "impact_on_honey_bees", "associated_pests"]
fig, axes = plt.subplots(2, 3, figsize = (30, 10))
for ax, col in zip(axes.flat, information_columns):
    field_lengths = df['information'].apply(lambda records: len(records[0][col]))
    field_lengths.hist(figure = fig, bins = 30, ax = ax)
    ax.set_title(col, fontdict = {'fontsize': 20})
plt.show()

__IPM Data - `GardenControlsPesticideItems.json`__

| column            | type                                                                                                                                                          |
|-------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------|
| active_ingredient | string                                                                                                                                                        |
| url               | string                                                                                                                                                        |
| pesticide_type    | string                                                                                                                                                        |
| information       | [{acute_toxicity: " ", long_term_toxicity: " ", water_quality_rating: " ", impact_on_natural_enemies: " ", impact_on_honey_bees: " ", associated_pests: " "}] |

Example of the single JSON data entry:
```json
{
    "active_ingredient": "Potassium bicarbonate",
    "url": "http://ipm.ucanr.edu/TOOLS/PNAI/pnaishow.php?id=60?src=exchbt",
    "pesticide_type": "fungicide",
    "information": [
        {
            "acute_toxicity": "Toxicity rating: No information",
            "long_term_toxicity": "On US EPA list: Not listed; On CA Proposition list: Not listed",
            "water_quality_rating": "Overall runoff risk rating: No information Notes: No information available",
            "impact_on_natural_enemies": "Overall toxicity rating: No information",
            "impact_on_honey_bees": "Toxicity category: No information",
            "associated_pests": "Powdery Mildew on Ornamentals, Roses in the Garden and Landscape: Diseases and Abiotic Disorders, powdery mildew"
        }
    ]
}
```
__Metadata on data source__

Information on pesticide control remedies with details.

### Pest notes from UC IPM
<a id='pestnotesjson'></a>

In [None]:
# Read `PestNotes.json` from the current scrape directory into `df` and
# print the per-column summary.
FILE_NAME = 'PestNotes.json'
df = pd.read_json(_PATH / FILE_NAME)
df.info()

In [None]:
# Report columns where fewer than half of the rows hold a value whose
# length is greater than zero.
print('Field with less than 50% non-null entries:')
for column in df.columns:
    filled_rows = df[df[column].str.len() > 0]
    filled_share = filled_rows.shape[0] / df[column].shape[0]
    if filled_share < 0.5:
        print(f'{column:<20} - {filled_share * 100:.0f}%')

In [None]:
# Display five randomly drawn rows; no random_state is passed, so the rows differ between runs.
df.sample(5)

In [None]:
# One histogram per column of `len()` applied to every entry of that column.
text_columns = ['name','descriptionPestNote', 'lifecyclePestNote', 'damagePestNote', 'managementPestNote', 'imagePestNote', 'tablePestNote']

# Seven columns are drawn on a 2x4 grid, which leaves one panel without data.
fig, axes = plt.subplots(2, 4, figsize = (30, 10))
for i, col in enumerate(text_columns):
    r = i//4
    c = i%4
    df[col].apply(len).hist(figure = fig, bins = 20, ax = axes[r][c])
    axes[r][c].set_title(col, fontdict = {'fontsize': 20})
# Hide the panels that received no histogram so the figure does not show
# an empty frame.
for unused_ax in axes.ravel()[len(text_columns):]:
    unused_ax.set_visible(False)
plt.show()

__IPM Data - `PestNotes.json`__

| column              | type                                  |
|---------------------|---------------------------------------|
| name                | string                                |
| urlPestNote         | string                                |
| descriptionPestNote | string                                |
| lifecyclePestNote   | string                                |
| damagePestNote      | string                                |
| managementPestNote  | string                                |
| imagePestNote       | [{link: " ", src: " ", caption: " "}] |
| tablePestNote       | [" ", " "]                            |

Example of the single JSON data entry:
```json
{
    "name": "Aphids",
    "urlPestNote": "http://ipm.ucanr.edu/PMG/PESTNOTES/pn7404.html?src=exchbt",
    "descriptionPestNote": "Aphids are small, soft-bodied insects with long slender mouthparts that they ...",
    "lifecyclePestNote": "Aphids have soft pear-shaped bodies with long legs and antennae and may be green, ...",
    "damagePestNote": "Low to moderate numbers of leaf-feeding aphids aren't usually damaging in gardens ...",
    "managementPestNote": "Although aphids seldom kill a mature plant, the damage they do and unsightly ...",
    "imagePestNote": [
        {
            "caption": "Wingless adults and nymphs of the potato aphid.",
            "link": "http://ipm.ucanr.edu/PMG/M/I-HO-MEUP-NM.006.html?src=exchbt",
            "src": "http://ipm.ucanr.edu/PMG/IMAGES/M/I-HO-MEUP-NM.006h.jpg?src=exchbt"
        },
        {
            "caption": "Woolly apple aphid adults showing waxy coating.",
            "link": "http://ipm.ucanr.edu/PMG/E/I-HO-ELAN-AD.001.html?src=exchbt",
            "src": "http://ipm.ucanr.edu/PMG/IMAGES/E/I-HO-ELAN-AD.001h.jpg?src=exchbt"
        },
        ...
    ],
    "tablePestNote": [
        "<table class=\"indextable\" id=\"TABLE1\" style=\"width:100%;\">\n  <caption>Table 1. Common Aphids on Vegetables...",
        "<table class=\"indextable\" id=\"TABLE2\" style=\"width:100%;\">\n  <caption>Table 2. Common Aphids of Fruit Trees.",
        ...
    ]
}
```
__Metadata on data source__

Data about the pests. `descriptionPestNote` can be used to describe the pest. `damagePestNote` can be used to match damage descriptions. The target can be extracted from the main fields as well as from image and video captions. The following slots can be used for filtering - `plant_name`, `problem`, `problem_description`, `target`. 
Notes:
* `descriptionPestNote`, `lifecyclePestNote`, `managementPestNote` - main fields.
* `imagePestNote/caption` - additional fields.



### Quick tips from UC IPM
<a id='quicktipsjson'></a>

In [None]:
# Read `QuickTips.json` from the current scrape directory into `df` and
# print the per-column summary.
FILE_NAME = 'QuickTips.json'
df = pd.read_json(_PATH / FILE_NAME)
df.info()

In [None]:
# Report columns where fewer than half of the rows hold a value whose
# length is greater than zero.
print('Field with less than 50% non-null entries:')
for column in df.columns:
    filled_rows = df[df[column].str.len() > 0]
    filled_share = filled_rows.shape[0] / df[column].shape[0]
    if filled_share < 0.5:
        print(f'{column:<20} - {filled_share * 100:.0f}%')

In [None]:
# Display five randomly drawn rows; no random_state is passed, so the rows differ between runs.
df.sample(5)

In [None]:
# One histogram per column of `len()` applied to every entry of that column.
text_columns = ['name','contentQuickTips', 'imageQuickTips']

fig, axes = plt.subplots(1, 3, figsize = (30, 5))
for ax, col in zip(axes, text_columns):
    entry_lengths = df[col].apply(len)
    entry_lengths.hist(figure = fig, bins = 30, ax = ax)
    ax.set_title(col, fontdict = {'fontsize': 20})
plt.show()

__IPM Data - `QuickTips.json`__

| column              | type                                  |
|---------------------|---------------------------------------|
| name                | string                                |
| urlQuickTip         | string                                |
| contentQuickTips    | string                                |
| imageQuickTips      | [{link: " ", src: " ", caption: " "}] |

Example of the single JSON data entry:
```json
{
    "name": "Bark Beetles",
    "urlQuickTip": "http://ipm.ucanr.edu/QT/barkbeetlescard.html?src=exchbt",
    "contentQuickTips": "Bark beetles are common pests of many trees, but some of the most damaging ...",
    "imageQuickTips": [
        {
            "link": "http://ipm.ucanr.edu/PMG/I/I-CO-IPAR-AD.001.html?src=exchbt",
            "src": "http://ipm.ucanr.edu/PMG/IMAGES/I/I-CO-IPAR-AD.001h.jpg?src=exchbt",
            "caption": "Ips bark beetle (actual size 1/8 to 3/8 inch long)."
        },
        {
            "link": "http://ipm.ucanr.edu/PMG/I/I-CO-ISPP-CD.004.html?src=exchbt",
            "src": "http://ipm.ucanr.edu/PMG/IMAGES/I/I-CO-ISPP-CD.004h.jpg?src=exchbt",
            "caption": "Engraver beetle holes and sap."
        },
        ...
    ]
}
```
__Metadata on data source__

Data about the quick tips on pests. `contentQuickTips` can be used to describe the quick tips.
Notes:
* `contentQuickTips` - main fields.
* `imageQuickTips/caption` - additional fields.

### Videos from UC IPM
<a id='videosjson'></a>

In [None]:
# Read `Videos.json` from the current scrape directory into `df` and print
# the per-column summary.
FILE_NAME = 'Videos.json'
df = pd.read_json(_PATH / FILE_NAME)
df.info()

In [None]:
# Report columns where fewer than half of the rows hold a value whose
# length is greater than zero.
print('Field with less than 50% non-null entries:')
for column in df.columns:
    filled_rows = df[df[column].str.len() > 0]
    filled_share = filled_rows.shape[0] / df[column].shape[0]
    if filled_share < 0.5:
        print(f'{column:<20} - {filled_share * 100:.0f}%')

In [None]:
# Display five randomly drawn rows; no random_state is passed, so the rows differ between runs.
df.sample(5)

In [None]:
# One histogram per column of `len()` applied to every entry of that column.
text_columns = ['title','description']

fig, axes = plt.subplots(1, 2, figsize = (30, 5))
for ax, col in zip(axes, text_columns):
    entry_lengths = df[col].apply(len)
    entry_lengths.hist(figure = fig, bins = 30, ax = ax)
    ax.set_title(col, fontdict = {'fontsize': 20})
plt.show()

__IPM Data - `Videos.json`__

| column              | type                                  |
|---------------------|---------------------------------------|
| title               | string                                |
| url                 | string                                |
| description         | string                                |

Example of the single JSON data entry:
```json
{
    "title": "How to Monitor for Aphids in Plum and Prune",
    "url": "https://www.youtube.com/watch?v=kbm55xeQlWs?src=exchbt",
    "description": "Learn how to monitor for leaf curl plum aphids and mealy plum aphids in plum and prune orchards.  Find out more at http://ipm.ucanr.edu"
}
```
__Metadata on data source__

Data about the videos from UC IPM (all videos are in YouTube originally). `description` can be used to describe the video.
Notes:
* `description` - main fields.

### Weed Items
<a id='weediditemsjson'></a>

In [None]:
# Read `WeedIdItems.json` from the current scrape directory into `df` and
# print the per-column summary.
FILE_NAME = 'WeedIdItems.json'
df = pd.read_json(_PATH / FILE_NAME)
df.info()

In [None]:
# Report columns where fewer than half of the rows hold a value whose
# length is greater than zero.
print('Field with less than 50% non-null entries:')
for column in df.columns:
    filled_rows = df[df[column].str.len() > 0]
    filled_share = filled_rows.shape[0] / df[column].shape[0]
    if filled_share < 0.5:
        print(f'{column:<20} - {filled_share * 100:.0f}%')

In [None]:
# Display five randomly drawn rows; no random_state is passed, so the rows differ between runs.
df.sample(5)

In [None]:
# One histogram per column of `len()` applied to every entry of that column.
text_columns = ['name','description', 'images']
fig, axes = plt.subplots(1, 3, figsize = (30, 5))
for ax, col in zip(axes, text_columns):
    entry_lengths = df[col].apply(len)
    entry_lengths.hist(figure = fig, bins = 30, ax = ax)
    ax.set_title(col, fontdict = {'fontsize': 20})
plt.show()

__IPM Data - `WeedIdItems.json`__

| column      | type                                       |
|-------------|--------------------------------------------|
| name        | string                                     |
| url         | string                                     |
| description | string                                     |
| images      | [{src: " ", caption: " "}]                 |

Example of the single JSON data entry:
```json
{
    "name": "Growth habit",
    "url": "http://ipm.ucanr.edu/PMG/WEEDS/ID/broadhabit.html?src=exchbt",
    "description": "Broadleaves may grow prostrate and form a mat or they may grow upright.",
    "images": [
        {
            "src": "http://ipm.ucanr.edu/TOOLS/TURF/IMAGES/PESTMANIM/prostrate_upright.jpg?src=exchbt",
            "caption": "A plant growing prostrate and upright"
        }
    ]
}
```
__Metadata on data source__

Information on weed pests.

### Final Transformation


Data source will be mapped to following structure:
```json
{
    # mandatory fields
    "url"       : "url",                            # Main URL
    "source"    : "ucipm|aekb|okstate|orstate",     # Source Dataset
    "title"     : "title",                          # Title of data item
    ...
    # other fields
    ...
    "vectors"   : {
        "name"  : "field_name_and_index",           # Name of the field
        "start" : "number",                         # Start index within text
        "end"   : "number",                         # End index within text
        "vector": "dense_vector",                   # Embedding vector
    }
}
```

Title itself will be encoded as well into vector field.


__Fruit and Veggie Cultural__

```json
{
    "name": "Planting cucurbits",                                                                   # title field
    "url": "http://ipm.ucanr.edu/PMG/GARDEN/VEGES/CULTURAL/cantaloupeplant.html?src=exchbt",        # url field
    "description": "Cucurbits can be seeded directly or transplanted into the garden. It is ...",   # text field name - description
    "images": [                                                                                     # text field name - images.title, vector - title + images.title
        {
            "src": "http://ipm.ucanr.edu/PMG/GARDEN/IMAGES/CULTURAL/transplant.jpg?src=exchbt",
            "caption": "Transplanting"
        }
    ],
    "tips_table": [                                                                                 # text field name - tips_table.title, vector - title + tips_table.title
        {
            "header": "Planting tips"                                                               # rename to title and concatenate with main title
        },
        {
            "row": "Distance in inches "                                                            # rename to title and concatenate with header (title) and main title
        },
        {
            "row": "Between plants in rows Between rows "
        },
        {
            "row": "Cucumbers 24 48 "
        },
        {
            "row": "Melons 12 72 "
        },
        {
            "row": "Pumpkins 48 72 "
        },
        {
            "row": "Squash 48 48-72"
        }
    ]
}
```
__Garden Pest Control__
```json
{
    "name": "Parasites",                                                                                                        # title field
    "url": "http://ipm.ucanr.edu/PMG/GARDEN/CONTROLS/parasites.html?src=exchbt",                                                # url field
    "description": "Insect parasites (parasitoids) are smaller than their hosts and develop inside, or attached to the ...",    # text field name - description
    "images": [                                                                                                                 # text field name - images.title, vector - title + images.title
        {
            "link": "http://ipm.ucanr.edu/PMG/S/I-LP-SCON-AS.001.html?src=exchbt",
            "src": "http://ipm.ucanr.edu/PMG/IMAGES/S/I-LP-SCON-AS.001a.jpg?src=exchbt",
            "caption": "Apanteles cocoons"                                                                                      # rename to title
        },
        {
            "link": "",
            "src": "http://ipm.ucanr.edu/PMG/IMAGES/PESTICIDES/parasitelifecycle.jpg?src=exchbt",
            "caption": "Life cycle of a Hyposoter parasite"
        },
        {
            "link": "http://ipm.ucanr.edu/PMG/S/I-LP-SCON-HF.002.html?src=exchbt",
            "src": "http://ipm.ucanr.edu/PMG/IMAGES/S/I-LP-SCON-HF.002a.jpg?src=exchbt",
            "caption": "Redhumped caterpillars parasitized by Hyposoter wasps"
        }
    ]
}
```
__Garden Pesticide Control__
```json
{
    "active_ingredient": "Potassium bicarbonate",                                                                       # title field
    "url": "http://ipm.ucanr.edu/TOOLS/PNAI/pnaishow.php?id=60?src=exchbt",                                             # url field
    "pesticide_type": "fungicide",                                                                                      # merge with title
    "information": [                                                                                                    # text field name - information (concatenate subfields)
        {
            "acute_toxicity": "Toxicity rating: No information",                                                        # Concatenate with field name and text
            "long_term_toxicity": "On US EPA list: Not listed; On CA Proposition list: Not listed",                     # Concatenate with field name and text
            "water_quality_rating": "Overall runoff risk rating: No information Notes: No information available",       # Concatenate with field name and text
            "impact_on_natural_enemies": "Overall toxicity rating: No information",                                     # Concatenate with field name and text
            "impact_on_honey_bees": "Toxicity category: No information",                                                # Concatenate with field name and text
            "associated_pests": "Powdery Mildew on Ornamentals, Roses in the Garden and Landscape: Diseases and ..."    # Concatenate with field name and text
        }
    ]
}
```
__Pest Notes__
```json
{
    "name": "Aphids",                                                                                       # title field (NOTE: remove PestNote suffix from the field names)
    "urlPestNote": "http://ipm.ucanr.edu/PMG/PESTNOTES/pn7404.html?src=exchbt",                             # url field
    "descriptionPestNote": "Aphids are small, soft-bodied insects with long  ...",                          # text field name - description
    "lifecyclePestNote": "Aphids have soft pear-shaped bodies with long legs  ...",                         # text field name - lifecycle
    "damagePestNote": "Low to moderate numbers of leaf-feeding aphids aren't ...",                          # text field name - damage
    "managementPestNote": "Although aphids seldom kill a mature plant, the ...",                            # text field name - management
    "imagePestNote": [                                                                                      # text field name - image.title, vector - title + image.title
        {
            "caption": "Wingless adults and nymphs of the potato aphid.",                                   # rename to title
            "link": "http://ipm.ucanr.edu/PMG/M/I-HO-MEUP-NM.006.html?src=exchbt",
            "src": "http://ipm.ucanr.edu/PMG/IMAGES/M/I-HO-MEUP-NM.006h.jpg?src=exchbt"
        },
        {
            "caption": "Woolly apple aphid adults showing waxy coating.",
            "link": "http://ipm.ucanr.edu/PMG/E/I-HO-ELAN-AD.001.html?src=exchbt",
            "src": "http://ipm.ucanr.edu/PMG/IMAGES/E/I-HO-ELAN-AD.001h.jpg?src=exchbt"
        },
        ...
    ],
    "tablePestNote": [                                                                                      # omit for now (requested change in the structure from Lauren)
        "<table class=\"indextable\" id=\"TABLE1\" style=\"width:100%;\">\n  <caption>Table 1. Common Aphids on Vegetables...",
        "<table class=\"indextable\" id=\"TABLE2\" style=\"width:100%;\">\n  <caption>Table 2. Common Aphids of Fruit Trees.",
        ...
    ]
}
```
__Quick Tips__
```json
{
    "name": "Bark Beetles",                                                                                 # title field (NOTE: remove QuickTip suffix from the field names)
    "urlQuickTip": "http://ipm.ucanr.edu/QT/barkbeetlescard.html?src=exchbt",                               # url field
    "contentQuickTips": "Bark beetles are common pests of many trees, but some of the most damaging ...",   # text field name - content
    "imageQuickTips": [                                                                                     # text field name - image.title, vector - title + image.title
        {
            "link": "http://ipm.ucanr.edu/PMG/I/I-CO-IPAR-AD.001.html?src=exchbt",
            "src": "http://ipm.ucanr.edu/PMG/IMAGES/I/I-CO-IPAR-AD.001h.jpg?src=exchbt",
            "caption": "Ips bark beetle (actual size 1/8 to 3/8 inch long)."                                # rename to title
        },
        {
            "link": "http://ipm.ucanr.edu/PMG/I/I-CO-ISPP-CD.004.html?src=exchbt",
            "src": "http://ipm.ucanr.edu/PMG/IMAGES/I/I-CO-ISPP-CD.004h.jpg?src=exchbt",
            "caption": "Engraver beetle holes and sap."
        },
        ...
    ]
}
```
__Videos__
```json
{
    "title": "How to Monitor for Aphids in Plum and Prune",                                                 # title field
    "url": "https://www.youtube.com/watch?v=kbm55xeQlWs?src=exchbt",                                        # url field
    "description": "Learn how to monitor for leaf curl plum aphids and mealy plum aphids in plum ..."       # text field name - description
}
```
__Weed Pests__
```json
{
    "name": "Growth habit",                                                                                 # title field
    "url": "http://ipm.ucanr.edu/PMG/WEEDS/ID/broadhabit.html?src=exchbt",                                  # url field
    "description": "Broadleaves may grow prostrate and form a mat or they may grow upright.",               # text field name - description
    "images": [                                                                                             # text field name - images.title, vector - title + images.title
        {
            "src": "http://ipm.ucanr.edu/TOOLS/TURF/IMAGES/PESTMANIM/prostrate_upright.jpg?src=exchbt",
            "caption": "A plant growing prostrate and upright"                                              # rename to title
        }
    ]
}
```

__Final mapping__
```json
{
    # mandatory fields
    "url"       : "url",                            # Main URL
    "source"    : "ucipm|aekb|okstate|orstate",     # Source Dataset
    "title"     : "title",                          # Title of data item
    ...
    # other fields
    ...
    "vectors"   : {
        "name"  : "field_name_and_index",           # Name of the field
        "start" : "number",                         # Start index within text
        "end"   : "number",                         # End index within text
        "vector": "dense_vector",                   # Embedding vector
    }
}
```

## AskExtension Data

In [None]:
# Location of the AskExtension knowledge-base dump. The listing is sorted so the
# printed file list and the "first file" inspected below are stable across runs.
_PATH = Path('../data/askextension_kb/')
DATA_FILE_NAMES = sorted(_PATH.iterdir())

print(f'List of files:\n{[data_file.name for data_file in DATA_FILE_NAMES]}')

# Peek at the first record of the first file to see what a ticket looks like.
# The file handle and the parsed payload get separate names (the original rebound
# `f` from handle to data), and the encoding is pinned so decoding does not depend
# on the platform's default locale.
with open(DATA_FILE_NAMES[0], encoding = 'utf-8') as fp:
    tickets = json.load(fp)
print(json.dumps(tickets[0], indent = 2))

### Data

The list of files should be as follows:
```python
['2012-2013.json', '2014-2015.json', '2016-2017.json', '2018-2019.json', '2020-1.json', '2020-2.json', '2021-1.json', '2021-2.json']
```

Each file is a list of dictionary objects with the following fields:
- `faq-id` - ID of the ticket
- `title` - title of the ticket along with the ID of the ticket (__other__ ID)
- `created` - ticket creation date
- `updated` - ticket last update date
- `tags` - list of tags
- `state` - state the ticket was created in
- `county` - county the ticket was created in
- `question` - question that has been posted
- `answer` - list of responses, presented as a numbered dictionary

Final structure:
```text
url - composed field out of `faq-id` and main URL   # url field
title - title                                       # title field
response - `answer.response`                        # text field name - answer.response
```

__NB__: We will keep only tickets from California, Oregon, and Oklahoma.

### Final mapping

The data source will be mapped to the following structure:
```json
{
    # mandatory fields
    "url"       : "url",                            # Main URL
    "source"    : "ucipm|aekb|okstate|orstate",     # Source Dataset
    "title"     : "title",                          # Title of data item
    ...
    # other fields
    ...
    "vectors"   : {
        "name"  : "field_name_and_index",           # Name of the field
        "start" : "number",                         # Start index within text
        "end"   : "number",                         # End index within text
        "vector": "dense_vector",                   # Embedding vector
    }
}
```
The title itself will also be encoded into the vector field.

## Oklahoma State University Data

### EDA

In [None]:
# Load the Oklahoma State fact-sheet export into a DataFrame and print its
# column summary.
# NOTE: this rebinds the notebook-wide `_PATH` and `df` used by earlier sections.
_PATH = Path('../data/okstate/fact-sheets-out-cleaner.json')
df = pd.read_json(_PATH)
df.info()

Data looks as follows:
```json
{
"title":,                               # title field
"link":,                                # url field
"thumbnail":,
"description":,                         # text field name - description
"content": [                            
    {
        "header":,                      # text field name - content.header, vector - title + header
        "text":,                        # text field name - content.text, vector - text
        "parent_header":,
        "images": {
            "image_urls": [ ],
            "image_captions": [ ],
            "image_indexes": [ ]
        },
        "Table": {
            "raw_table_text": [],
            "table_indexes": []
        }
    },
],
"author": ...,
"pubdate": "2017-07-01T08:12:21 (ex)",
"category": [ ],
"displaydate": "2017-07-01T08:12:21 (ex)"
}
```

For the sake of simplicity, we will disregard the following columns - `thumbnail`, `author`, `pubdate`, `category`, `displaydate`.

Also, for the `content` field, we will concatenate `header` with `title` (except for the value `Introduction-w/o-header`), and use only the `header` and `text` fields within `content`.

So the final data structure will look as follows:
```json
{
    "url": "link",
    "title": "title",
    "description": "description",
    "content": [{
        "header": "title" + "header",
        "text": "text"
    }]
}
```

In [None]:
df.sample(5)

In [None]:
# Character-length distributions of the top-level text fields.
text_columns = ['title', 'description']

fig, axes = plt.subplots(1, 2, figsize = (30, 5))
for ax, col in zip(axes, text_columns):
    # `.str.len()` is vectorised and yields NaN for missing values (which `hist`
    # drops), whereas `.apply(len)` raises TypeError on the first null entry.
    df[col].str.len().hist(figure = fig, bins = 30, ax = ax)
    ax.set_title(col, fontdict = {'fontsize': 20})
    ax.set_xlabel('length (characters)')
    ax.set_ylabel('number of rows')
plt.show()

In [None]:
# Flatten every fact sheet's `content` sections into one row per section so the
# header / body length distributions can be plotted across the whole dataset.
temp_data = {
    'title': [],
    'text': []
}

# Only the `content` column is needed, so iterate it directly instead of paying
# for `iterrows()` to build a full Series for every row.
for sections in df['content']:
    for section in sections:
        temp_data['title'].append(section['header'])
        temp_data['text'].append(section['text'])

temp = pd.DataFrame(temp_data)
fig, axes = plt.subplots(1, 2, figsize = (30, 5))
for ax, col in zip(axes, ['title', 'text']):
    # `.str.len()` tolerates missing headers/texts (NaN, dropped by `hist`);
    # `.apply(len)` would raise TypeError on them.
    temp[col].str.len().hist(figure = fig, bins = 30, ax = ax)
    ax.set_title(col, fontdict = {'fontsize': 20})
    ax.set_xlabel('length (characters)')
    ax.set_ylabel('number of sections')
plt.show()

In [None]:
# List every section header longer than 150 characters, grouped under the URL
# of the document it belongs to; documents without such headers print nothing.
print('Examples of long titles:')
for _, row in df.iterrows():
    long_headers = [
        section['header']
        for section in row['content']
        if len(section['header']) > 150
    ]
    if not long_headers:
        continue
    # The document link is printed once, ahead of its offending headers.
    print(row['link'])
    for header in long_headers:
        print(f'Item "{header}" has length of {len(header)} chars')
    # Separator line between documents.
    print(' ')

In [None]:
# List every content section whose text exceeds 10000 characters, grouped under
# the URL of its document and prefixed with the section's position in `content`.
print('Examples of long content text:')
for _, row in df.iterrows():
    oversized = [
        (position, section)
        for position, section in enumerate(row['content'])
        if len(section['text']) > 10000
    ]
    if not oversized:
        continue
    # The document link is printed once, ahead of its oversized sections.
    print(row['link'])
    for position, section in oversized:
        print(f'{position} Item "{section["header"]}" has length of over 10000 chars - {len(section["text"])}')
    # Separator line between documents.
    print(' ')
    

### Final mapping

The data source will be mapped to the following structure:
```json
{
    "data"      : {                                 # Dynamic field
        ...                                         # Data itself
    },
    "url"       : "url",                            # Main URL
    "source"    : "ucipm|aekb|okstate|orstate",     # Source Dataset
    "title"     : "title",                          # Title of data item
    "vectors"   : {
        "name"  : "field_name_and_index",           # Name of the field
        "start" : "number",                         # Start index within text
        "end"   : "number",                         # End index within text
        "vector": "dense_vector",                   # Embedding vector
    }
}
```

The title itself will also be encoded into the vector field.

## Oregon State University Data

### EDA

In [None]:
# Load the Oregon State export into a DataFrame and print its column summary.
# NOTE: this rebinds the notebook-wide `_PATH` and `df`, replacing the
# Oklahoma State data loaded in the previous section.
_PATH = Path('../data/orstate/OSU-Out-Cleaner.json')
df = pd.read_json(_PATH)
df.info()

Data looks as follows:
```json
{
"title":,                               # title field
"link":,                                # url field
"thumbnail":,
"description":,                         # text field name - description
"content": [                            
    {
        "header":,                      # text field name - content.header, vector - title + header
        "text":,                        # text field name - content.text, vector - text
        "parent_header":,
        "images": {
            "image_urls": [ ],
            "image_captions": [ ],
            "image_indexes": [ ]
        },
        "Table": {
            "raw_table_text": [],
            "table_indexes": []
        }
    },
],
"author": ...,
"pubdate": "2017-07-01T08:12:21 (ex)",
"category": [ ],
"displaydate": "2017-07-01T08:12:21 (ex)"
}
```

For the sake of simplicity, we will disregard the following columns - `thumbnail`, `author`, `pubdate`, `category`, `displaydate`.

Also, for the `content` field, we will concatenate `header` with `title` (except for the value `Introduction-w/o-header`), and use only the `header` and `text` fields within `content`.

In [None]:
df.sample(5)

In [None]:
# Character-length distributions of the top-level text fields.
text_columns = ['title', 'description']

fig, axes = plt.subplots(1, 2, figsize = (30, 5))
for ax, col in zip(axes, text_columns):
    # `.str.len()` is vectorised and yields NaN for missing values (which `hist`
    # drops), whereas `.apply(len)` raises TypeError on the first null entry.
    df[col].str.len().hist(figure = fig, bins = 30, ax = ax)
    ax.set_title(col, fontdict = {'fontsize': 20})
    ax.set_xlabel('length (characters)')
    ax.set_ylabel('number of rows')
plt.show()

In [None]:
# Flatten every document's `content` sections into one row per section so the
# header / body length distributions can be plotted across the whole dataset.
temp_data = {
    'title': [],
    'text': []
}

# Only the `content` column is needed, so iterate it directly instead of paying
# for `iterrows()` to build a full Series for every row.
for sections in df['content']:
    for section in sections:
        temp_data['title'].append(section['header'])
        temp_data['text'].append(section['text'])

temp = pd.DataFrame(temp_data)
fig, axes = plt.subplots(1, 2, figsize = (30, 5))
for ax, col in zip(axes, ['title', 'text']):
    # `.str.len()` tolerates missing headers/texts (NaN, dropped by `hist`);
    # `.apply(len)` would raise TypeError on them.
    temp[col].str.len().hist(figure = fig, bins = 30, ax = ax)
    ax.set_title(col, fontdict = {'fontsize': 20})
    ax.set_xlabel('length (characters)')
    ax.set_ylabel('number of sections')
plt.show()

In [None]:
# List every section header longer than 150 characters, grouped under the URL
# of the document it belongs to; documents without such headers print nothing.
print('Examples of long titles:')
for _, row in df.iterrows():
    long_headers = [
        section['header']
        for section in row['content']
        if len(section['header']) > 150
    ]
    if not long_headers:
        continue
    # The document link is printed once, ahead of its offending headers.
    print(row['link'])
    for header in long_headers:
        print(f'Item "{header}" has length of {len(header)} chars')
    # Separator line between documents.
    print(' ')

In [None]:
# List every content section whose text exceeds 10000 characters, grouped under
# the URL of its document and prefixed with the section's position in `content`.
print('Examples of long content text:')
for _, row in df.iterrows():
    oversized = [
        (position, section)
        for position, section in enumerate(row['content'])
        if len(section['text']) > 10000
    ]
    if not oversized:
        continue
    # The document link is printed once, ahead of its oversized sections.
    print(row['link'])
    for position, section in oversized:
        print(f'{position} Item "{section["header"]}" has length of over 10000 chars - {len(section["text"])}')
    # Separator line between documents.
    print(' ')
    

### Final mapping

The data source will be mapped to the following structure:
```json
{
    "data"      : {                                 # Dynamic field
        ...                                         # Data itself
    },
    "url"       : "url",                            # Main URL
    "source"    : "ucipm|aekb|okstate|orstate",     # Source Dataset
    "title"     : "title",                          # Title of data item
    "vectors"   : {
        "name"  : "field_name_and_index",           # Name of the field
        "start" : "number",                         # Start index within text
        "end"   : "number",                         # End index within text
        "vector": "dense_vector",                   # Embedding vector
    }
}
```

The title itself will also be encoded into the vector field.