# EDA and ETL for scraped data from IPM and AskExtension data knowledge

In [1]:
import json
import pandas as pd
import os

## IPM Data

In [2]:
PATH = '../data/updated/'
FILE_AE = 'ipmdata_new.json'

In [3]:
df = pd.read_json(PATH + FILE_AE)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 214 entries, 0 to 213
Data columns (total 12 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   name                 214 non-null    object
 1   urlPestNote          214 non-null    object
 2   descriptionPestNote  214 non-null    object
 3   life_cycle           214 non-null    object
 4   damagePestNote       214 non-null    object
 5   managementPestNote   214 non-null    object
 6   imagePestNote        214 non-null    object
 7   tablePestNote        214 non-null    object
 8   urlQuickTip          214 non-null    object
 9   contentQuickTips     214 non-null    object
 10  imageQuickTips       214 non-null    object
 11  video                214 non-null    object
dtypes: object(12)
memory usage: 20.2+ KB


In [4]:
df.head()

Unnamed: 0,name,urlPestNote,descriptionPestNote,life_cycle,damagePestNote,managementPestNote,imagePestNote,tablePestNote,urlQuickTip,contentQuickTips,imageQuickTips,video
0,California Oakworm,http://ipm.ucanr.edu/PMG/PESTNOTES/pn7422.html...,The California oakworm (Phryganidia californic...,"The adult, called an oak moth, is a uniform ta...",Young oakworm caterpillars skeletonize the lea...,Pesticide sprays applied to control oakworm us...,[{'caption': 'Male (bottom) and female Califor...,"[<table border=""1"" cellpadding=""2"" cellspacing...",,,[],[]
1,Bark Beetles,http://ipm.ucanr.edu/PMG/PESTNOTES/pn7421.html...,"Bark beetles, family Scolytidae, are common pe...","Bark beetle adults are small, cylindrical, har...",Bark beetles mine the inner bark (the phloem-c...,Except for general cultural practices that imp...,"[{'caption': 'Engraver beetle holes and sap.',...","[<table border=""1"" cellpadding=""2"" cellspacing...",http://ipm.ucanr.edu/QT/barkbeetlescard.html?s...,"Bark beetles are common pests of many trees, b...",[{'link': 'http://ipm.ucanr.edu/PMG/I/I-CO-IPA...,[]
2,Boxelder Bug,http://ipm.ucanr.edu/PMG/PESTNOTES/pn74114.htm...,The western boxelder bug (Boisea rubrolineata)...,"When full grown, the boxelder bug is about 1/2...",The bugs do little damage to ornamental trees....,Boxelder bugs do not cause significant damage ...,"[{'caption': 'Boxelder bug adult and nymphs.',...",[],,,[],[]
3,Bee and Wasp Stings,http://ipm.ucanr.edu/PMG/PESTNOTES/pn7449.html...,Nearly everyone has been stung by an insect at...,,,,"[{'caption': 'Yellowjacket.', 'link': 'http://...",[],,,[],[]
4,Black Scale,http://ipm.ucanr.edu/PMG/PESTNOTES/pn74160.htm...,"Black scale, Saissetia oleae, is a soft scale ...",The adult female black scale is the easiest of...,Black scale can adversely affect plant vigor a...,Management of black scale in the home landscap...,[{'caption': 'Black scale adult with “H” ridge...,[],,,[],[]


## Description of columns

Columns:
- `name`
    - Name of the pest
    - __inject__ into ES, create __embedding__
- `urlPestNote`
    - URL of the pest
    - __inject__ into ES
- `descriptionPestNote`
    - Description of the pest
    - __inject__ into ES, create __embedding__
- `life_cycle`
    - Desciption of the life-cycle of the pest
    - __inject__ into ES, create __embedding__
- `damagePestNote`
    - Description of the damage caused by this pest
    - __inject__ into ES, create __embedding__
- `managementPestNote`
    - Some notes on how to control and fight against this pest
    - __inject__ into ES, create __embedding__
- `imagePestNote`
    - Some images regarding the pest
    - __discard__ for moment
- `tablePestNote`
    - Table information on pest
    - __discard__ for moment
- `urlQuickTip`
    - Links to some additional information on the pests
    - __discard__ for moment
- `contentQuickTips`
    - Additional information on the pest
    - __discard__ for moment
- `imageQuickTips`
    - Some addiotinal links to images related to pests (similar to `imagePestNote`)
    - __discard__ for moment
- `video`
    - Links to video related to pests
    - __discard__ for moment

In [5]:
'''
['askextensiondata-california.json',
 'exoticPests.json',
 'fruitItems_new.json',
 'fruitVeggieEnvironItems_new.json',
 'ipmdata_new.json',
 'pestDiseaseItems_new.json',
 'plantFlowerItems.json',
 'turfPests.json',
 'veggieItems_new.json',
 'weedItems.json']
'''
DATA_FILE_NAMES = sorted(os.listdir('..//data/updated/'))

In [7]:
def docs_etl():
    """Read all docs and pre-process them"""
    print("ETL for docs...")

    df_docs_json = {}
    for data_file_name in DATA_FILE_NAMES:
        path_data = f"{PATH}{data_file_name}"
        df_docs_json[data_file_name] = pd.read_json(path_data)

        if "name" in df_docs_json[data_file_name].columns:
            before_shape = df_docs_json[data_file_name].shape
            df_docs_json[data_file_name] = df_docs_json[data_file_name].drop_duplicates(
                "name"
            )
            after_shape = df_docs_json[data_file_name].shape
            num_dropped = before_shape[0] - after_shape[0]
            if num_dropped > 0:
                print(f"Dropped {num_dropped} with same 'name' in {data_file_name}")

    
    df_docs_json = unique_column_names(df_docs_json)

    df_docs = concat_docs(df_docs_json)

    df_docs.index = df_docs.index.set_names("doc_id")

    df_docs = replace_nan(df_docs)

    return df_docs

def unique_column_names(df_docs_json):
    """Column names must be unique and identify the file they came from"""

    df_docs_json["pestDiseaseItems_new.json"] = df_docs_json[
        "pestDiseaseItems_new.json"
    ].rename(
        columns={
            "url": "urlPestDiseaseItems",
            "description": "descriptionPestDiseaseItems",
            "identification": "identificationPestDiseaseItems",
            "life_cycle": "life_cyclePestDiseaseItems",
            "damage": "damagePestDiseaseItems",
            "solutions": "solutionsPestDiseaseItems",
            "images": "imagesPestDiseaseItems",
        }
    )

    df_docs_json["turfPests.json"] = df_docs_json[
        "turfPests.json"
    ].rename(
        columns={
            "url": "urlTurfPests",
            "text": "textTurfPests",
            "images": "imagesTurfPests",
        }
    )

    df_docs_json["weedItems.json"] = df_docs_json[
        "weedItems.json"
    ].rename(
        columns={
            "url": "urlWeedItems",
            "description": "descriptionWeedItems",
            "images": "imagesWeedItems",
        }
    )

    df_docs_json["exoticPests.json"] = df_docs_json[
        "exoticPests.json"
    ].rename(
        columns={
            "url": "urlExoticPests",
            "description": "descriptionExoticPests",
            "damage": "damageExoticPests",
            "identification": "identificationExoticPests",
            "life_cycle": "life_cycleExoticPests",
            "monitoring": "monitoringExoticPests",
            "management": "managementExoticPests",
            "related_links": "related_linksExoticPests",
        }
    )

    df_docs_json["askextensiondata-california.json"] = df_docs_json[
        "askextensiondata-california.json"
    ].rename(
        columns={
            "faq-id": "ask_faq_id",
            "url": "ask_url",
            "title": "ask_title",
            "title-question": "ask_title_question",
            "created": "ask_created",
            "updated": "ask_updated",
            "state": "ask_state",
            "county": "ask_county",
            "question": "ask_question",
            "answer": "ask_answer",
            "attachments": "ask_attachments",
        }
    )

    return df_docs_json

def concat_docs(df_docs_json):
    """Concatenate the docs of different json files into one big dataframe"""
    df_docs = pd.concat(
        [df_docs_json[name] for name in DATA_FILE_NAMES], ignore_index=True
    )
    return df_docs


def replace_nan(df_docs):
    """Replace all NaN values in the dataframe with appropriate content"""
    df_docs = df_docs.fillna("")
    # nested types & arrays require an empty list if non existing
    for column in [
        "imagePestNote",
        "imageQuickTips",
        "video",
        "imagesPestDiseaseItems",
        "other_headersPestDiseaseItems",
        "imagesTurfPests",
        "imagesWeedItems",
        "related_linksExoticPests",
        "imagesExoticPests",
        "ask_answer",
        "ask_attachments",
    ]:
        df_docs[column] = [[] if x == "" else x for x in df_docs[column]]

    return df_docs


docs_etl()

ETL for docs...
Dropped 28 with same 'name' in fruitVeggieEnvironItems_new.json
Dropped 1 with same 'name' in ipmdata_new.json
Dropped 39 with same 'name' in pestDiseaseItems_new.json


KeyError: 'cleanedPestDiseaseItems.json'