# ETL for the scoring data (questions and corresponding links to sources from UC IPM)

In [1]:
from urllib.parse import urlparse

def _uri_validator(x):
    try:
        result = urlparse(x)
        return all([result.scheme, result.netloc])
    except:
        return False

In [2]:
import pandas as pd

UCIPM_DATA  = './data/raw/UCIPM_Chatbot_Core_Questions 2020_08_19.xlsx - Questions with the correct link.csv'
AE_DATA     = './data/raw/AE_test_QA_chatbot.xlsx - Questions with the correct link.csv'
VALID_DATA  = './data/transformed/valid_questions.pkl'
NA_DATA     = './data/transformed/na_questions.pkl'

df = pd.read_csv(UCIPM_DATA)

df.sample(10)

Unnamed: 0,Pest,Question,URL,Crawled,Training
33,Palm diseases,What's wrong with my palm tree?\n[needed infor...,http://ipm.ucanr.edu/PMG/PESTNOTES/pn74148.htm...,Y,Y
38,scales,What do scale insects look like?\n[needed info...,http://ipm.ucanr.edu/PMG/PESTNOTES/pn7408.html...,Y,Y
8,roof rats,There are roof rats getting into my home. What...,http://ipm.ucanr.edu/PMG/PESTNOTES/pn74106.htm...,Y,Y
25,common bermudagrass,How can I tell the difference between bermudag...,http://ipm.ucanr.edu/PMG/PESTNOTES/pn7453.html...,Y,Y
24,common bermudagrass in flower beds,How do you keep bermudagrass from spreading in...,http://ipm.ucanr.edu/PMG/PESTNOTES/pn7453.html...,Y,Y
3,Peach leaf curl,What is the best way to get rid of peach leaf ...,http://ipm.ucanr.edu/PMG/PESTNOTES/pn7426.html...,Y,Y
34,Palm diseases,If my palm tree has little black spots on it d...,http://ipm.ucanr.edu/PMG/PESTNOTES/pn74148.htm...,Y,Y
45,deer,Is there a way to keep deer out of my garden?,http://ipm.ucanr.edu/PMG/PESTNOTES/pn74117.htm...,Y,Y
26,Vegetable growing,When should I plant asparagus?,http://ipm.ucanr.edu/PMG/GARDEN/VEGES/CULTURAL...,Y,Y
49,weeds in garden,There's some kind of weed taking over my yard....,http://ipm.ucanr.edu/PMG/PESTNOTES/pn7441.html...,Y,Y


## ETL for questions with answer links

In [3]:
print(f'Size of DF (before): {df.shape}')
df = df[df['Crawled'] == 'Y']
print(f'Size of DF (after ): {df.shape}')


df['Question'   ] = df['Question'   ].apply(lambda x: x.split('\n')[0]              )
df['URL'        ] = df['URL'        ].apply(lambda x: x.split('\n')                 )
df['URL'        ] = df['URL'        ].apply(lambda x: [x1.split('?')[0] for x1 in x])

for urls in df['URL']:
    for u in urls:
        assert urlparse(u)

df['Source'] = 'UCIPM'

ucipm = df[['Question', 'Pest', 'URL', 'Source']]

ucipm.sample(10)

Size of DF (before): (50, 5)
Size of DF (after ): (50, 5)


Unnamed: 0,Question,Pest,URL,Source
44,How do I deter deer from my garden?,deer,[http://ipm.ucanr.edu/PMG/PESTNOTES/pn74117.html],UCIPM
15,How do you get rid of aphids on roses?,aphids on roses,[http://ipm.ucanr.edu/PMG/PESTNOTES/pn7404.htm...,UCIPM
12,What does rust on roses look like?,rust on roses,[http://ipm.ucanr.edu/PMG/PESTNOTES/pn7463.html],UCIPM
17,How did I get carpet beetles in my home?,carpet beetles,[http://ipm.ucanr.edu/PMG/PESTNOTES/pn7436.html],UCIPM
24,How do you keep bermudagrass from spreading in...,common bermudagrass in flower beds,[http://ipm.ucanr.edu/PMG/PESTNOTES/pn7453.htm...,UCIPM
45,Is there a way to keep deer out of my garden?,deer,[http://ipm.ucanr.edu/PMG/PESTNOTES/pn74117.html],UCIPM
29,Should vegetable gardens be planted in a sunny...,Vegetable growing,[http://ipm.ucanr.edu/PMG/GARDEN/VEGES/CULTURA...,UCIPM
1,How can I get rid of peach leaf curl?,Peach leaf curl,[http://ipm.ucanr.edu/PMG/PESTNOTES/pn7426.html],UCIPM
37,What insecticide gets rid of whiteflies?,whiteflies,[http://ipm.ucanr.edu/PMG/PESTNOTES/pn7401.html],UCIPM
31,What is the best location for a vegetable garden?,Vegetable growing,[http://ipm.ucanr.edu/PMG/GARDEN/VEGES/CULTURA...,UCIPM


In [4]:
df = pd.read_csv(AE_DATA)

df.sample(10)

Unnamed: 0,Question,URL,Crawled,Training
54,I want to release lady bugs in my garden with ...,http://ipm.ucanr.edu/QT/ladybeetlescard.html\n...,Y,Y
55,We dealing with GRUBS in our garden which we t...,http://ipm.ucanr.edu/TOOLS/TURF/PESTS/inchaf.html,Y,Y
28,For months I've been seeing bugs that look lik...,http://ipm.ucanr.edu/PMG/GARDEN/PLANTS/azalea....,Y,Y
66,I'm interested in hiring a master gardener fro...,,N,N
17,What is the little raised white 'dots' on the ...,http://ipm.ucanr.edu/PMG/PESTNOTES/pn7410.html,Y,Y
23,We have a few cucumber beetles that are eating...,http://ipm.ucanr.edu/PMG/PESTNOTES/pn74167.html,Y,Y
81,Have we had our last frost for spring of 2022?...,,N,N
5,What are these small holes in my maple tree? H...,http://ipm.ucanr.edu/PMG/GARDEN/PLANTS/INVERT/...,Y,Y
78,We have two raised beds and want to add some a...,,N,N
90,I grew several plants outside this last summer...,http://ipm.ucanr.edu/PMG/PESTNOTES/pn7448.html,Y,Y


In [5]:
print(f'Size of DF (before): {df.shape}')
df = df[df['URL'].notna()       ]
df = df[df['URL'] != 'no_answer']
print(f'Size of DF (after ): {df.shape}')

df['URL'] = df['URL'].apply(lambda x: x.split('\n')                 )
df['URL'] = df['URL'].apply(lambda x: [x1.split('?')[0] for x1 in x])

for urls in df['URL']:
    for u in urls:
        assert urlparse(u)

df['Source' ] = 'AE'
df['Pest'   ] = ''

ae = df[['Question', 'Pest', 'URL', 'Source']]

ae.sample(10)

Size of DF (before): (108, 4)
Size of DF (after ): (65, 4)


Unnamed: 0,Question,Pest,URL,Source
76,There is a white powdery substance on the bran...,,[http://ipm.ucanr.edu/PMG/PESTNOTES/pn7408.html],AE
36,We have this fly that is very tiny and black. ...,,[http://ipm.ucanr.edu/PMG/PESTNOTES/pn74167.html],AE
97,Is there a way to control or eliminate ants in...,,[http://ipm.ucanr.edu/PMG/PESTNOTES/pn7411.htm...,AE
88,I recently had heard that Sluggo will kill ear...,,[http://ipm.ucanr.edu/PMG/PESTNOTES/pn7427.htm...,AE
52,I have owned an indoor plant (and do not know ...,,[http://ipm.ucanr.edu/PMG/PESTNOTES/pn74174.ht...,AE
6,Is there treatment for ash borers on ash trees...,,[http://ipm.ucanr.edu/PMG/GARDEN/PLANTS/INVERT...,AE
17,What is the little raised white 'dots' on the ...,,[http://ipm.ucanr.edu/PMG/PESTNOTES/pn7410.html],AE
70,I recently removed Ivy that had completely tak...,,[http://ipm.ucanr.edu/PMG/GARDEN/PLANTS/ivy.ht...,AE
60,I have 3 peach trees that are showing discolor...,,[http://ipm.ucanr.edu/PMG/PESTNOTES/pn7426.html],AE
29,On a raised bed of strawberries we started dis...,,[http://ipm.ucanr.edu/PMG/PESTNOTES/pn7404.html],AE


In [6]:
df = pd.concat([ucipm, ae])

df.to_pickle(VALID_DATA)
print(f'Shape of final DF: {df.shape}')
df.sample(10)

Shape of final DF: (115, 4)


Unnamed: 0,Question,Pest,URL,Source
5,Are Oriental cockroaches harmful?,Oriental cockroach,[http://ipm.ucanr.edu/PMG/PESTNOTES/pn7467.html],UCIPM
17,How did I get carpet beetles in my home?,carpet beetles,[http://ipm.ucanr.edu/PMG/PESTNOTES/pn7436.html],UCIPM
70,I recently removed Ivy that had completely tak...,,[http://ipm.ucanr.edu/PMG/GARDEN/PLANTS/ivy.ht...,AE
48,"I have flying bugs, like over-sized fruit flie...",,[http://ipm.ucanr.edu/PMG/PESTNOTES/pn74167.html],AE
33,What's wrong with my palm tree?,Palm diseases,[http://ipm.ucanr.edu/PMG/PESTNOTES/pn74148.html],UCIPM
12,Found very large white grub looking worm in fi...,,[http://ipm.ucanr.edu/PMG/GARDEN/PLANTS/INVERT...,AE
46,I have an artichoke plant and it has all these...,,[http://ipm.ucanr.edu/PMG/PESTNOTES/pn7404.html],AE
22,I planted seeds but they never came up. What h...,damping off of seeds,[http://ipm.ucanr.edu/PMG/PESTNOTES/pn74132.html],UCIPM
99,My apple trees are infested with apple maggots...,,[http://ipm.ucanr.edu/PMG/GARDEN/FRUIT/PESTS/a...,AE
56,I just noticed an outbreak of small green leaf...,,[http://ipm.ucanr.edu/PMG/PESTNOTES/pn7473.htm...,AE


In [7]:
df = pd.read_pickle(VALID_DATA)
print(f'Shape of data: {df.shape}')
df.sample(10)

Shape of data: (115, 4)


Unnamed: 0,Question,Pest,URL,Source
10,How do I get rid of ants in my kitchen?,ants in kitchen,[http://ipm.ucanr.edu/PMG/PESTNOTES/pn7411.html],UCIPM
104,We have several boxwood-like bushes in our bac...,,[http://ipm.ucanr.edu/PMG/GARDEN/PLANTS/boxwoo...,AE
43,The leaves on some table/wine grape vines appe...,,[http://ipm.ucanr.edu/PMG/GARDEN/FRUIT/PESTS/g...,AE
30,Can I plant vegetables in the shade?,Vegetable growing,[http://ipm.ucanr.edu/PMG/GARDEN/VEGES/CULTURA...,UCIPM
90,I grew several plants outside this last summer...,,[http://ipm.ucanr.edu/PMG/PESTNOTES/pn7448.html],AE
39,How do you stop mealybugs?,mealybugs,[http://ipm.ucanr.edu/PMG/PESTNOTES/pn74174.ht...,UCIPM
77,Our house seems to have much more Boxelder bug...,,[http://ipm.ucanr.edu/PMG/PESTNOTES/pn74114.html],AE
16,Where are the little beetles on my window sill...,carpet beetles,[http://ipm.ucanr.edu/PMG/PESTNOTES/pn7436.html],UCIPM
42,How do you kill head lice?,head lice,[http://ipm.ucanr.edu/PMG/PESTNOTES/pn7446.html],UCIPM
19,Many of my mature azalea and rhododendron bush...,,[http://ipm.ucanr.edu/PMG/PESTNOTES/pn7428.htm...,AE


## ETL for `NA` and `no_answer` questions 

In [8]:
df = pd.read_csv(AE_DATA)

df.sample(10)

Unnamed: 0,Question,URL,Crawled,Training
62,I have three Douglas Fir trees in my yard whic...,,N,Y
61,I'm interested in finding a local source for v...,,N,N
90,I grew several plants outside this last summer...,http://ipm.ucanr.edu/PMG/PESTNOTES/pn7448.html,Y,Y
19,Many of my mature azalea and rhododendron bush...,http://ipm.ucanr.edu/PMG/PESTNOTES/pn7428.html...,Y,Y
43,The leaves on some table/wine grape vines appe...,http://ipm.ucanr.edu/PMG/GARDEN/FRUIT/PESTS/gr...,Y,Y
24,During blooming time both my apricot and plum ...,http://ipm.ucanr.edu/PMG/GARDEN/PLANTS/oak.html,Y,Y
25,what could be leaving these little black pelle...,http://ipm.ucanr.edu/PMG/PESTNOTES/pn7404.html,Y,Y
57,My pear tree looks bad with black spots on lea...,http://ipm.ucanr.edu/PMG/GARDEN/FRUIT/PESTS/pe...,Y,Y
99,My apple trees are infested with apple maggots...,http://ipm.ucanr.edu/PMG/GARDEN/FRUIT/PESTS/ap...,Y,Y
39,I have a problem with fruit flies laying their...,http://ipm.ucanr.edu/PMG/PESTNOTES/pn74158.html,Y,Y


In [9]:
print(f'Size of DF (before): {df.shape}')
df = df[(df['URL'].isna()) & (df['URL'] != 'no_answer')]
print(f'Size of DF (after ): {df.shape}')

df['Source' ] = 'AE'

df = df[['Question', 'Source']]

df.sample(10)

Size of DF (before): (108, 4)
Size of DF (after ): (37, 4)


Unnamed: 0,Question,Source
62,I have three Douglas Fir trees in my yard whic...,AE
67,What are the recommended sprays to use on appl...,AE
102,Where do I find the right plants for my pond?,AE
72,I HAVE A 13 YEAR OLD ROSEBUSH I NEED TO MOVE D...,AE
10,my Night Queen plants have developed brown spo...,AE
103,Is winter a good time to transplant my fruit t...,AE
107,"Should I just mulch with straw, or plant a gro...",AE
105,I have Japanese knotweed in my garden. We have...,AE
87,The trees on my property have long branches th...,AE
82,There are holes approx. 6 inches wide and 6-8 ...,AE


In [10]:
df.to_pickle(NA_DATA)
print(f'Shape of final DF: {df.shape}')
df.sample(10)

Shape of final DF: (37, 2)


Unnamed: 0,Question,Source
95,How can I turn my garden into compost?,AE
32,My coreopsis plants are being eaten by bugs si...,AE
98,How do you tell when pears are ready to pick?,AE
50,"My milkweed have already bloomed, but now ther...",AE
68,"I bought a farm this year, and I'm learning ab...",AE
64,I'm a 4-H alumni. What is the nearest club nea...,AE
96,What is a good soil mix for blueberries?,AE
79,I have a few apple trees. The one has a bent o...,AE
103,Is winter a good time to transplant my fruit t...,AE
34,There are some fast moving tiny bugs in my kit...,AE


In [11]:
df = pd.read_pickle(NA_DATA)
print(f'Shape of data: {df.shape}')
df.sample(10)

Shape of data: (37, 2)


Unnamed: 0,Question,Source
62,I have three Douglas Fir trees in my yard whic...,AE
79,I have a few apple trees. The one has a bent o...,AE
95,How can I turn my garden into compost?,AE
106,I am inquiring if I may submit a soil sample a...,AE
42,I have had weevil infestation in my pantry whi...,AE
83,How to apply fertilizer to our Dwarf Norway Sp...,AE
50,"My milkweed have already bloomed, but now ther...",AE
65,We would like to prepare the soil for a vegeta...,AE
61,I'm interested in finding a local source for v...,AE
33,Anyone know what this is on our bushes? Looks ...,AE
