# ETL for the scoring data (questions and corresponding links to sources from UC IPM)

In [1]:
from urllib.parse import urlparse

def _uri_validator(x):
    try:
        result = urlparse(x)
        return all([result.scheme, result.netloc])
    except:
        return False

In [2]:
import pandas as pd

UCIPM_DATA  = './data/raw/UCIPM_Chatbot_Core_Questions 2020_08_19.xlsx - Questions with correct link.csv'
AE_DATA     = './data/raw/AE_test_QA_chatbot.xlsx - Questions with correct link.csv'
FINAL_DATA  = './data/transformed/result.pkl'

df = pd.read_csv(UCIPM_DATA)

df.sample(10)

Unnamed: 0,Pest,Question,Expected answer reference(s),URL,Crawled,Notes,Alternative
16,carpet beetles,Where are the little beetles on my window sill...,Pest Notes: Carpet Beetles,http://ipm.ucanr.edu/PMG/PESTNOTES/pn7436.html...,Y,,
48,weeds in garden,What's the best way to get rid of weeds in a l...,Pest Notes: Weed Management in landscapes,http://ipm.ucanr.edu/PMG/PESTNOTES/pn7441.html...,Y,,
45,deer,Is there a way to keep deer out of my garden?,Pest Notes: Deer,http://ipm.ucanr.edu/PMG/PESTNOTES/pn74117.htm...,Y,,
41,houseplants,Why are the leaves on my houseplant turning br...,Pest Notes: Houseplant Problems,http://ipm.ucanr.edu/PMG/PESTNOTES/pn74172.htm...,Y,,
10,ants in kitchen,How do I get rid of ants in my kitchen?,Pest Notes: Ants,http://ipm.ucanr.edu/PMG/PESTNOTES/pn7411.html...,Y,,
23,weeds in lawn,How do I kill weeds without killing my lawn?,Pest Notes: Weed Management in Lawns,http://ipm.ucanr.edu/PMG/PESTNOTES/pn74113.htm...,Y,,
18,mysterious bites,What is biting me in bed?,Pest Notes: Bed Bugs\nPest Notes: Conenose Bug...,http://ipm.ucanr.edu/PMG/PESTNOTES/pn7454.html...,Y,,
4,Oriental cockroach,What are the big black bugs on my patio at night?,Pest Notes: Cockroaches,http://ipm.ucanr.edu/PMG/PESTNOTES/pn7467.html...,Y,,
33,Palm diseases,What's wrong with my palm tree?\n[needed infor...,Pest Notes: Palm Diseases in the Landscape,http://ipm.ucanr.edu/PMG/PESTNOTES/pn74148.htm...,Y,,
36,pocket gophers,How do you set a gopher trap?,Pest Notes: Pocket Gophers\nvideo library,http://ipm.ucanr.edu/PMG/PESTNOTES/pn7433.html...,Y,,


In [3]:
print(f'Size of DF (before): {df.shape}')
df = df[df['Crawled'] == 'Y']
print(f'Size of DF (after ): {df.shape}')


df['Question'   ] = df['Question'   ].apply(lambda x: x.split('\n')[0]                              )
df['URL'        ] = df['URL'        ].apply(lambda x: x.split('\n')                                 )
df['URL'        ] = df['URL'        ].apply(lambda x: [x1.split('?')[0] for x1 in x]                )

for urls in df['URL']:
    for u in urls:
        assert urlparse(u)

df['ExpectedAnswer' ] = df['Expected answer reference(s)'   ]
df['ExpectedAnswer' ] = df['ExpectedAnswer'                 ].apply(lambda x: x.split('\n'))


df['Source'         ] = 'UCIPM'

ucipm = df[['Question', 'ExpectedAnswer', 'URL', 'Source']]

ucipm.sample(10)

Size of DF (before): (50, 7)
Size of DF (after ): (42, 7)


Unnamed: 0,Question,ExpectedAnswer,URL,Source
18,What is biting me in bed?,"[Pest Notes: Bed Bugs, Pest Notes: Conenose Bu...",[http://ipm.ucanr.edu/PMG/PESTNOTES/pn7454.htm...,UCIPM
34,If my palm tree has little black spots on it d...,[Pest Notes: Palm Diseases in the Landscape],[http://ipm.ucanr.edu/PMG/PESTNOTES/pn74148.html],UCIPM
46,An animal is eating all the plants in my garde...,[Pest Notes: Deer],[http://ipm.ucanr.edu/PMG/PESTNOTES/pn74117.html],UCIPM
20,Will powdery mildew kill my zucchini?,[Pest Notes: Powdery Mildew on Vegetables],[http://ipm.ucanr.edu/PMG/PESTNOTES/pn7406.html],UCIPM
43,How do you keep mosquitoes away?,[Pest Notes: Mosquitoes],[http://ipm.ucanr.edu/PMG/PESTNOTES/pn7451.html],UCIPM
5,Are Oriental cockroaches harmful?,[Pest Notes: Cockroaches],[http://ipm.ucanr.edu/PMG/PESTNOTES/pn7467.html],UCIPM
23,How do I kill weeds without killing my lawn?,[Pest Notes: Weed Management in Lawns],[http://ipm.ucanr.edu/PMG/PESTNOTES/pn74113.html],UCIPM
22,I planted seeds but they never came up. What h...,[Pest Notes: Damping off Diseases in the Garden ],[http://ipm.ucanr.edu/PMG/PESTNOTES/pn74132.html],UCIPM
16,Where are the little beetles on my window sill...,[Pest Notes: Carpet Beetles],[http://ipm.ucanr.edu/PMG/PESTNOTES/pn7436.html],UCIPM
41,Why are the leaves on my houseplant turning br...,[Pest Notes: Houseplant Problems],[http://ipm.ucanr.edu/PMG/PESTNOTES/pn74172.html],UCIPM


In [4]:
df = pd.read_csv(AE_DATA)

df.sample(10)

Unnamed: 0,Question,URL,Crawled,OtherURL
45,What can I use to get rid of aphids on my toma...,http://ipm.ucanr.edu/PMG/PESTNOTES/pn7404.html...,Y,
53,My azaleas have an infestation of what looks l...,http://ipm.ucanr.edu/PMG/PESTNOTES/pn7428.html...,Y,
47,How can we control the fruit flies we have in ...,https://www2.ipm.ucanr.edu/agriculture/caneber...,Y,
24,During blooming time both my apricot and plum ...,https://www2.ipm.ucanr.edu/agriculture/apricot...,Y,
13,"I have a Mountain Ash tree, about 15 years old...",http://ipm.ucanr.edu/QT/ladybeetlescard.html\r...,N,Y
21,On the leaves of my new bougainvillea there ar...,http://ipm.ucanr.edu/PMG/GARDEN/PLANTS/bougain...,Y,
18,I have a bunch of aphids on our rose bushes. W...,http://ipm.ucanr.edu/PMG/PESTNOTES/pn7404.html,Y,
38,There are a lot of flies on my persimmon tree....,http://ipm.ucanr.edu/PMG/GARDEN/FRUIT/PESTS/fr...,Y,
54,I want to release lady bugs in my garden with ...,http://ipm.ucanr.edu/QT/ladybeetlescard.html\r...,Y,
0,"I keep a colony of springtails, and mites have...",http://ipm.ucanr.edu/PMG/invertebrates/links.m...,Y,


In [5]:
print(f'Size of DF (before): {df.shape}')
df = df[df['URL'].notna()       ]
df = df[df['URL'] != 'no_answer']
print(f'Size of DF (after ): {df.shape}')

df['URL'] = df['URL'].apply(lambda x: x.split('\n')                 )
df['URL'] = df['URL'].apply(lambda x: [x1.split('?')[0] for x1 in x])

for urls in df['URL']:
    for u in urls:
        assert urlparse(u)

df['Source'] = 'AE'

ae = df[['Question', 'URL', 'Source']]

ae.sample(10)

Size of DF (before): (61, 4)
Size of DF (after ): (55, 4)


Unnamed: 0,Question,URL,Source
17,What is the little raised white ‘dots’ on the ...,[http://ipm.ucanr.edu/PMG/PESTNOTES/pn7410.html],AE
8,The trunk of my maple was hollow with a lot of...,[http://ipm.ucanr.edu/PMG/PESTNOTES/pn7416.htm...,AE
11,"I have tried several ""organic"" solutions to tr...",[http://ipm.ucanr.edu/PMG/PESTNOTES/pn7404.htm...,AE
47,How can we control the fruit flies we have in ...,[https://www2.ipm.ucanr.edu/agriculture/canebe...,AE
53,My azaleas have an infestation of what looks l...,[http://ipm.ucanr.edu/PMG/PESTNOTES/pn7428.htm...,AE
23,We have a few cucumber beetles that are eating...,"[http://ipm.ucanr.edu/PMG/r116300511.html\r, h...",AE
52,I have owned an indoor plant (and do not know ...,[http://ipm.ucanr.edu/PMG/PESTNOTES/pn74174.ht...,AE
35,How do I get rid of flea beetle infestation on...,[http://ipm.ucanr.edu/PMG/GARDEN/VEGES/PESTS/f...,AE
12,Found very large white grub looking worm in fi...,[http://ipm.ucanr.edu/PMG/GARDEN/PLANTS/INVERT...,AE
46,I have an artichoke plant and it has all these...,[http://ipm.ucanr.edu/PMG/PESTNOTES/pn7404.htm...,AE


In [8]:
df = pd.concat([ucipm, ae])

df.to_pickle(FINAL_DATA)
print(f'Shape of final DF: {df.shape}')
df.sample(10)

Shape of final DF: (97, 4)


Unnamed: 0,Question,ExpectedAnswer,URL,Source
47,I have a garden full of weeds. How do I clear ...,[Pest Notes: Weed Management in landscapes],[http://ipm.ucanr.edu/PMG/PESTNOTES/pn7441.html],UCIPM
0,"I keep a colony of springtails, and mites have...",,[http://ipm.ucanr.edu/PMG/invertebrates/links....,AE
33,Anyone know what this is on our bushes? Looks ...,,[https://www2.ipm.ucanr.edu/agriculture/apple/...,AE
53,My azaleas have an infestation of what looks l...,,[http://ipm.ucanr.edu/PMG/PESTNOTES/pn7428.htm...,AE
17,What is the little raised white ‘dots’ on the ...,,[http://ipm.ucanr.edu/PMG/PESTNOTES/pn7410.html],AE
39,I have a problem with fruit flies laying their...,,[https://www2.ipm.ucanr.edu/agriculture/canebe...,AE
32,How do you stop blossom end rot on tomatoes?,[Blossom end rot],[http://ipm.ucanr.edu/PMG/GARDEN/VEGES/ENVIRON...,UCIPM
9,There is an army of little black ants on my ki...,,[http://ipm.ucanr.edu/PMG/PESTNOTES/pn7411.html],AE
14,There are tons of little green bugs on my rose...,"[Pest Notes: Aphids, Pest Notes: Roses: Insect...",[http://ipm.ucanr.edu/PMG/PESTNOTES/pn7404.htm...,UCIPM
20,Will powdery mildew kill my zucchini?,[Pest Notes: Powdery Mildew on Vegetables],[http://ipm.ucanr.edu/PMG/PESTNOTES/pn7406.html],UCIPM


In [9]:
df = pd.read_pickle(FINAL_DATA)
df

Unnamed: 0,Question,ExpectedAnswer,URL,Source
0,How do you treat peach leaf curl?,[Pest Notes: Peach Leaf Curl],[http://ipm.ucanr.edu/PMG/PESTNOTES/pn7426.html],UCIPM
1,How can I get rid of peach leaf curl?,[Pest Notes: Peach Leaf Curl],[http://ipm.ucanr.edu/PMG/PESTNOTES/pn7426.html],UCIPM
2,If my nectarine tree looks like it has peach l...,[Pest Notes: Peach Leaf Curl],[http://ipm.ucanr.edu/PMG/PESTNOTES/pn7426.html],UCIPM
3,What is the best way to get rid of peach leaf ...,[Pest Notes: Peach Leaf Curl],[http://ipm.ucanr.edu/PMG/PESTNOTES/pn7426.html],UCIPM
4,What are the big black bugs on my patio at night?,[Pest Notes: Cockroaches],[http://ipm.ucanr.edu/PMG/PESTNOTES/pn7467.html],UCIPM
...,...,...,...,...
56,I just noticed an outbreak of small green leaf...,,[http://ipm.ucanr.edu/PMG/PESTNOTES/pn7473.htm...,AE
57,My pear tree looks bad with black spots on lea...,,[http://ipm.ucanr.edu/PMG/GARDEN/FRUIT/PESTS/p...,AE
58,I have a black mold looking fungus growing on ...,,[http://ipm.ucanr.edu/PMG/r102100611.html],AE
59,Does Orthene Systemic insecticide work to cont...,,[https://www2.ipm.ucanr.edu/Invasive-and-Exoti...,AE
