# ETL for the scoring data (questions and corresponding links to sources from UC IPM)

In [None]:
from urllib.parse import urlparse

def _uri_validator(x):
    try:
        result = urlparse(x)
        return all([result.scheme, result.netloc])
    except:
        return False

In [None]:
import pandas as pd

# Raw inputs: the CSV files holding the questions and their links.
UCIPM_DATA = './data/raw/UCIPM_Chatbot_Core_Questions 2020_08_19.xlsx - Questions with the correct link.csv'
AE_DATA = './data/raw/AE_test_QA_chatbot.xlsx - Questions with the correct link.csv'

# Outputs: pickled DataFrames written by later cells.
VALID_DATA = './data/transformed/valid_questions.pkl'
NA_DATA = './data/transformed/na_questions.pkl'

# Start with the UCIPM file.
df = pd.read_csv(filepath_or_buffer=UCIPM_DATA)

# Preview ten random rows (kept as the last expression so the notebook
# renders it).
df.sample(n=10)

## ETL for questions with answer links

In [None]:
# Keep only the rows flagged as crawled.  The explicit copy makes the column
# assignments below act on an independent frame instead of a slice of the raw
# one (which triggers pandas' SettingWithCopyWarning).
print(f'Size of DF (before): {df.shape}')
df = df[df['Crawled'] == 'Y'].copy()
print(f'Size of DF (after ): {df.shape}')

# Cells may span several lines: keep the first line of each question and
# treat every line of the URL cell as a separate link.
df['Question'] = df['Question'].apply(lambda x: x.split('\n')[0])
df['URL'] = df['URL'].apply(lambda x: x.split('\n'))
# Drop everything from the first '?' onwards (the query string).
df['URL'] = df['URL'].apply(lambda x: [x1.split('?')[0] for x1 in x])

# Validate every link.  `urlparse` returns a non-empty tuple for any string,
# so testing its truthiness can never fail; `_uri_validator` actually checks
# that a scheme and a host are present.
bad_urls = [u for urls in df['URL'] for u in urls if not _uri_validator(u)]
if bad_urls:
    raise ValueError(f'Invalid URL(s) in UCIPM data: {bad_urls}')

df['Source'] = 'UCIPM'

ucipm = df[['Question', 'Pest', 'URL', 'Source']]

ucipm.sample(10)

In [None]:
# Rebind `df` to the raw contents of the AE file.
df = pd.read_csv(filepath_or_buffer=AE_DATA)

# Show ten random rows.
df.sample(n=10)

In [None]:
# Keep only the rows that carry a real link: URL present and not the
# 'no_answer' placeholder.  The explicit copy makes the column assignments
# below act on an independent frame instead of a slice of the raw one (which
# triggers pandas' SettingWithCopyWarning).
print(f'Size of DF (before): {df.shape}')
has_link = df['URL'].notna() & (df['URL'] != 'no_answer')
df = df[has_link].copy()
print(f'Size of DF (after ): {df.shape}')

# Each line of the URL cell is a separate link; drop everything from the
# first '?' onwards (the query string).
df['URL'] = df['URL'].apply(lambda x: x.split('\n'))
df['URL'] = df['URL'].apply(lambda x: [x1.split('?')[0] for x1 in x])

# Validate every link.  `urlparse` returns a non-empty tuple for any string,
# so testing its truthiness can never fail; `_uri_validator` actually checks
# that a scheme and a host are present.
bad_urls = [u for urls in df['URL'] for u in urls if not _uri_validator(u)]
if bad_urls:
    raise ValueError(f'Invalid URL(s) in AE data: {bad_urls}')

df['Source'] = 'AE'
# Empty 'Pest' column so the frame has the same columns as `ucipm`.
df['Pest'] = ''

ae = df[['Question', 'Pest', 'URL', 'Source']]

ae.sample(10)

In [None]:
# Stack the UCIPM and AE question frames into one.
# NOTE(review): each frame keeps its original row labels, so the combined
# index may contain duplicates - confirm downstream code does not rely on
# unique labels.
df = pd.concat(objs=[ucipm, ae])

# Persist the combined frame, then report its size and preview it.
df.to_pickle(path=VALID_DATA)
print(f'Shape of final DF: {df.shape}')
df.sample(n=10)

In [None]:
# Read the valid-questions pickle back from disk and inspect it.
df = pd.read_pickle(filepath_or_buffer=VALID_DATA)
print(f'Shape of data: {df.shape}')
df.sample(n=10)

## ETL for `NA` and `no_answer` questions 

In [None]:
# Reload the raw AE file; `df` was rebound by the earlier cells.
df = pd.read_csv(filepath_or_buffer=AE_DATA)

# Show ten random rows.
df.sample(n=10)

In [None]:
# Select the questions that have no answer link: URL missing (NaN) OR set to
# the 'no_answer' placeholder.  This is the complement of the filter used for
# the valid questions.  The previous condition, `isna() & (!= 'no_answer')`,
# only ever matched NaN rows and silently dropped the 'no_answer' ones.
print(f'Size of DF (before): {df.shape}')
no_link = df['URL'].isna() | (df['URL'] == 'no_answer')
# Explicit copy so the column assignment below acts on an independent frame
# instead of a slice (which triggers pandas' SettingWithCopyWarning).
df = df[no_link].copy()
print(f'Size of DF (after ): {df.shape}')

df['Source'] = 'AE'

df = df[['Question', 'Source']]

df.sample(10)

In [None]:
# Persist the unanswered-question frame, then report its size and preview it.
df.to_pickle(path=NA_DATA)
print(f'Shape of final DF: {df.shape}')
df.sample(n=10)

In [None]:
# Read the unanswered-questions pickle back from disk and inspect it.
df = pd.read_pickle(filepath_or_buffer=NA_DATA)
print(f'Shape of data: {df.shape}')
df.sample(n=10)