In [2]:
import pandas as pd
import numpy as np

import json
import os

from string import punctuation as pn

In [13]:
# Data directory, given relative to the notebook's working directory.
PATH = '../data/askextension/'
# Directory entries in alphabetical order, so positional indexing is stable.
FILE_NAMES = os.listdir(PATH)
FILE_NAMES.sort()

In [16]:
def remove_encodings_and_escapes(text):
    '''Return `text` reduced to ASCII, flattened to one line and trimmed.

    See: https://stackoverflow.com/a/53821967/5480536
    '''
    # Characters that cannot be encoded as ASCII are silently dropped.
    ascii_only = text.encode('ascii', 'ignore').decode()
    # Newlines become single spaces.
    one_line = ascii_only.replace('\n', ' ')
    # As written, this swaps a double quote for an identical double quote,
    # so it leaves the text unchanged.
    one_line = one_line.replace('"', '"')
    return one_line.strip()

In [17]:
def merge_scraped_data_files(source_files, target_file):
    """Combine the exported data files into one and write it out in readable format.

    Args:
        source_files: Iterable of paths to JSON files. Each file's parsed content
            is appended with ``list.extend``, so it is expected to be a JSON array.
        target_file: Path of the merged JSON file to write (opened in "w" mode,
            so an existing file is overwritten).

    The files are read and written as UTF-8 explicitly, so the result does not
    depend on the platform's default text encoding.
    """
    data = []
    for exported_data_file in source_files:
        with open(exported_data_file, encoding="utf-8") as f:
            data.extend(json.load(f))

    print(f"Writing merged file: {target_file}")
    with open(target_file, "w", encoding="utf-8") as f:
        # indent=2 keeps the merged file human-readable
        json.dump(data, f, indent=2)

In [18]:
def make_answer_a_list_and_clean_response(answer_dict):
    """Convert an answer dictionary into a list with cleaned responses.

    Args:
        answer_dict: Mapping whose keys are convertible to ``int``
            (e.g. ``'1'``, ``'2'``) and whose values are dicts that contain
            a ``"response"`` string.

    Returns:
        A list of the answers ordered by numeric key. Each item is a shallow
        copy of the original value with ``"response"`` stripped and passed
        through ``remove_encodings_and_escapes``. The input is not modified.
    """
    # Ordering by the numeric key (instead of writing into slot int(key) - 1)
    # also works when the keys are not exactly 1..n.
    ordered = sorted(answer_dict.items(), key=lambda item: int(item[0]))
    return [
        # clean the response up, on a copy so the caller's dict stays untouched
        {**value, "response": remove_encodings_and_escapes(value["response"].strip())}
        for _, value in ordered
    ]

In [None]:
def _clean_title(title):
    """Strip a trailing ' #number' and '...' from a title and ensure end punctuation."""
    head, sep, tail = title.rpartition("#")
    # Only drop what follows the last '#' when it really is a number, so a
    # title without such a suffix is kept instead of being emptied.
    if sep and tail.strip().isdigit():
        title = head
    # strip '...' from titles like https://dev.osticket.eduworks.com/kb/faq.php?id=7826
    title = title.strip().strip(".").strip()
    # add a '.' if it does not yet end with a punctuation (empty titles stay empty)
    if title and title[-1] not in pn:
        title += "."
    return title


def etl(path_data, max_word_count = None):
    """Load a ticket export and return the cleaned California tickets.

    Steps: read the JSON at ``path_data`` (indexed by ``faq-id``), keep rows
    whose ``state`` is ``'California'``, turn each ``answer`` dict into a
    cleaned list, add a ``url`` column, tidy the titles, then drop rows that
    are too short or (optionally) too long.

    Args:
        path_data: Input handed to ``pd.read_json``.
        max_word_count: If truthy, rows whose ``title-question`` or ``response``
            has more than this many whitespace-separated words are dropped.

    Returns:
        The filtered and cleaned ``pd.DataFrame``.
    """
    df = pd.read_json(path_data).set_index('faq-id')

    # strip all spaces (before filtering, so padded state names still match)
    for column in ["state", "title", "question"]:
        df[column] = df[column].str.strip()

    # for some reason only leave tickets from California county
    # .copy() makes the column assignments below act on an independent frame
    df = df[df["state"] == 'California'].copy()

    # transform answer for consistency with IPM data
    df["answer"] = [
        make_answer_a_list_and_clean_response(answer_dict)
        for answer_dict in df["answer"]
    ]

    # add the url
    df["url"] = [
        f"https://ask2.extension.org/kb/faq.php?id={faq_id}"
        for faq_id in df.index.tolist()
    ]

    #
    # clean titles
    #
    df["title"] = [_clean_title(title) for title in df["title"].tolist()]

    # The filters below read "title-question" and "response"; build them when
    # the export does not already provide them.
    # NOTE(review): assumes "title-question" is the title followed by the
    # question -- confirm.
    if "title-question" not in df.columns:
        df["title-question"] = [
            f"{title} {question}".strip()
            for title, question in zip(df["title"].tolist(), df["question"].tolist())
        ]
    # NOTE(review): assumes "response" is the text of all answers joined by a
    # space -- confirm.
    if "response" not in df.columns:
        df["response"] = [
            " ".join(answer.get("response", "") for answer in answers)
            for answers in df["answer"]
        ]

    #
    # Remove questions with less than 2 words in title-question
    # NOTE(review): the mask keeps rows with MORE than 2 words while the message
    # says "less than 2" -- confirm the intended threshold.
    rows_before = len(df)
    mask = [len(x.split()) > 2 for x in df["title-question"].tolist()]
    df = df[mask]
    print(f"Removed {rows_before - len(df)} title-question with less than 2 words")
    #
    # remove extremely long questions or responses:
    #
    if max_word_count:
        # drop rows with excessive word count in response or question
        # (count words, not characters; measure against the rows left after
        # the previous filter so the printed number covers this step only)
        rows_before = len(df)
        mask = [
            (len(q.split()) <= max_word_count and len(r.split()) <= max_word_count)
            for (q, r) in zip(df["title-question"].tolist(), df["response"].tolist())
        ]
        df = df[mask]
        print(f"Removed {rows_before - len(df)} with more than {max_word_count} words")

    return df

    

In [61]:
# Read the first listed file into a frame indexed by 'faq-id'.
TMP = f"{PATH}{FILE_NAMES[0]}"
df = pd.read_json(TMP)
df = df.set_index('faq-id')

In [65]:
# Display the 'title' column of the loaded frame.
df['title']

(11612,)

In [63]:
# Title and question of the row labelled 3, as a plain array.
df.loc[3, ['title', 'question']].values

array(['how would I know if my sheep have Enterotoxemia (Overeating Disease) of Sheep #109912',
       'how would I know if my sheep have Enterotoxemia (Overeating Disease) of Shee'],
      dtype=object)

In [25]:
# Keep only the rows whose state is California.
df = df.loc[df["state"].eq('California')]

In [41]:
# Rows that carry more than one answer.
tmp = df[df['answer'].apply(len) > 1]
# The answer dict of the row labelled 474 (explicit label-based lookup).
ans = tmp.loc[474, 'answer']
# Reuse the shared helper instead of repeating its loop in this cell.
answers = make_answer_a_list_and_clean_response(ans)
answers

[{'response': 'A first question would be: Does the EXISTING lawn consist of any undesirable or weedy grasses? If so, sod removal (scenario 1) is likely to leave behind enough of the old grass that it will grow back - along with the new sod. If undesirable or weedy PERENNIAL grasses (bermudagrass, quackgrass, kikuyagrass, etc) are growing in the current lawn, then the use of glyphosate (Roundup) to control them prior to resodding is essential.Obtaining control with Roundup/glyphosate is more difficult than the second landscaper suggests (whether you go with scenario 1 or 2). To effectively kill perennial grasses (weedy or desirable), it is best to NOT mow the grass "short" before applying the herbicide (the more leaf area, the better the control will be). Also, complete kill will take more than "a few days". The best results are obtained by making two applications of Roundup, about 10-14 days apart. Total kill of the existing grass may take, in other words, about 3-4 weeks.As for scenar

In [73]:
# (rows, columns) of the current frame.
df.shape

(11612, 8)

In [79]:
# Display the current frame.
df

array([['I would like to get an appraisal on several hundred acres of farmland in PickWay County half of which are leased out for farming.  The land is owned by a group of family members now ready to sell.  Where or how can I find qualified expert appraisers to h',
        '2013-07-22 22:42:10', '2013-07-22 22:57:35', list([]),
        'Colorado', 'Denver County',
        'I would like to appraise severl hundres acres in Pickaway County hal of which is being rented out for farming.  It is owned by a consortium of family member though an estate probated a while ago.  Where can I find qualified expert appraisers to hire help determine an appropriate selling price',
        {'1': {'response': 'Is this Pickaway County Ohio?Are you appraising the timber on the farm? Are you a member of the Consortium of Family Members?We would need to find the answers to these questions before we could find an expert\xa0 to answer your question.A link to the OSU Extension Office in Pickaway County can be fo

In [26]:
# Fixed random_state so the same 15 rows are shown on every run.
df.sample(15, random_state=0)

Unnamed: 0_level_0,title,created,updated,tags,state,county,question,answer
faq-id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
4769,jasmine? #136109,2013-06-22 21:30:05,2013-06-24 08:51:40,"[houseplants, horticulture]",California,San Luis Obispo County,i bought two jasmine plants a couple of years ...,{'1': {'response': 'Hi --There are many plants...
7494,Identifying a pest on my bell pepper plant #14...,2013-08-20 07:35:41,2013-09-04 14:11:35,"[fruits and vegetables, insect issues, horticu...",California,Fresno County,I just noticed a white pest on my bell pepper ...,{'1': {'response': 'Hello - these insects app...
10563,Hi I just recently am gr... #158942,2013-10-30 01:13:59,2013-11-01 15:52:18,"[fruits and vegetables, horticulture]",California,Los Angeles County,"Hi, I just recently am growing a new garden in...",{'1': {'response': 'Since you did not send a p...
4990,Where is the Equestrian Hemet Crash-Testing vi...,2013-06-27 04:43:43,2013-07-01 13:13:38,"[horses, helmet safety]",California,San Diego County,I've been reccommending the crash-testing vide...,"{'1': {'response': 'You can view the ""Every Ti..."
8595,Hi: I have 30 well establ... #151878,2013-09-09 20:52:54,2013-09-24 02:38:44,"[trees and shrubs, horticulture]",California,Los Angeles County,Hi: I have 30 well established Eugenia brush c...,{'1': {'response': 'Warning! Honey have been ...
3772,dehydrating slilghtly damaged peaches #131381,2013-06-01 20:50:20,2013-07-02 21:46:49,"[fruits and vegetables, food processing, home ...",California,Santa Barbara County,"From Santa Barbara, CA, which may have fog as ...",{'1': {'response': 'Fruit leathers is probably...
10381,Liquidating IRA for debt reduction/cash flow a...,2013-10-24 20:11:59,2013-11-04 13:51:37,"[ira, taxes, credit, home ownership, retirement]",California,Los Angeles County,Should I look into liquidating my IRA account ...,"{'1': {'response': 'Hi, there! This is a multi..."
6864,Should i toss my blackberry jam #145556,2013-08-06 15:10:54,2013-08-06 15:54:16,"[food processing, food safety]",California,San Francisco County,Hi. I made jam on Saturday... now I'm concerne...,{'1': {'response': 'I would like to have you t...
2462,Abnormal testicular placement #125181,2013-05-03 20:03:00,2013-05-03 20:49:58,[],California,San Diego County,I have an 11 month old colt. His left testicle...,{'1': {'response': 'It is very likely that the...
11507,Corn Plant with Yellow and Brown Spots #162733,2013-12-24 01:36:41,2014-01-13 03:47:38,"[corn, houseplants, indoor gardening, horticul...",California,Los Angeles County,Why does my corn plant have yellow spots and d...,{'1': {'response': 'The brown spots might be d...
