# Ask Extension data EDA

In [1]:
import pandas as pd

import json
import os

In [2]:
# Directory holding the Ask Extension JSON dumps (trailing slash required,
# because file names are appended to it directly).
PATH = '../data/askextension/2020-08-20/'

# Paths of all files in that directory, in alphabetical order.
FILE_NAMES = [f"{PATH}{file_name}" for file_name in sorted(os.listdir(PATH))]

## Data

The data consists of 4 files:
- 2012-2014.json
- 2014-2016.json
- 2016-2018.json
- 2018-2020.json

Each file looks like this:

In [3]:
# Print the first record of the first file to illustrate the data layout.
# The encoding is given explicitly so the result does not depend on the
# platform default (the data contains non-ASCII characters), and the file
# handle is not rebound to the parsed content.
with open(FILE_NAMES[0], encoding = 'utf-8') as source_file:
    records = json.load(source_file)

print(json.dumps(records[0], indent = 2))

{
  "faq-id": 3,
  "title": "When can I plant blue spruce trees in Colorado? #109900",
  "created": "2012-12-03 15:53:47",
  "updated": "2012-12-03 17:47:21",
  "tags": [
    "trees and shrubs"
  ],
  "state": "Colorado",
  "county": "El Paso County",
  "question": "I need to plant two blue spruce trees that are currently in 24\" diameter plastic containers with drain holes in the bottom sides.\n\nLocation: northeast side of Colorado Springs.\n\nThese trees are currently outside on the patio and susceptible to the wind and sun. The trees were watered this past Saturday and seem to be healthy.\n\nQuestion: Can these trees be planted now? Currently the soil is not frozen and night time temps are 35 to 40 degrees.\n\nI have downloaded and read CMG GardenNotes #633 as a reference.\n\nAny advice would be greatly appreciated. ",
  "answer": {
    "1": {
      "response": "Jerry, \nyou can plant them now (a) OR temporarily \"plant\" them, still in containers, so that roots have some insulatio

It is a list of dictionary objects with the following fields:
- `faq-id` - ID of the ticket
- `title` - title of the ticket followed by the ticket number (a __different__ ID from `faq-id`)
- `created` - ticket creation date
- `updated` - date of the last ticket update
- `tags` - list of tags
- `state` - state the ticket was created in
- `county` - county the ticket was created in
- `question` - question that has been posted
- `answer` - responses, stored as a dictionary keyed by response number

## EDA

TO BE DONE

## ETL

The following transformations are performed:
- merge the source files into a single data set
- filter out tickets from all states except _California_
- retrieve the ticket ID from the title
- create the URL of the ticket
- clean the long text columns - `title`, `question`, `answer`
- create a new column out of `title` and `question`
- remove tickets that have fewer than 3 words (`MIN_WORD_COUNT`) in `title` and `question` combined
- limit the number of characters to `MAX_STR_LEN` for the `question`, `title_question`, and `answer` texts

The result is then converted to JSON records and indexed into Elasticsearch (ES).

In [4]:
import sys
import os
import re

from string import punctuation as pn

Modify `STATE_FILTER`, `MIN_WORD_COUNT`, and `MAX_STR_LEN` variables accordingly.

In [5]:
# Only tickets whose `state` is in this list are kept.
STATE_FILTER = ['California']

# Word-count threshold applied to the combined title + question text;
# a falsy value disables the filter.
MIN_WORD_COUNT = 3

# Maximum number of characters kept for `question`, `title_question` and each
# answer response; a falsy value disables the trimming.
MAX_STR_LEN = 1000

# Prefix of a ticket URL; the ticket number is appended to it.
ASKEXTENSION_QUESTION_URL = 'https://ask2.extension.org/kb/faq.php?id='

# Read every source file and combine them into a single DataFrame.
# `DataFrame.append` was removed in pandas 2.0 and copied the whole frame on
# every iteration; one `pd.concat` over all files gives the same result.
df = pd.concat(
    [pd.read_json(file_name) for file_name in FILE_NAMES],
    ignore_index = True,
)
    
# Convert 'faq-id' to str type
df['faq-id'] = df['faq-id'].astype(str)

# Keep only tickets from the states listed in STATE_FILTER. The explicit copy
# makes the column assignments below operate on an independent frame rather
# than on a slice of the unfiltered data (avoids SettingWithCopyWarning).
df = df[df['state'].isin(STATE_FILTER)].copy()

# The ticket number is the 6-digit token that follows the last '#' at the very
# end of the title. It is extracted once and reused for both columns. Requiring
# an actual '#' and digits prevents a short title without an ID (or with a
# 6-character non-numeric tail) from being mistaken for a ticket number.
# Titles without a ticket number (including missing titles) yield "".
ticket_numbers = (
    df['title']
    .str.extract(r'#([0-9]{6})\Z', expand = False)
    .fillna('')
)

# Add the URL and leave blank URL for questions with no ID
df['url'] = [
    f"{ASKEXTENSION_QUESTION_URL}{ticket_no}" if ticket_no else ""
    for ticket_no in ticket_numbers
]

# Add the ticket number from title and leave blank for questions without
df['ticket-no'] = ticket_numbers

def _clean(text):
    '''
    Fix encodings and remove escape and redundant whitespace characters from text.

    Examples with non-ascii characters - 110358, 147160
    Examples with redundant whitespace - 117069, 127760

    See: https://stackoverflow.com/a/53821967/5480536
    '''
    text = text.encode('ascii', 'ignore').decode()
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def _transform_answer(answer_dict):
    '''
    Convert answer field from a dictionary to a list.
    '''
    answers = [{}] * len(answer_dict)
    
    for k, v in answer_dict.items():
        # clean the response up
        v['response'] = _clean(v['response'])
        answers[int(k) - 1] = v
    
    return answers

# Turn the numbered answer dictionaries into lists (consistent with IPM data)
df['answer'] = df['answer'].map(_transform_answer)

# Normalise whitespace and drop non-ASCII characters in the plain text fields
for text_column in ('state', 'title', 'question'):
    df[text_column] = df[text_column].map(_clean)

def _transform_title(title):
    '''
    Remove question ID from title, and append '.' in the end
    if no punctuation was detected.

    Example with '#' - 437259
    Example with '...' - 437264
    '''
    title = "".join(title.split('#')[:-1]).strip().strip('...')
    
    # add a '.' if it does not yet end with a punctuation
    title = title if (title and title[-1] in pn) else title + '.'
    
    return title

# Strip the ID and '...' from every title and make sure it ends with punctuation
df['title'] = df['title'].map(_transform_title)

def _merge_title_question(df):
    '''
    Create new column from questions and title,
    but only if it is not already exactly in the question.
    '''
    titles      = df["title"].tolist()
    questions   = df["question"].tolist()
    
    tqs = [
        question
        if (title and question.startswith(title[:-1]))
        else title + " " + question
        for (title, question) in zip(titles, questions)
    ]

    return tqs

# Create new column from `title` and `question`, or only question
# if title is exactly the question
df['title_question'] = _merge_title_question(df)

# Remove questions with too few words in title-question: a ticket is kept when
# it has at least MIN_WORD_COUNT words (the previous `>` comparison also
# dropped tickets with exactly MIN_WORD_COUNT words). The copy keeps the
# assignments below from writing to a slice of the unfiltered frame.
if MIN_WORD_COUNT:
    word_counts = df['title_question'].str.split().str.len()
    df = df[word_counts >= MIN_WORD_COUNT].copy()

# Trim extremely long questions or responses, if constraint given:
if MAX_STR_LEN:
    df['question'] = df['question'].str[:MAX_STR_LEN]
    df['title_question'] = df['title_question'].str[:MAX_STR_LEN]

    # Build trimmed copies of the response dictionaries instead of editing
    # the objects stored in the frame in place.
    df['answer'] = [
        [
            {**response, 'response': response['response'][:MAX_STR_LEN]}
            for response in answer
        ]
        for answer in df['answer']
    ]

# Keep only the columns that are indexed, in a fixed order
df = df.loc[:, ['ticket-no', 'url', 'created', 'tags', 'title', 'question', 'title_question', 'answer']]
df.sample(5)

Unnamed: 0,ticket-no,url,created,tags,title,question,title_question,answer
100469,458357,https://ask2.extension.org/kb/faq.php?id=458357,2018-05-31 14:07:15,[plant identification],Orange Globs.,These hard shiny globs have been on this rosem...,Orange Globs. These hard shiny globs have been...,[{'response': 'Thank you for your question. I ...
5809,140675,https://ask2.extension.org/kb/faq.php?id=140675,2013-07-15 02:33:30,"[fruits and vegetables, gardening, insect issu...",Bugs on my zucchini.,Planted zucchini and cucumber for the first ti...,Bugs on my zucchini. Planted zucchini and cucu...,"[{'response': 'Hi, thanks for contacting us. I..."
38700,262072,https://ask2.extension.org/kb/faq.php?id=262072,2015-07-15 13:46:45,[],extension offices.,How can I locate an extension office near me?T...,extension offices. How can I locate an extensi...,[{'response': 'Here is the web site for your c...
25145,214073,https://ask2.extension.org/kb/faq.php?id=214073,2014-09-24 16:15:27,[],A- 1 zoning.,if you have property in an A-1 zoning witch is...,A- 1 zoning. if you have property in an A-1 zo...,[{'response': 'Typically agriculture is define...
51803,306378,https://ask2.extension.org/kb/faq.php?id=306378,2016-03-11 15:52:05,"[insect identification, insect issues]",Bug eating my plants.,Is this bug eating my plants It make a ball sh...,Bug eating my plants. Is this bug eating my pl...,[{'response': 'This critter is not eating your...


## Embedding text fields into vectors

In [6]:
# Put the parent directory of the current working directory on the module
# search path so that `config` can be imported from there
# (presumably the project's config module lives one level up - TODO confirm).
sys.path.insert(1, os.path.realpath(os.path.pardir))

# Environment consumed by `config`. These must be set BEFORE the import below:
# the log emitted on import echoes exactly these values.
# NOTE(review): credentials are hardcoded here and the password is echoed in
# the cell output - acceptable only for a local dev instance; supply real
# credentials from outside the notebook.
os.environ['STAGE']         = 'dev'
os.environ['ES_USERNAME']   = 'elastic'
os.environ['ES_PASSWORD']   = 'changeme'
os.environ['ES_HOST']       = 'http://localhost:9200/'
os.environ['ES_IMITATE']    = 'false'

# Imported here rather than at the top because it depends on the environment
# variables and the search path configured above.
import config

INFO:config:----------------------------------------------
INFO:config:Environment variables are for DEV environment
INFO:config:----------------------------------------------
INFO:config:----------------------------------------------
INFO:config:Elasticsearch configuration:
INFO:config:- host                    = http://localhost:9200/
INFO:config:- username                = elastic
INFO:config:- password                = changeme
INFO:config:- tfhub_embedding_url     = https://tfhub.dev/google/universal-sentence-encoder/4
INFO:config:- tfhub_cache_dir         = /var/tmp/tfhub_modules
INFO:config:----------------------------------------------
INFO:config:----------------------------------------------
INFO:config:Elasticsearch indexes:
INFO:config:- askextension index      = askextension
INFO:config:- combined index          = combined
INFO:config:- problem index           = problem
INFO:config:- information index       = information
INFO:config:----------------------------------------

In [7]:
# ---------------------------------------- Title-question field embedding
# One embedding call for all combined title + question texts.
title_question_texts = df['title_question'].tolist()
df['title_question_vector'] = config.embed(title_question_texts).numpy().tolist()

# ---------------------------------------- Tags field embedding
# Each tag list becomes a list of {'tag', 'tag_vector'} dictionaries;
# rows without tags keep an empty list.
embedded_tags = []
for tag_names in df['tags']:
    if len(tag_names) == 0:
        embedded_tags.append([])
        continue
    tag_vectors = config.embed(tag_names).numpy().tolist()
    embedded_tags.append([
        {'tag': tag_name, 'tag_vector': tag_vectors[position]}
        for position, tag_name in enumerate(tag_names)
    ])
df['tags'] = embedded_tags

# ---------------------------------------- Answers field embedding
# The response dictionaries stored in the frame are extended in place with
# a 'response_vector' entry; rows without responses are skipped.
for responses in df['answer']:
    if len(responses) > 0:
        response_texts = [response['response'] for response in responses]
        response_vectors = config.embed(response_texts).numpy().tolist()
        for position, response in enumerate(responses):
            response['response_vector'] = response_vectors[position]

df.sample(5)

Unnamed: 0,ticket-no,url,created,tags,title,question,title_question,answer,title_question_vector
123572,581904,https://ask2.extension.org/kb/faq.php?id=581904,2019-07-20 18:26:46,"[{'tag': 'wildlife', 'tag_vector': [-0.0603984...",Help us identify animal scat in backyard.,We have had this animal scat show up in our fe...,Help us identify animal scat in backyard. We h...,[{'response': 'Thank you for your question. Ou...,"[-0.04714950546622276, 0.058499403297901154, -..."
31151,236284,https://ask2.extension.org/kb/faq.php?id=236284,2015-04-11 23:30:00,"[{'tag': 'trees and shrubs', 'tag_vector': [-0...",We have two bushes-smal.,"We have two ""bushes""-small evergreen trees wit...","We have two bushes-smal. We have two ""bushes""-...",[{'response': 'I could not tell what the plant...,"[-0.06957925856113434, -0.03359018638730049, 0..."
2729,126507,https://ask2.extension.org/kb/faq.php?id=126507,2013-05-11 00:17:39,"[{'tag': 'honeybees', 'tag_vector': [0.0040176...",Scarlet Pimpernel and honeybees.,I have read that the scarlet pimpernel flower ...,Scarlet Pimpernel and honeybees. I have read t...,[{'response': 'The scarlet pimpernel does not ...,"[0.049440860748291016, 0.006719441618770361, 0..."
72522,367614,https://ask2.extension.org/kb/faq.php?id=367614,2016-10-03 22:40:16,"[{'tag': 'weeds', 'tag_vector': [0.03885586932...",Italian arum.,How do I eradicate this species? It has a arro...,Italian arum. How do I eradicate this species?...,[{'response': 'Sounds like a very hard one to ...,"[-0.009997852146625519, -0.02012038044631481, ..."
113863,547025,https://ask2.extension.org/kb/faq.php?id=547025,2019-03-14 02:35:10,[],very small Bug found in home.,"seemed to drop from the ceiling, landing on my...",very small Bug found in home. seemed to drop f...,[{'response': 'This is a dermestid beetle. Not...,"[-0.07425697147846222, 0.03894948214292526, 0...."


In [8]:
# Dimension declared for both dense_vector fields of the index.
_VECTOR_DIMS = 512

# Fields of every entry of the nested `answer` list.
_answer_properties = {
    "response": {"type": "text"},
    "author": {"type": "text"},
    "response_vector": {"type": "dense_vector", "dims": _VECTOR_DIMS},
}

# Top-level fields of a ticket document: plain fields are indexed as text,
# followed by the title-question vector and the nested answers.
_ticket_properties = {
    field_name: {"type": "text"}
    for field_name in (
        "ticket-no", "url", "created", "title", "question", "title_question",
    )
}
_ticket_properties["title_question_vector"] = {
    "type": "dense_vector",
    "dims": _VECTOR_DIMS,
}
_ticket_properties["answer"] = {
    "type": "nested",
    "properties": _answer_properties,
}

# Index definition (settings + mappings) for the Ask Extension index.
askextension_mapping = {
    "settings": {"number_of_shards": 1},
    "mappings": {"properties": _ticket_properties},
}


# One dictionary per ticket (column name -> value), used as the bulk actions
df_json = df.to_dict(orient = 'records')

In [10]:
from elasticsearch import Elasticsearch
from elasticsearch.helpers import parallel_bulk

from collections import deque


index_name = config.es_askextension_index

es_client = Elasticsearch(
    [config.es_host],
    http_auth = (config.es_username, config.es_password),
)

# Recreate the index from scratch: drop it if it exists (a 404 for a missing
# index is ignored) and create it again with the settings and mappings above.
es_client.indices.delete(index = index_name, ignore = 404)
es_client.indices.create(
    index       = index_name,
    settings    = askextension_mapping['settings'],
    mappings    = askextension_mapping['mappings'],
)

# parallel_bulk returns a generator, so nothing is sent until it is consumed;
# a zero-length deque drains it without keeping the per-document results.
bulk_results = parallel_bulk(es_client, actions = df_json, index = index_name)
deque(bulk_results, maxlen = 0)

INFO:elasticsearch:GET http://localhost:9200/ [status:200 request:0.002s]
INFO:elasticsearch:DELETE http://localhost:9200/askextension [status:404 request:0.001s]
INFO:elasticsearch:PUT http://localhost:9200/askextension [status:200 request:0.056s]
INFO:elasticsearch:POST http://localhost:9200/askextension/_bulk [status:200 request:1.174s]
INFO:elasticsearch:POST http://localhost:9200/askextension/_bulk [status:200 request:1.123s]
INFO:elasticsearch:POST http://localhost:9200/askextension/_bulk [status:200 request:0.276s]
INFO:elasticsearch:POST http://localhost:9200/askextension/_bulk [status:200 request:1.026s]
INFO:elasticsearch:POST http://localhost:9200/askextension/_bulk [status:200 request:1.429s]


deque([])