# Ask Extension data EDA

In [1]:
import pandas as pd

import json
import os

PATH = '../data/askextension/2020-08-20/'
FILE_NAMES = [PATH + f for f in sorted(os.listdir(PATH))]

## Data

Data constists 4 files:
- 2012-2014.json
- 2014-2016.json
- 2016-2018.json
- 2018-2020.json

And looks following way:

In [2]:
with open(FILE_NAMES[0]) as f:
    f = json.load(f)
    print(json.dumps(f[0], indent = 2))

{
  "faq-id": 3,
  "title": "When can I plant blue spruce trees in Colorado? #109900",
  "created": "2012-12-03 15:53:47",
  "updated": "2012-12-03 17:47:21",
  "tags": [
    "trees and shrubs"
  ],
  "state": "Colorado",
  "county": "El Paso County",
  "question": "I need to plant two blue spruce trees that are currently in 24\" diameter plastic containers with drain holes in the bottom sides.\n\nLocation: northeast side of Colorado Springs.\n\nThese trees are currently outside on the patio and susceptible to the wind and sun. The trees were watered this past Saturday and seem to be healthy.\n\nQuestion: Can these trees be planted now? Currently the soil is not frozen and night time temps are 35 to 40 degrees.\n\nI have downloaded and read CMG GardenNotes #633 as a reference.\n\nAny advice would be greatly appreciated. ",
  "answer": {
    "1": {
      "response": "Jerry, \nyou can plant them now (a) OR temporarily \"plant\" them, still in containers, so that roots have some insulatio

It is a list of dictionary objects with following fields:
- `faq-id` - ID of the ticket
- `title` - title of the ticket along ID of the ticket (__other__ ID)
- `created` - ticket creating date
- `updated` - ticket last update date
- `tags` - list of tags
- `state` - state ticket was created in
- `county` - county ticket was created in
- `question` - question that has been posted
- `answer` - response lists presented in numbered dictionary data type

## EDA

TO BE DONE

## ETL

Following transformations are performed:
- merge source into single JSON file
- filter out ticket from all states except _California_
- retrive ticket ID from title
- create URL of ticket
- clean long text columns - `title`, `question`, `answer`
- create new column out of `title` and `question`
- remove tickets that have less than 3 words in `title` and `quesiton` combined
- limit the number of characters to 300 for `title`, `question`, `answer` columns

Save result to ES JSON format to be injected to ES.

In [3]:
import sys
import re

from string import punctuation as pn

Modify `STATE_FILTER`, `MIN_WORD_COUNT`, and `MAX_STR_LEN` variables accordingly.

In [4]:
STATE_FILTER    = ['California']
MIN_WORD_COUNT  = 3
MAX_STR_LEN     = 1000
ASKEXTENSION_QUESTION_URL = 'https://ask2.extension.org/kb/faq.php?id='

# Combines the data files into one and returns it.
df = pd.DataFrame()
for f in FILE_NAMES:
    df = df.append(pd.read_json(f), ignore_index = True)
    
# Convert 'faq-id' to str type
df['faq-id'] = df['faq-id'].astype(str)

# Leave tickets from California state
df = df[df['state'].isin(STATE_FILTER)]

# Add the URL and leave blank URL for questions with no ID
df['url'] = [
    f"{ASKEXTENSION_QUESTION_URL}{ticket_no}" if len(ticket_no) == 6 else ""
    for ticket_no in df['title'].str.split('#').str[-1]
]

# Add the ticket number from title and leave blank for questions without
df['ticket-no'] = [
    ticket_no if len(ticket_no) == 6 else ""
    for ticket_no in df['title'].str.split('#').str[-1]
]

df.rename(columns = {'faq-id': 'faq_id', 'ticket-no': 'ticket_no'}, inplace = True)

def _clean(text):
    '''
    Fix encodings and remove escape and redundant whitespace characters from text.

    Examples with non-ascii characters - 110358, 147160
    Examples with redundant whitespace - 117069, 127760

    See: https://stackoverflow.com/a/53821967/5480536
    '''
    text = text.encode('ascii', 'ignore').decode()
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def _transform_answer(answer_dict):
    '''
    Convert answer field from a dictionary to a list.
    '''
    answers = [{}] * len(answer_dict)
    
    for k, v in answer_dict.items():
        # clean the response up
        v['response'] = _clean(v['response'])
        answers[int(k) - 1] = v
    
    return answers

# Transform answer for consistency with IPM data
df['answer'] = df['answer'].apply(_transform_answer)

# Strip all spaces and remove non-ascii characters from text fields
for column in ['state', 'title', 'question']:
    df[column] = df[column].apply(_clean)

def _transform_title(title):
    '''
    Remove question ID from title, and append '.' in the end
    if no punctuation was detected.

    Example with '#' - 437259
    Example with '...' - 437264
    '''
    title = ''.join(title.split('#')[:-1]).strip().strip('...')
    
    # add a '.' if it does not yet end with a punctuation
    title = title if (title and title[-1] in pn) else title + '.'
    
    return title

# Clean ID and '...' from title, and append punctuation if not present
df['title'] = df['title'].apply(_transform_title)

def _merge_title_question(df):
    '''
    Create new column from questions and title,
    but only if it is not already exactly in the question.
    '''
    titles      = df['title'    ].tolist()
    questions   = df['question' ].tolist()
    
    tqs = [
        question
        if (title and question.startswith(title[:-1]))
        else title + " " + question
        for (title, question) in zip(titles, questions)
    ]

    return tqs

# Create new column from `title` and `question`, or only question
# if title is exactly the question     
df['title_question'] = _merge_title_question(df)
    
# Remove questions with small number words in title-question
if MIN_WORD_COUNT:
    df = df[df['title_question'].str.split().str.len() > MIN_WORD_COUNT]


# Trim extremely long questions or responses, if constraint given:
if MAX_STR_LEN:
    df['question'       ] = df['question'       ].str[:MAX_STR_LEN]
    df['title_question' ] = df['title_question' ].str[:MAX_STR_LEN]
    
    answers = df['answer']
    for answer in answers:
        for response in answer:
            response['response'] = response['response'][:MAX_STR_LEN]
    df['answer'] = answers

df = df.loc[:, ['ticket_no', 'url', 'created', 'tags', 'title', 'question', 'title_question', 'answer']]
df.sample(5)

Unnamed: 0,ticket_no,url,created,tags,title,question,title_question,answer
69628,358972,https://ask2.extension.org/kb/faq.php?id=358972,2016-09-02 18:52:24,"[trees and shrubs, plant identification]",What is this tree?,This tree gives seed pods in the spring. Some ...,What is this tree? This tree gives seed pods i...,[{'response': 'Based on the almost feathery lo...
56890,322610,https://ask2.extension.org/kb/faq.php?id=322610,2016-05-18 05:51:31,"[plant identification, horticulture]",What is this plant?,This plant showed up in my garden. It has leav...,What is this plant? This plant showed up in my...,"[{'response': 'It is a variety of clarkia', 'a..."
59179,329110,https://ask2.extension.org/kb/faq.php?id=329110,2016-06-06 00:57:17,[tomato fungal leaf problem],tomato plant has yellow leaves with black spots.,"From my research, seems to be blight alternari...",tomato plant has yellow leaves with black spot...,"[{'response': 'To whom it may concern, Please ..."
50835,302862,https://ask2.extension.org/kb/faq.php?id=302862,2016-02-25 01:36:05,"[horticulture, plant identification]",What is this?,My fiance died. Left a wild garden I couldn't ...,What is this? My fiance died. Left a wild gard...,[{'response': 'These look more like an ornamen...
72165,366497,https://ask2.extension.org/kb/faq.php?id=366497,2016-09-29 00:29:19,[trees and shrubs],Mayten tree.,We planted a Mayten tree 5 months ago. The tre...,Mayten tree. We planted a Mayten tree 5 months...,[{'response': 'Your tree is a Maytenus boaria....


## Embedding text fields into vectors

In [5]:
sys.path.insert(1, os.path.realpath(os.path.pardir))

os.environ['STAGE']         = 'dev'
os.environ['ES_USERNAME']   = 'elastic'
os.environ['ES_PASSWORD']   = 'changeme'
os.environ['ES_HOST']       = 'https://dev.es.chat.ask.eduworks.com/'
os.environ['ES_IMITATE']    = 'false'

import config

INFO:config:----------------------------------------------
INFO:config:Environment variables are for DEV environment
INFO:config:----------------------------------------------
INFO:config:----------------------------------------------
INFO:config:Elasticsearch configuration:
INFO:config:- host                    = https://dev.es.chat.ask.eduworks.com/
INFO:config:- username                = elastic
INFO:config:- password                = changeme
INFO:config:- tfhub_embedding_url     = https://tfhub.dev/google/universal-sentence-encoder/4
INFO:config:- tfhub_cache_dir         = /var/tmp/tfhub_modules
INFO:config:----------------------------------------------
INFO:config:----------------------------------------------
INFO:config:Elasticsearch indexes:
INFO:config:- askextension index      = askextension
INFO:config:- combined index          = combined
INFO:config:- problem index           = problem
INFO:config:- information index       = information
INFO:config:-------------------------

In [6]:
# ---------------------------------------- Title-question field embedding
c_list      = df['title_question'].tolist()
c_vectors   = config.embed(c_list).numpy().tolist()
df['title_question_vector'] = c_vectors
# ---------------------------------------- Tags field embedding
tags = []
for ts in df['tags']:
    if len(ts) == 0:
        tags.append([])
    else:
        ts_vector = config.embed(ts).numpy().tolist()
        tags.append([{'tag': ts[i], 'tag_vector': ts_vector[i]} for i in range(len(ts))])
df['tags'] = tags
# ---------------------------------------- Answers field embedding
tmp = df['answer']
for i, _ in enumerate(tmp):
    if len(tmp.iloc[i]) > 0:
        c_vectors = config.embed([i['response'] for i in tmp.iloc[i]]).numpy().tolist()
        for j, _ in enumerate(tmp.iloc[i]):
            tmp.iloc[i][j]['response_vector'] = c_vectors[j]

df.sample(5)

Unnamed: 0,ticket_no,url,created,tags,title,question,title_question,answer,title_question_vector
51772,306230,https://ask2.extension.org/kb/faq.php?id=306230,2016-03-10 22:27:10,"[{'tag': 'plant identification', 'tag_vector':...",Plant ID.,"What plant is this? Very furry, gray green fol...","Plant ID. What plant is this? Very furry, gray...",[{'response': 'I believe the plant you have is...,"[-0.06001272797584534, 0.048514965921640396, 0..."
29376,229715,https://ask2.extension.org/kb/faq.php?id=229715,2015-03-05 17:16:19,"[{'tag': 'wildlife damage management', 'tag_ve...",Mountain Lion mass kills.,I live on a large ranch. We have 150 ewes. A w...,Mountain Lion mass kills. I live on a large ra...,[{'response': 'I'm sorry to hear this. Having ...,"[-0.029407260939478874, -0.045724883675575256,..."
97845,450598,https://ask2.extension.org/kb/faq.php?id=450598,2018-04-30 16:08:56,"[{'tag': 'poison oak', 'tag_vector': [0.012958...",How to eliminate poison oak toxins from garden...,Can poison oak toxin be eradicated from compos...,How to eliminate poison oak toxins from garden...,[{'response': 'Thank you for your question. Un...,"[0.0028989508282393217, -0.011688973754644394,..."
70899,362769,https://ask2.extension.org/kb/faq.php?id=362769,2016-09-15 04:00:09,"[{'tag': 'community health leadership', 'tag_v...",Exploitation of immigrants in the U.S.,Are there resources where I can find data conc...,Exploitation of immigrants in the U.S. Are the...,[{'response': 'https://www.google.com/webhp?so...,"[0.017593543976545334, -0.02420208230614662, 0..."
80916,398133,https://ask2.extension.org/kb/faq.php?id=398133,2017-05-06 16:27:49,"[{'tag': 'spiders', 'tag_vector': [-0.00066743...",Spider found in Nevada County CA.,Can you help me indentify this spider,Spider found in Nevada County CA. Can you help...,[{'response': 'It might be a cross web weaver ...,"[-0.033190738409757614, 0.03486495837569237, 0..."


In [7]:
askextension_mapping = {
    "settings": {
    "number_of_shards": 1
    },
    "mappings": {
        "properties": {
            "ticket_no" : {"type": "text"},
            "url"       : {"type": "text"},
            "created"   : {"type": "text"},
            "tags"      : {
                "type"      : "nested",
                "properties": {
                    "tag"       : {"type": "text"},
                    "tag_vector": {"type": "dense_vector", "dims": 512}
                }
            },
            "title"     : {"type": "text"},
            "question"  : {"type": "text"},
            "title_question"        : {"type": "text"},
            "title_question_vector" : {"type": "dense_vector", "dims": 512},
            "answer"    : {
                "type"      : "nested",
                "properties": {
                    "response"  : {"type": "text"},
                    "author"    : {"type": "text"},
                    "response_vector"   : {"type": "dense_vector", "dims": 512}
                }
            }
        }
    }
}


df_json = df.to_dict('records')

In [10]:
from elasticsearch import Elasticsearch
from elasticsearch.helpers import parallel_bulk

from collections import deque

# increase the timeout if necessary
es_client = Elasticsearch([config.es_host], http_auth=(config.es_username, config.es_password), timeout = 20)

es_client.indices.delete(
    index   = config.es_askextension_index, 
    ignore  = 404)
es_client.indices.create(
    index       = config.es_askextension_index              , 
    settings    = config.ES_ASKEXTENSION_MAPPING['settings'],   
    mappings    = config.ES_ASKEXTENSION_MAPPING['mappings'])        
deque(parallel_bulk(es_client, actions = df_json, index = config.es_askextension_index), maxlen = 0)

es_client.indices.refresh()

INFO:elasticsearch:GET https://dev.es.chat.ask.eduworks.com:443/ [status:200 request:0.871s]
INFO:elasticsearch:DELETE https://dev.es.chat.ask.eduworks.com:443/askextension [status:200 request:0.359s]
INFO:elasticsearch:PUT https://dev.es.chat.ask.eduworks.com:443/askextension [status:200 request:0.615s]
INFO:elasticsearch:POST https://dev.es.chat.ask.eduworks.com:443/askextension/_bulk [status:200 request:7.656s]
INFO:elasticsearch:POST https://dev.es.chat.ask.eduworks.com:443/askextension/_bulk [status:200 request:3.024s]
INFO:elasticsearch:POST https://dev.es.chat.ask.eduworks.com:443/askextension/_bulk [status:200 request:12.945s]
INFO:elasticsearch:POST https://dev.es.chat.ask.eduworks.com:443/askextension/_bulk [status:200 request:15.126s]
INFO:elasticsearch:POST https://dev.es.chat.ask.eduworks.com:443/askextension/_bulk [status:200 request:16.894s]
INFO:elasticsearch:POST https://dev.es.chat.ask.eduworks.com:443/_refresh [status:200 request:0.234s]


{'_shards': {'total': 2, 'successful': 1, 'failed': 0}}