# Ask Extension data EDA

In [1]:
import pandas as pd

import json
import os

PATH = '../data/askextension/2020-08-20/'
FILE_NAMES = [PATH + f for f in sorted(os.listdir(PATH))]

## Data

Data constists 4 files:
- 2012-2014.json
- 2014-2016.json
- 2016-2018.json
- 2018-2020.json

And looks following way:

In [2]:
with open(FILE_NAMES[0]) as f:
    f = json.load(f)
    print(json.dumps(f[0], indent = 2))

{
  "faq-id": 3,
  "title": "When can I plant blue spruce trees in Colorado? #109900",
  "created": "2012-12-03 15:53:47",
  "updated": "2012-12-03 17:47:21",
  "tags": [
    "trees and shrubs"
  ],
  "state": "Colorado",
  "county": "El Paso County",
  "question": "I need to plant two blue spruce trees that are currently in 24\" diameter plastic containers with drain holes in the bottom sides.\n\nLocation: northeast side of Colorado Springs.\n\nThese trees are currently outside on the patio and susceptible to the wind and sun. The trees were watered this past Saturday and seem to be healthy.\n\nQuestion: Can these trees be planted now? Currently the soil is not frozen and night time temps are 35 to 40 degrees.\n\nI have downloaded and read CMG GardenNotes #633 as a reference.\n\nAny advice would be greatly appreciated. ",
  "answer": {
    "1": {
      "response": "Jerry, \nyou can plant them now (a) OR temporarily \"plant\" them, still in containers, so that roots have some insulatio

It is a list of dictionary objects with following fields:
- `faq-id` - ID of the ticket
- `title` - title of the ticket along ID of the ticket (__other__ ID)
- `created` - ticket creating date
- `updated` - ticket last update date
- `tags` - list of tags
- `state` - state ticket was created in
- `county` - county ticket was created in
- `question` - question that has been posted
- `answer` - response lists presented in numbered dictionary data type

## EDA

TO BE DONE

## ETL

Following transformations are performed:
- merge source into single JSON file
- filter out ticket from all states except _California_
- retrive ticket ID from title
- create URL of ticket
- clean long text columns - `title`, `question`, `answer`
- create new column out of `title` and `question`
- remove tickets that have less than 3 words in `title` and `quesiton` combined
- limit the number of characters to 300 for `title`, `question`, `answer` columns

Save result to ES JSON format to be injected to ES.

In [3]:
import sys
import re

from string import punctuation as pn

Modify `STATE_FILTER`, `MIN_WORD_COUNT`, and `MAX_STR_LEN` variables accordingly.

In [4]:
STATE_FILTER    = ['California']
MIN_WORD_COUNT  = 3
MAX_STR_LEN     = 1000
ASKEXTENSION_QUESTION_URL = 'https://ask2.extension.org/kb/faq.php?id='

# Combines the data files into one and returns it.
df = pd.DataFrame()
for f in FILE_NAMES:
    df = df.append(pd.read_json(f), ignore_index = True)
    
# Convert 'faq-id' to str type
df['faq-id'] = df['faq-id'].astype(str)

# Leave tickets from California state
df = df[df['state'].isin(STATE_FILTER)]

# Add the URL and leave blank URL for questions with no ID
df['url'] = [
    f"{ASKEXTENSION_QUESTION_URL}{ticket_no}" if len(ticket_no) == 6 else ""
    for ticket_no in df['title'].str.split('#').str[-1]
]

# Add the ticket number from title and leave blank for questions without
df['ticket-no'] = [
    ticket_no if len(ticket_no) == 6 else ""
    for ticket_no in df['title'].str.split('#').str[-1]
]

df.rename(columns = {'faq-id': 'faq_id', 'ticket-no': 'ticket_no'}, inplace = True)

def _clean(text):
    '''
    Fix encodings and remove escape and redundant whitespace characters from text.

    Examples with non-ascii characters - 110358, 147160
    Examples with redundant whitespace - 117069, 127760

    See: https://stackoverflow.com/a/53821967/5480536
    '''
    text = text.encode('ascii', 'ignore').decode()
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def _transform_answer(answer_dict):
    '''
    Convert answer field from a dictionary to a list.
    '''
    answers = [{}] * len(answer_dict)
    
    for k, v in answer_dict.items():
        # clean the response up
        v['response'] = _clean(v['response'])
        answers[int(k) - 1] = v
    
    return answers

# Transform answer for consistency with IPM data
df['answer'] = df['answer'].apply(_transform_answer)

# Strip all spaces and remove non-ascii characters from text fields
for column in ['state', 'title', 'question']:
    df[column] = df[column].apply(_clean)

def _transform_title(title):
    '''
    Remove question ID from title, and append '.' in the end
    if no punctuation was detected.

    Example with '#' - 437259
    Example with '...' - 437264
    '''
    title = ''.join(title.split('#')[:-1]).strip().strip('...')
    
    # add a '.' if it does not yet end with a punctuation
    title = title if (title and title[-1] in pn) else title + '.'
    
    return title

# Clean ID and '...' from title, and append punctuation if not present
df['title'] = df['title'].apply(_transform_title)

def _merge_title_question(df):
    '''
    Create new column from questions and title,
    but only if it is not already exactly in the question.
    '''
    titles      = df['title'    ].tolist()
    questions   = df['question' ].tolist()
    
    tqs = [
        question
        if (title and question.startswith(title[:-1]))
        else title + " " + question
        for (title, question) in zip(titles, questions)
    ]

    return tqs

# Create new column from `title` and `question`, or only question
# if title is exactly the question     
df['title_question'] = _merge_title_question(df)
    
# Remove questions with small number words in title-question
if MIN_WORD_COUNT:
    df = df[df['title_question'].str.split().str.len() > MIN_WORD_COUNT]


# Trim extremely long questions or responses, if constraint given:
if MAX_STR_LEN:
    df['question'       ] = df['question'       ].str[:MAX_STR_LEN]
    df['title_question' ] = df['title_question' ].str[:MAX_STR_LEN]
    
    answers = df['answer']
    for answer in answers:
        for response in answer:
            response['response'] = response['response'][:MAX_STR_LEN]
    df['answer'] = answers

df = df.loc[:, ['ticket_no', 'url', 'created', 'tags', 'title', 'question', 'title_question', 'answer']]
df.sample(5)

Unnamed: 0,ticket_no,url,created,tags,title,question,title_question,answer
122350,577721,https://ask2.extension.org/kb/faq.php?id=577721,2019-07-08 01:41:54,[plums],Green plum oozing.,"I am in Cupertino, California. This is July 7,...","Green plum oozing. I am in Cupertino, Californ...",[{'response': 'Thank you for your question. Yo...
74730,377104,https://ask2.extension.org/kb/faq.php?id=377104,2016-11-15 06:33:29,[],Grafting Cherry trees.,I have a bing that no longer has a pollinator ...,Grafting Cherry trees. I have a bing that no l...,[{'response': 'Late. March is the best time fo...
67075,351537,https://ask2.extension.org/kb/faq.php?id=351537,2016-08-09 15:28:25,[],Is my redwood too far gone to save?,The mandated water restrictions have my 2 redw...,Is my redwood too far gone to save? The mandat...,"[{'response': 'Unfortunately, redwoods like a ..."
56010,320032,https://ask2.extension.org/kb/faq.php?id=320032,2016-05-09 16:59:33,"[plant identification, wildflowers and native ...",Need plant identification.,Please identify possible poisonous plan,Need plant identification. Please identify pos...,[{'response': 'This is a tomatillo (Physalis i...
116559,557177,https://ask2.extension.org/kb/faq.php?id=557177,2019-04-30 19:48:10,[snake identification],Rattlesnake or Gopher snake?,"Hello, I live in Geyserville, California and t...","Rattlesnake or Gopher snake? Hello, I live in ...",[{'response': 'Thank you for your question. Yo...


## Embedding text fields into vectors

In [5]:
sys.path.insert(1, os.path.realpath(os.path.pardir))

os.environ['STAGE']         = 'dev'
os.environ['ES_USERNAME']   = 'elastic'
os.environ['ES_PASSWORD']   = 'changeme'
os.environ['ES_HOST']       = 'http://localhost:9200/'
os.environ['ES_IMITATE']    = 'false'

import config

INFO:config:----------------------------------------------
INFO:config:Environment variables are for DEV environment
INFO:config:----------------------------------------------
INFO:config:----------------------------------------------
INFO:config:Elasticsearch configuration:
INFO:config:- host                    = http://localhost:9200/
INFO:config:- username                = elastic
INFO:config:- password                = changeme
INFO:config:- tfhub_embedding_url     = https://tfhub.dev/google/universal-sentence-encoder/4
INFO:config:- tfhub_cache_dir         = /var/tmp/tfhub_modules
INFO:config:----------------------------------------------
INFO:config:----------------------------------------------
INFO:config:Elasticsearch indexes:
INFO:config:- askextension index      = askextension
INFO:config:- combined index          = combined
INFO:config:- problem index           = problem
INFO:config:- information index       = information
INFO:config:----------------------------------------

In [6]:
# ---------------------------------------- Title-question field embedding
c_list      = df['title_question'].tolist()
c_vectors   = config.embed(c_list).numpy().tolist()
df['title_question_vector'] = c_vectors
# ---------------------------------------- Tags field embedding
tags = []
for ts in df['tags']:
    if len(ts) == 0:
        tags.append([])
    else:
        ts_vector = config.embed(ts).numpy().tolist()
        tags.append([{'tag': ts[i], 'tag_vector': ts_vector[i]} for i in range(len(ts))])
df['tags'] = tags
# ---------------------------------------- Answers field embedding
tmp = df['answer']
for i, _ in enumerate(tmp):
    if len(tmp.iloc[i]) > 0:
        c_vectors = config.embed([i['response'] for i in tmp.iloc[i]]).numpy().tolist()
        for j, _ in enumerate(tmp.iloc[i]):
            tmp.iloc[i][j]['response_vector'] = c_vectors[j]

df.sample(5)

Unnamed: 0,ticket_no,url,created,tags,title,question,title_question,answer,title_question_vector
131738,610931,https://ask2.extension.org/kb/faq.php?id=610931,2020-01-20 19:33:23,"[{'tag': 'daphne', 'tag_vector': [-0.028326660...",A disease on daphne.,This is a picture of my sister's Daphne. The l...,A disease on daphne. This is a picture of my s...,"[{'response': 'Thank you for your question, Si...","[-0.021090388298034668, 0.05817939713597298, -..."
105774,473423,https://ask2.extension.org/kb/faq.php?id=473423,2018-07-26 00:27:41,[],those who have worked less than 40 quarters.,those who have worked for less than 40 quarter...,those who have worked less than 40 quarters. t...,[{'response': 'In order to receive social secu...,"[-0.01615593209862709, -0.04780679568648338, -..."
69628,358972,https://ask2.extension.org/kb/faq.php?id=358972,2016-09-02 18:52:24,"[{'tag': 'trees and shrubs', 'tag_vector': [-0...",What is this tree?,This tree gives seed pods in the spring. Some ...,What is this tree? This tree gives seed pods i...,[{'response': 'Based on the almost feathery lo...,"[-0.007034368813037872, -0.0024008476175367832..."
48935,295294,https://ask2.extension.org/kb/faq.php?id=295294,2016-01-01 23:26:55,"[{'tag': 'plant identification', 'tag_vector':...",What vine is this?,It's very fast growing,What vine is this? It's very fast growing,[{'response': 'Pandorea jasminoides variegated...,"[0.0050316727720201015, -0.004384475760161877,..."
102868,464803,https://ask2.extension.org/kb/faq.php?id=464803,2018-06-23 20:45:53,"[{'tag': 'insect identification', 'tag_vector'...",What bug is this.,Hi I live in cali and found this on my bedroom...,What bug is this. Hi I live in cali and found ...,[{'response': 'I cannot be sure from the image...,"[-0.06687523424625397, 0.08102244138717651, 0...."


In [7]:
askextension_mapping = {
    "settings": {
    "number_of_shards": 1
    },
    "mappings": {
        "properties": {
            "ticket_no" : {"type": "text"},
            "url"       : {"type": "text"},
            "created"   : {"type": "text"},
            "tags"      : {
                "type"      : "nested",
                "properties": {
                    "tag"       : {"type": "text"},
                    "tag_vector": {"type": "dense_vector", "dims": 512}
                }
            },
            "title"     : {"type": "text"},
            "question"  : {"type": "text"},
            "title_question"        : {"type": "text"},
            "title_question_vector" : {"type": "dense_vector", "dims": 512},
            "answer"    : {
                "type"      : "nested",
                "properties": {
                    "response"  : {"type": "text"},
                    "author"    : {"type": "text"},
                    "response_vector"   : {"type": "dense_vector", "dims": 512}
                }
            }
        }
    }
}


df_json = df.to_dict('records')

In [8]:
from elasticsearch import Elasticsearch
from elasticsearch.helpers import parallel_bulk

from collections import deque


es_client = Elasticsearch([config.es_host], http_auth=(config.es_username, config.es_password))

es_client.indices.delete(
    index   = config.es_askextension_index, 
    ignore  = 404)
es_client.indices.create(
    index       = config.es_askextension_index              , 
    settings    = config.ES_ASKEXTENSION_MAPPING['settings'],   
    mappings    = config.ES_ASKEXTENSION_MAPPING['mappings'])        
deque(parallel_bulk(es_client, actions = df_json, index = config.es_askextension_index), maxlen = 0)

es_client.indices.refresh()

INFO:elasticsearch:GET http://localhost:9200/ [status:200 request:0.004s]
INFO:elasticsearch:DELETE http://localhost:9200/askextension [status:200 request:0.040s]
INFO:elasticsearch:PUT http://localhost:9200/askextension [status:200 request:0.062s]
INFO:elasticsearch:POST http://localhost:9200/askextension/_bulk [status:200 request:1.373s]
INFO:elasticsearch:POST http://localhost:9200/askextension/_bulk [status:200 request:1.611s]
INFO:elasticsearch:POST http://localhost:9200/askextension/_bulk [status:200 request:0.389s]
INFO:elasticsearch:POST http://localhost:9200/askextension/_bulk [status:200 request:1.250s]
INFO:elasticsearch:POST http://localhost:9200/askextension/_bulk [status:200 request:1.324s]
INFO:elasticsearch:POST http://localhost:9200/_refresh [status:200 request:0.100s]




{'_shards': {'total': 24, 'successful': 17, 'failed': 0}}