# Ask Extension data EDA

In [1]:
import pandas as pd
import numpy as np

import json
import os

from string import punctuation as pn

In [2]:
# Directory with the raw Ask Extension JSON dumps; it keeps its trailing slash
# so that entry names can be appended to it directly.
PATH = '../data/askextension/2020-08-20/'
# Every directory entry, prefixed with PATH, in sorted name order.
FILE_NAMES = [f'{PATH}{entry}' for entry in sorted(os.listdir(PATH))]

## Data

The data consists of 4 files:
- 2012-2014.json
- 2014-2016.json
- 2016-2018.json
- 2018-2020.json

Each file looks like this (first record of the first file shown):

In [7]:
# Pretty-print the first record of the first source file to show the schema.
# The parsed data gets its own name instead of rebinding the file handle, and
# the encoding is explicit so the read does not depend on the platform default.
with open(FILE_NAMES[0], encoding='utf-8') as source_file:
    records = json.load(source_file)
print(json.dumps(records[0], indent = 2))

{
  "faq-id": 3,
  "title": "When can I plant blue spruce trees in Colorado? #109900",
  "created": "2012-12-03 15:53:47",
  "updated": "2012-12-03 17:47:21",
  "tags": [
    "trees and shrubs"
  ],
  "state": "Colorado",
  "county": "El Paso County",
  "question": "I need to plant two blue spruce trees that are currently in 24\" diameter plastic containers with drain holes in the bottom sides.\n\nLocation: northeast side of Colorado Springs.\n\nThese trees are currently outside on the patio and susceptible to the wind and sun. The trees were watered this past Saturday and seem to be healthy.\n\nQuestion: Can these trees be planted now? Currently the soil is not frozen and night time temps are 35 to 40 degrees.\n\nI have downloaded and read CMG GardenNotes #633 as a reference.\n\nAny advice would be greatly appreciated. ",
  "answer": {
    "1": {
      "response": "Jerry, \nyou can plant them now (a) OR temporarily \"plant\" them, still in containers, so that roots have some insulatio

It is a list of dictionary objects with the following fields:
- `faq-id` - ID of the ticket
- `title` - title of the ticket followed by a second ticket ID (__different__ from `faq-id`)
- `created` - ticket creation date
- `updated` - date of the last ticket update
- `tags` - list of tags
- `state` - state the ticket was created in
- `county` - county the ticket was created in
- `question` - the question that was posted
- `answer` - the responses, stored as a dictionary keyed by response number

## ETL

The following transformations are performed:
- merge the source files into a single JSON file
- filter out tickets from all states except _California_
- retrieve the ticket ID from the title
- create the URL of the ticket
- clean the long text columns - `title`, `question`, `answer`
- create a new column out of `title` and `question`
- remove tickets that have fewer than 3 words in `title` and `question` combined
- limit the number of characters to 300 for the `title`, `question`, `answer` columns

Save the result in ES JSON format so that it can be loaded into ES.

## EDA

TO BE DONE

In [55]:
# Load the transformed tickets (presumably the output of the ETL step described above).
df = pd.read_json("../data/transformed/askextension_transformed.json")

In [56]:
# Peek at the first ten rows of the transformed frame.
tmp = df.head(10)
tmp

Unnamed: 0,index,faq-id,ticket-no,url,created,updated,state,county,title,question,title-question,answer
0,0,93908,437288,https://ask2.extension.org/kb/faq.php?id=437288,2018-01-02 12:17:46,2018-01-04 18:39:35,California,Monterey County,Best coop for backyard chickens.,Can you suggest a coop for my backyard Id like...,Best coop for backyard chickens. Can you sugge...,"[There is no ""best"" chicken coop. When housing..."
1,1,93903,437706,https://ask2.extension.org/kb/faq.php?id=437706,2018-01-09 19:01:42,2018-01-09 19:46:37,California,Los Angeles County,Small black bugs with little white colored wings.,Woke up this morning and found these small ins...,Small black bugs with little white colored win...,[I cannot identify the insect from the picture...
2,2,93889,437799,https://ask2.extension.org/kb/faq.php?id=437799,2018-01-11 03:52:59,2018-01-11 15:37:51,California,Los Angeles County,Browning Sequoia Sappling.,Hello! I have a sequoia sappling that has some...,Browning Sequoia Sappling. Hello! I have a seq...,[Thank you for your sequioa question. Browning...
3,3,93987,437923,https://ask2.extension.org/kb/faq.php?id=437923,2018-01-13 01:33:18,2018-01-16 00:21:06,California,Siskiyou County,What type of snake is this?,"Hello, I was recently out walking near the cre...","What type of snake is this? Hello, I was recen...",[Thank you for your question. The snake in you...
4,4,93840,438008,https://ask2.extension.org/kb/faq.php?id=438008,2018-01-14 20:48:29,2018-01-21 01:02:27,California,Los Angeles County,Name this spider.,I have this awesome spider necklace that I got...,Name this spider. I have this awesome spider n...,"[Dear Client, I am not certain what species of..."
5,5,93887,438028,https://ask2.extension.org/kb/faq.php?id=438028,2018-01-15 01:22:04,2018-01-23 15:53:02,California,Los Angeles County,fruit tree lychee.,is chicken manure good for lychee trees,fruit tree lychee. is chicken manure good for ...,[Thank you for your question about fertilizing...
6,6,94171,438120,https://ask2.extension.org/kb/faq.php?id=438120,2018-01-16 16:48:29,2018-01-22 20:23:02,California,Lake County,Oregon State's Agriculture department farm.,I have been accepted into Oregon State Univers...,Oregon State's Agriculture department farm. I ...,"[I assume you mean an Animal Sciences major, w..."
7,7,94315,438222,https://ask2.extension.org/kb/faq.php?id=438222,2018-01-17 23:54:35,2018-01-25 22:51:23,California,Santa Clara County,Do phorid flies penetrate human skin?,"If we do in fact have phorid flies, they seem ...",Do phorid flies penetrate human skin? If we do...,[The best way to find a solution to your probl...
8,8,94300,439233,https://ask2.extension.org/kb/faq.php?id=439233,2018-01-31 11:26:46,2018-01-31 17:18:18,California,Contra Costa County,Plant comparability.,Hello I am planning a growing desert roses and...,Plant comparability. Hello I am planning a gro...,[Thank you for your question about desert rose...
9,9,94302,439353,https://ask2.extension.org/kb/faq.php?id=439353,2018-02-01 17:00:27,2018-02-01 18:39:00,California,San Diego County,Splitting oranges.,"I have a 50 year old naval orange tree , they ...",Splitting oranges. I have a 50 year old naval ...,[Thank you for your question about the splitti...


In [57]:
import config
import index

# Shared handles from the project's `config` module: the Elasticsearch client
# and the text-embedding callable used by the cells below.
es_client, embed = config.es_client, config.embed

In [58]:
import json
from typing import Dict

import numpy as np
import pandas as pd
from elasticsearch.helpers import parallel_bulk

from collections import deque

def create_index(index_name: str, mapping: Dict) -> None:
    '''
    Drop the index if it is already there, then create it again.
    :param index_name: Name of the index.
    :param mapping: Index definition passed as the body of the create request.
    '''
    indices = es_client.indices
    # Start clean: remove a previous copy, tolerating 404 when there is none.
    indices.delete(index=index_name, ignore=404)
    # HTTP 400 responses from the create call are ignored rather than raised.
    indices.create(index=index_name, body=mapping, ignore=400)

def populate_index(path: str, index_name: str, limit = 10) -> None:
    '''
    Populate an index from a JSON file.
    :param path: The path to the JSON file (read with `pd.read_json`).
    :param index_name: Name of the index to which documents should be written.
    :param limit: Maximum number of leading rows to index; `None` indexes all rows.
    '''
    # NaN is swapped for None before the rows are turned into documents.
    # `.copy()` detaches the slice from the full frame so that adding the
    # vector column below never writes into a view of it.
    df = pd.read_json(path).replace({np.nan: None}).iloc[:limit].copy()
    df['title-question-vector'] = _get_embed(df['title-question'])
    documents = df.to_dict('records')

    # parallel_bulk is lazy; draining it into a zero-length deque runs the
    # upload without keeping the per-document results.
    deque(parallel_bulk(es_client, documents, index = index_name), maxlen = 0)
    # Refresh only the index that was written so the count below sees the new documents.
    es_client.indices.refresh(index = index_name)
    print('Done')
    # `index` goes by keyword: positional API arguments are deprecated in the client.
    print(es_client.cat.count(index = index_name, params={"format": "json"})[0]['count'])

def _get_embed(series: pd.Series) -> list:
    '''
    Embed every entry of a series with the module-level `embed` callable.
    :param series: Texts to embed.
    :return: List with one element per row of the array produced by `embed`.
    '''
    texts = series.tolist()
    vectors = embed(texts).numpy()
    return list(vectors)


def query(text: str, size: int = 3) -> list:
    '''
    Search the Ask Extension index for tickets whose title-question vector is
    most similar (cosine similarity) to the embedding of `text`.
    :param text: Free-text query to embed and search for.
    :param size: Maximum number of hits to return.
    :return: The `hits.hits` list of the search response.
    '''
    # Embed the argument itself (not the function object) and convert the
    # vector to a plain list so it serializes as a JSON array.
    q_vector = embed([text]).numpy()[0].tolist()
    script_query = {
        "script_score": {
            "query": {"match_all": {}},
            "script": {
                # The field is referenced by name, matching the standalone query cell in this notebook.
                # Adding 1.0 keeps the score non-negative, since cosine similarity can be as low as -1.
                "source": "cosineSimilarity(params.query_vector, 'title-question-vector') + 1.0",
                "params": {"query_vector": q_vector}
            }
        }
    }

    response = es_client.search(
        index=index.ASKEXTENSION_INDEX,
        body={
            "size": size,
            "query": script_query,
            # Only the readable fields are returned, not the stored vector.
            "_source": {"includes": ["title", "question",]}
        }
    )
    return response['hits']['hits']

In [59]:
# Recreate the Ask Extension index, then load at most 10,000 tickets into it.
create_index(index_name=index.ASKEXTENSION_INDEX, mapping=index.ASKEXTENSION_MAPPING)
populate_index(path=config.ASKEXTENSION_FILE_RESULT, index_name=index.ASKEXTENSION_INDEX, limit=10000)

  es_client.indices.create(index=index_name, ignore=400, body=mapping)
2022-02-14 21:26:06.080776: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 218991360 exceeds 10% of free system memory.


Done
2171


  print(es_client.cat.count(index_name, params={"format": "json"})[0]['count'])


In [60]:
# Document count of the index; `index` is passed by keyword because positional
# API arguments are deprecated in the Elasticsearch client.
es_client.cat.count(index=index.ASKEXTENSION_INDEX, params={"format": "json"})

  es_client.cat.count(index.ASKEXTENSION_INDEX, params={"format": "json"})


[{'epoch': '1644852396', 'timestamp': '15:26:36', 'count': '2171'}]

In [61]:
df = pd.read_json(config.ASKEXTENSION_FILE_RESULT).replace({np.nan: None}).iloc[:10]
# Search for the title that is printed rather than a hand-copied literal
# (the literal carried a stray trailing tab).
first_title = df['title'][0]
print(first_title)
resp = es_client.search(index=index.ASKEXTENSION_INDEX, query={"match": {"title": first_title}})
resp['hits']['hits']

Best coop for backyard chickens.


[{'_index': 'askextension',
  '_type': '_doc',
  '_id': 'mHLX-H4BQevW0jV3TbAZ',
  '_score': 27.37522,
  '_source': {'index': 0,
   'faq-id': 93908,
   'ticket-no': 437288,
   'url': 'https://ask2.extension.org/kb/faq.php?id=437288',
   'created': '2018-01-02 12:17:46',
   'updated': '2018-01-04 18:39:35',
   'state': 'California',
   'county': 'Monterey County',
   'title': 'Best coop for backyard chickens.',
   'question': 'Can you suggest a coop for my backyard Id like to have 3 to 4 chicken',
   'title-question': 'Best coop for backyard chickens. Can you suggest a coop for my backyard Id like to have 3 to 4 chicken',
   'answer': ['There is no "best" chicken coop. When housing chickens you simply need a place that the chickens can get away from the weather, the don\'t like wind and rain. The housing system must be able to keep predators out, both wild animals and domestic dogs, the birds prefer to sleep on roosts that are off t'],
   'title-question-vector': [-0.002772571984678507,


In [22]:
# Full question text of the row labelled 3.
print(df.loc[3, 'question'])

Hello, I was recently out walking near the creek and noticed a snake Ive never seen before in this area. I live in Northern California and have never seen this type of snake before.


In [62]:
# Named `query_text` so the string does not rebind the `query` name that an
# earlier cell uses for its search function.
query_text = "My leaves were underdeveloped"
# `embed` takes a list of texts; [0] picks the single resulting vector.
q_vector = embed([query_text]).numpy()[0]
# Score every document (match_all) by cosine similarity between the query
# vector and the stored 'title-question-vector' field, shifted by +1.0.
script_query = {
    "script_score": {
        "query": {"match_all": {}},
        "script": {
            "source": "cosineSimilarity(params.query_vector, 'title-question-vector') + 1.0",
            "params": {"query_vector": q_vector}
        }
    }
}

In [64]:
# Run `script_query` and keep the three best hits, returning only the title
# and question of each one.
response = es_client.search(
    index=index.ASKEXTENSION_INDEX,
    body=dict(
        size=3,
        query=script_query,
        _source=dict(includes=["title", "question"]),
    ),
)
response['hits']['hits']

  response = es_client.search(


[{'_index': 'askextension',
  '_type': '_doc',
  '_id': 'SXLX-H4BQevW0jV3TrMq',
  '_score': 1.4095367,
  '_source': {'question': 'Why are my bean plants leaves sagging i just planted them a few days ago in my green hous',
   'title': 'Why are my bean plants le.'}},
 {'_index': 'askextension',
  '_type': '_doc',
  '_id': 'enLX-H4BQevW0jV3TrQr',
  '_score': 1.4051611,
  '_source': {'question': 'My fushias leaves were suddenly covered with a light brown haze over almost all the leaves. tHe plant is dying. What can i do? It has been a hardy fushia for years. Thank',
   'title': 'Fushia.'}},
 {'_index': 'askextension',
  '_type': '_doc',
  '_id': 'fXLX-H4BQevW0jV3TrMq',
  '_score': 1.4021769,
  '_source': {'question': 'WHY WOULD LEAVES TURN YELLOW FOR A (HYBICKA',
   'title': 'WHY WOULD LEAVES TURN YEL.'}}]

{'took': 2,
 'timed_out': False,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 10, 'relation': 'eq'},
  'max_score': 1.6768942,
  'hits': [{'_index': 'askextension',
    '_type': '_doc',
    '_id': 'j3KL-H4BQevW0jV3wbDS',
    '_score': 1.6768942,
    '_source': {'question': 'Hello, I was recently out walking near the creek and noticed a snake Ive never seen before in this area. I live in Northern California and have never seen this type of snake before.',
     'title': 'What type of snake is this?'}},
   {'_index': 'askextension',
    '_type': '_doc',
    '_id': 'kHKL-H4BQevW0jV3wbDS',
    '_score': 1.2319449,
    '_source': {'question': "I have this awesome spider necklace that I got a little while ago. But when im wearing it, people often ask me what kind of spider it is. I've done research but I haven't found much. I think it might be a black and yellow garden spider but im not sure. Can anyone tell me",
     'title': 'Name this s

In [23]:
from ruamel import yaml

In [24]:
# Read the ES settings from YAML. The `with` block closes the file handle, and
# a falsy result from safe_load (e.g. an empty file) falls back to an empty dict.
with open('es_config.yml', 'r') as config_file:
    es_config = yaml.safe_load(config_file) or {}

In [25]:
# Display the loaded configuration.
es_config

{'host': 'http://localhost:9200/',
 'tfhub-embdedding-url': 'https://tfhub.dev/google/universal-sentence-encoder/4',
 'tfhub-cache-dir': '',
 'askextension-index': 'askextension',
 'askextension-mapping': '{\n  "settings": {\n    "number_of_shards": 1\n  },\n  "mappings": {\n    "properties": {\n      "faq-id"    : {"type": "integer"},\n      "ticket-no" : {"type": "text"},\n      "url"       : {"type": "text"},\n      "created"   : {"type": "text"},\n      "updated"   : {"type": "text"},\n      "state"     : {"type": "text"},\n      "county"    : {"type": "text"},\n      "title"     : {"type": "text"},\n      "question"  : {"type": "text"},\n      "title-question"        : {"type": "text"},\n      "title-question-vector" : {"type": "dense_vector", "dims": 512},\n      "answer"    : {"type": "text"}\n    }\n  }\n}\n'}

In [26]:
# The mapping entry on its own (`.get` yields None when the key is absent).
es_config.get('askextension-mapping')

'{\n  "settings": {\n    "number_of_shards": 1\n  },\n  "mappings": {\n    "properties": {\n      "faq-id"    : {"type": "integer"},\n      "ticket-no" : {"type": "text"},\n      "url"       : {"type": "text"},\n      "created"   : {"type": "text"},\n      "updated"   : {"type": "text"},\n      "state"     : {"type": "text"},\n      "county"    : {"type": "text"},\n      "title"     : {"type": "text"},\n      "question"  : {"type": "text"},\n      "title-question"        : {"type": "text"},\n      "title-question-vector" : {"type": "dense_vector", "dims": 512},\n      "answer"    : {"type": "text"}\n    }\n  }\n}\n'

In [27]:
import json

# Parse the mapping entry with json.loads to get it as a Python object.
json.loads(es_config.get('askextension-mapping'))

{'settings': {'number_of_shards': 1},
 'mappings': {'properties': {'faq-id': {'type': 'integer'},
   'ticket-no': {'type': 'text'},
   'url': {'type': 'text'},
   'created': {'type': 'text'},
   'updated': {'type': 'text'},
   'state': {'type': 'text'},
   'county': {'type': 'text'},
   'title': {'type': 'text'},
   'question': {'type': 'text'},
   'title-question': {'type': 'text'},
   'title-question-vector': {'type': 'dense_vector', 'dims': 512},
   'answer': {'type': 'text'}}}}