# ETL and ingestion of data

In [None]:
import os
import re
import json


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from pathlib import Path

# One colour for figure text, axis labels and tick labels.
COLOR = 'white'
for _rc_key in ('text.color', 'axes.labelcolor', 'xtick.color', 'ytick.color'):
    # 'text.color' used to be assigned twice in a row; one assignment has the same effect.
    plt.rcParams[_rc_key] = COLOR

# IPM data - December 2021 Scrape

In [None]:
# Directory of the cleaned December 2021 scrape (relative to the notebook's working directory).
_PATH = Path('../data/uc-ipm/scrape_cleaned_Dec2021')
# Every entry of that directory as a Path, in sorted order.
DATA_FILE_NAMES = sorted(_PATH.iterdir())
# Last expression of the cell: displays the bare file names.
[data_file.name for data_file in DATA_FILE_NAMES]

The list of files should be as follows:
```python
['exoticPests.json',
 'fruitItems_new.json',
 'fruitVeggieEnvironItems_new.json',
 'pestDiseaseItems_new.json',
 'plantFlowerItems.json',
 'turfPests.json',
 'veggieItems_new.json',
 'weedItems.json']
```

The corresponding ETL for these sources (links):
* [`exoticPests.json`](#exoticpestsjson)
* [`fruitItems_new.json`](#fruititems_newjson)
* [`fruitVeggieEnvironItems_new.json`](#fruitveggieenvironitems_newjson)
* [`pestDiseaseItems_new.json`](#pestdiseaseitems_newjson)
* [`plantFlowerItems.json`](#plantfloweritemsjson)
* [`turfPests.json`](#turfpestsjson)
* [`veggieItems_new.json`](#veggieitems_newjson)
* [`weedItems.json`](#weeditemsjson)

## ETL of data

In [None]:
import html

def clean(text):
    '''
    Normalise a scraped string to plain, single-spaced ASCII.

    HTML entities are decoded first, so that whatever they expand to (for
    example ``&nbsp;`` or ``&eacute;``) goes through the same whitespace and
    ASCII normalisation as the rest of the text.

    Args:
        text: string to normalise.

    Returns:
        The cleaned string; it may be empty.
    '''
    text = html.unescape(text)
    # Collapse whitespace BEFORE dropping non-ASCII characters: `\s` also matches
    # Unicode spaces such as the non-breaking space, which would otherwise be
    # deleted outright and glue the neighbouring words together.
    text = re.sub(r'\s+', ' ', text)
    text = text.encode('ascii', 'ignore').decode()
    # Dropping characters can leave doubled or dangling spaces behind.
    return re.sub(r'\s+', ' ', text).strip()


def get_text_from_fields(row, fields):
    '''
    Build one text item for every non-empty plain-string field of a row.

    Args:
        row: mapping (DataFrame row or dict) from field name to string.
        fields: names of the string fields to extract.

    Returns:
        List of dicts with the keys 'text' (cleaned value), 'field' (field
        name), 'name' (label derived from the field name) and 'im_src'
        (always '').
    '''
    row_items = []
    for field in fields:
        text = clean(row[field])

        # Test emptiness AFTER cleaning: a whitespace-only value is non-empty
        # before cleaning but leaves nothing to embed afterwards.
        if not text:
            continue

        row_items.append({
            'text': text,
            'field': field,
            'name': field.replace('_', ' ').capitalize(),
            'im_src': ''
        })

    return row_items

def get_text_from_list_field(row, field, subfield, title = False, im_src = None):
    '''
    Build one text item for every entry of a list-valued field.

    Args:
        row: mapping (DataFrame row or dict); `row[field]` is a list of dicts.
        field: name of the list field.
        subfield: key of each entry that holds the text.
        title: when true, the text is prefixed with "<row title> - ".
        im_src: key of each entry that holds an image source, or None when the
            entries carry no image.

    Returns:
        List of dicts with the keys 'text', 'field', 'name' and 'im_src'
        ('' when there is no image for the entry).
    '''
    label = field.replace('_', ' ').capitalize()

    row_items = []
    for item in row[field]:
        text = item[subfield]

        # Skip missing, empty and whitespace-only values. This has to happen
        # before the title is prepended, otherwise a blank caption would turn
        # into a bare "<title> -" item.
        if not text or not text.strip():
            continue

        if title:
            text = row['title'] + ' - ' + text

        text = clean(text)
        # Cleaning can still empty the text (e.g. it held only non-ASCII characters).
        if not text:
            continue

        row_items.append({
            'text': text,
            'field': field,
            'name': label,
            'im_src': item[im_src] if im_src and item[im_src] else ''
        })

    return row_items

def get_images(row, field, im_src):
    '''
    Collect the non-empty image sources of a list-valued field.

    Args:
        row: mapping whose `field` entry is a list of dicts.
        field: name of the list field.
        im_src: key of each entry that holds the image source.

    Returns:
        List of the non-empty `im_src` values, in their original order.
    '''
    return [entry[im_src] for entry in row[field] if len(entry[im_src]) > 0]

def transform_data(df, list_fields, list_text_fields, image_fields, limit = None, source = 'ucipm'):
    '''
    Convert a raw source frame into the common document layout.

    Args:
        df: raw frame; must contain 'url' and 'title' columns.
        list_fields: columns expected to hold lists; any other value in them
            is replaced by an empty list.
        list_text_fields: tuples (field, subfield, prepend_title, image_key)
            passed on to `get_text_from_list_field`.
        image_fields: tuples (field, subfield) passed on to `get_images`.
        limit: when truthy, work on a random sample of at most this many rows.
        source: value written to the 'source' column (default 'ucipm').

    Returns:
        New frame with the columns 'source', 'url', 'title', 'texts', 'images'.
        The input frame is never modified.
    '''
    if limit:
        # sample() raises when asked for more rows than the frame has.
        df = df.sample(min(limit, len(df)))
    # Always work on a copy so that the caller's frame stays untouched.
    df = df.copy(deep=True)

    df['source'] = source

    for field in list_fields:
        df[field] = df[field].apply(lambda d: d if isinstance(d, list) else [])

    # Plain-text columns: every value is a string ('url' and 'source' are metadata).
    cols = [
        col for col in df.columns
        if col not in ['url', 'source']
        and df[col].map(lambda x: isinstance(x, str)).all()
    ]

    texts = []
    images = []
    for _, row in df.iterrows():
        row_texts = get_text_from_fields(row, cols)
        for field, subfield, concat_title, im_src in list_text_fields:
            row_texts.extend(get_text_from_list_field(row, field, subfield, title=concat_title, im_src=im_src))

        row_images = []
        for field, subfield in image_fields:
            row_images.extend(get_images(row, field, subfield))

        texts.append(row_texts)
        images.append(row_images)

    df['texts'] = texts
    df['images'] = images

    return df.loc[:, ['source', 'url', 'title', 'texts', 'images']]

In [None]:
# Building blocks shared by several files of the December 2021 scrape.
# Text specs are (field, subfield, prepend_title, image_key) tuples.
_DEC_CAPTION_WITH_IMAGE = ('images', 'caption', True, 'src')
_DEC_PEST_PROBLEM       = ('pests_and_disorders', 'problem', True, None)
_DEC_IMAGE_SRC          = ('images', 'src')

# One entry per file: the arguments handed to transform_data.
DEC2021_SOURCES = [
    {
        'file': 'exoticPests.json',
        'list_fields': ['images', 'related_links'],
        'list_text_fields': [_DEC_CAPTION_WITH_IMAGE, ('related_links', 'text', True, None)],
        'image_fields': [_DEC_IMAGE_SRC],
        'limit': 30,
    },
    {
        'file': 'fruitItems_new.json',
        'list_fields': ['cultural_tips', 'pests_and_disorders'],
        'list_text_fields': [('cultural_tips', 'tip', True, None), _DEC_PEST_PROBLEM],
        'image_fields': [],
        'limit': 10,
    },
    {
        'file': 'fruitVeggieEnvironItems_new.json',
        'list_fields': ['images'],
        'list_text_fields': [_DEC_CAPTION_WITH_IMAGE],
        'image_fields': [_DEC_IMAGE_SRC],
        'limit': 30,
    },
    {
        'file': 'pestDiseaseItems_new.json',
        'list_fields': ['images'],
        'list_text_fields': [_DEC_CAPTION_WITH_IMAGE],
        'image_fields': [_DEC_IMAGE_SRC],
        'limit': 30,
    },
    {
        'file': 'plantFlowerItems.json',
        'list_fields': ['images', 'pests_and_disorders'],
        'list_text_fields': [_DEC_CAPTION_WITH_IMAGE, _DEC_PEST_PROBLEM],
        'image_fields': [_DEC_IMAGE_SRC],
        'limit': 30,
    },
    {
        'file': 'turfPests.json',
        'list_fields': ['images'],
        'list_text_fields': [_DEC_CAPTION_WITH_IMAGE],
        'image_fields': [_DEC_IMAGE_SRC],
        'limit': 30,
    },
    {
        'file': 'veggieItems_new.json',
        'list_fields': ['images', 'pests_and_disorders'],
        'list_text_fields': [_DEC_CAPTION_WITH_IMAGE, _DEC_PEST_PROBLEM],
        'image_fields': [_DEC_IMAGE_SRC],
        'limit': 30,
    },
    {
        # Weed captions are embedded without an image reference and no image list is kept.
        'file': 'weedItems.json',
        'list_fields': ['images'],
        'list_text_fields': [('images', 'caption', True, None)],
        'image_fields': [],
        'limit': 30,
    },
]

# Transform every file the same way and concatenate once at the end
# (instead of growing a frame with one concat per file).
transformed_frames = []
for spec in DEC2021_SOURCES:
    FILE_NAME = spec['file']
    print(f'Transforming "{FILE_NAME}"...')
    df = pd.read_json(_PATH / FILE_NAME).rename(columns = {'name': 'title'})

    df = transform_data(
        df,
        list_fields=spec['list_fields'],
        list_text_fields=spec['list_text_fields'],
        image_fields=spec['image_fields'],
        limit=spec['limit']
    )
    transformed_frames.append(df)

final_df = pd.concat(transformed_frames, axis=0, ignore_index=True)

print(f'Final shape is :{final_df.shape}')

# pestsExotic().to_json('test.json', orient='records')

### Loading embedding module

In [None]:
import sys

# Put the parent directory on the import path (presumably where `config` lives — confirm).
# Guarded so that re-running the cell does not add the same entry again.
_PARENT_DIR = os.path.realpath(os.path.pardir)
if _PARENT_DIR not in sys.path:
    sys.path.insert(1, _PARENT_DIR)

os.environ['STAGE'          ] = 'dev'
# Credentials: keep whatever the environment already provides and only fall back
# to these defaults, so that real credentials never have to be written into the notebook.
os.environ.setdefault('ES_USERNAME', 'elastic')
os.environ.setdefault('ES_PASSWORD', 'changeme')
os.environ['TF_CACHE_DIR'   ] = '/var/tmp/models'
## select the environment for ingestion
os.environ['ES_HOST'    ] = 'http://localhost:9200/'
# os.environ['ES_HOST'    ] = 'https://dev.es.chat.ask.eduworks.com/'
# os.environ['ES_HOST'    ] = 'https://qa.es.chat.ask.eduworks.com/'

import config

In [None]:
import importlib
# Re-execute the `config` module in the running kernel
# (presumably so that changed environment variables take effect — confirm).
importlib.reload(config)

## Embedding data

In [None]:
from spacy.lang.en import English 

# English pipeline with the 'sentencizer' component added; get_chunks reads `doc.sents` from it.
nlp = English()
nlp.add_pipe('sentencizer')

def get_chunks(texts, max_seq_length):
    '''
    Split every text item into overlapping chunks of whole sentences.

    A window [start, end) over the sentences grows one sentence at a time.
    When adding the next sentence would make the joined text longer than
    `max_seq_length` characters, the current window is emitted and its start
    moves forward by one sentence, so consecutive chunks overlap.

    Args:
        texts: list of dicts with the keys 'text', 'field', 'name', 'im_src'.
        max_seq_length: maximum length of a chunk, compared against the
            character count of the joined sentences.
            NOTE(review): callers pass `config.embed.max_seq_length`; confirm
            that a character budget is what is intended there.

    Returns:
        New list of dicts with the same keys, one per chunk. Items in which no
        sentence is found (e.g. empty text) produce no chunk.
    '''
    texts_new = []
    for item in texts:
        field, name, im_src = item['field'], item['name'], item['im_src']
        sentences = [sent.text for sent in nlp(item['text']).sents]

        # Nothing to chunk. Without this guard the loop below never terminated,
        # because `end` starts at 1 and could not reach a sentence count of 0.
        if not sentences:
            continue

        def chunk(start, end):
            return {
                'text': ' '.join(sentences[start:end]),
                'field': field,
                'name': name,
                'im_src': im_src
            }

        start, end = 0, 1
        while end < len(sentences):
            if start == end:
                # The window became empty after moving `start`; take the next sentence.
                end += 1
            elif len(' '.join(sentences[start:end + 1])) > max_seq_length:
                texts_new.append(chunk(start, end))
                start += 1
            else:
                end += 1
        # Whatever is left in the window is the last chunk.
        texts_new.append(chunk(start, end))
    return texts_new

def embed_data(df, max_seq_size):
    '''
    Chunk the texts of every row and attach an embedding vector to each chunk.

    Args:
        df: frame with a 'texts' column that holds, per row, a list of text
            items (dicts with the keys 'text', 'field', 'name', 'im_src').
        max_seq_size: maximum chunk length, passed on to `get_chunks`.

    Returns:
        A copy of `df` whose 'texts' column holds the chunked items; every item
        gets a 'vector' entry and its 'field' is suffixed with the item's
        position within the row. The input frame is not modified, so the call
        can be repeated on the same input without fields being suffixed twice.
    '''
    print(f'STARTING TRANSFORMING - MAX_SEQ_SIZE - {max_seq_size}')
    df_texts = []
    # Count rows by position; index labels need not run from 0 to n-1.
    for position, (_, row) in enumerate(df.iterrows(), start=1):
        df_texts.append(get_chunks(row['texts'], max_seq_size))
        if position % 500 == 0:
            print(f'Finished transforming of {position} rows of dataframe')

    print(f'Finished transforming of {len(df_texts)} rows of dataframe')
    print(f'FINISHED TRANSFORMING')

    texts = [item['text'] for row_items in df_texts for item in row_items]

    BATCH_SIZE = 64

    print(f'STARTING EMBEDDING - BATCH_SIZE = {BATCH_SIZE}')
    print(f'Number of texts to be embedded = {len(texts)}')

    # TF HUB model
    # vectors   = config.embed(texts_modified).numpy().tolist()

    # Sentence Encoder model
    if texts:
        vectors = config.embed.encode(
            sentences           = texts     ,
            batch_size          = BATCH_SIZE,
            show_progress_bar   = True
        ).tolist()
    else:
        # Nothing to embed; do not call the model with an empty batch.
        vectors = []
    assert len(vectors) == len(texts)

    # `texts` was flattened row by row, so walking the rows in the same order
    # pairs every chunk with its vector.
    index = 0
    for row_items in df_texts:
        for position, item in enumerate(row_items):
            assert texts[index] == item['text']
            item['vector'] = vectors[index]
            item['field'] = item['field'] + str(position)
            index += 1

    print(f'FINISHED EMBEDDING')

    df = df.copy()
    df['texts'] = df_texts
    print(f'The number of vectors to be ingested: {len(vectors)}', end='\n\n')

    return df

In [None]:

# Chunk-size limit taken from the embedding model object exposed by `config`.
MAX_SEQ_SIZE = config.embed.max_seq_length

# Chunk and embed the December 2021 documents; `df` is what the ingestion cell below uploads.
df = embed_data(final_df, MAX_SEQ_SIZE)

## Ingesting data into ES

__Final mapping__
```json
{
    # mandatory fields
    "url"       : "url",                            # Main URL
    "source"    : "ucipm|aekb|okstate|orstate",     # Source Dataset
    "title"     : "title",                          # Title of data item
    "images"    : "image_srcs",                     # List of images sources if available
    "texts"   : [
        {
        "field" : "field_name_and_index",           # Name of the field
        "name"  : "tab_name",                       # Text for tab text
        "im_src": "image_source",                   # If there is image to this text
        "text"  : "text_of_vector",                 # Text of vector
        "vector": "dense_vector",                   # Embedding vector
        },
        ...
    ]
}
```

In [None]:
# Different embedding sizes depending on the models
# VECTOR_SIZE     = 384
# VECTOR_SIZE     = 512
VECTOR_SIZE     = 768
MAX_STRING_SIZE = 32766


def _keyword_field(indexed):
    '''Mapping of a keyword field; `indexed` is the string "true" or "false".'''
    return {"type": "keyword", "index": indexed, "ignore_above": MAX_STRING_SIZE}


mapping  = {
    "settings": {"number_of_shards": 2, "number_of_replicas": 1},
    "mappings": {
        # Fields that are not listed under "properties" are not added to the mapping.
        "dynamic"       : "false"   ,
        "date_detection": "false"   ,
        "_source"   : {"enabled": "true"},
        "properties": {
            "source"        : _keyword_field("true"),
            "url"           : _keyword_field("true"),

            "title"         : _keyword_field("false"),
            "images"        : _keyword_field("false"),
            # The nested field has to carry the name used in the documents. The
            # ingested records hold their chunks under "texts"; the mapping used
            # to declare "vectors", which no document contains.
            "texts"         : {
                "type"      : "nested",
                "properties": {
                    "vector": {
                        "type": "dense_vector",
                        "dims": VECTOR_SIZE
                    },
                    "field" : _keyword_field("false"),
                    "name"  : _keyword_field("false"),
                    "im_src": _keyword_field("false"),
                    "text"  : _keyword_field("false"),
                }
            }
        }
    }
}

In [None]:
from elasticsearch import Elasticsearch
from elasticsearch.helpers import parallel_bulk

from collections import deque

# increase the timeout if necessary
es_client = Elasticsearch([config.es_host], http_auth=(config.es_username, config.es_password), timeout = 20)

# Start from an empty index: drop it if it exists (a 404 is ignored) and recreate it
# with the mapping defined above. This cell is destructive for the target index.
es_client.indices.delete(
    index   = config.es_combined_index,
    ignore  = 404)
es_client.indices.create(
    index       = config.es_combined_index  ,
    settings    = mapping['settings']       ,
    mappings    = mapping['mappings']       )
# play with chunk size parameter for timed out problem
final_json = df.to_dict(orient = 'records')
# parallel_bulk is lazy; draining it into a zero-length deque runs the upload without keeping results.
deque(parallel_bulk(es_client, actions = final_json, index = config.es_combined_index, max_chunk_bytes = 5 * 1024 * 1024), maxlen = 0)

# Refresh only the index that was written instead of every index on the cluster.
es_client.indices.refresh(index = config.es_combined_index)

# IPM data - April 2022 Scrape

In [None]:
# Directory of the cleaned April 2022 scrape; rebinds the `_PATH` used by the ETL cells below.
_PATH = Path('../data/uc-ipm/scrape_cleaned_Apr2022/')
# Every entry of that directory as a Path, in sorted order.
DATA_FILE_NAMES = sorted(_PATH.iterdir())
# Last expression of the cell: displays the bare file names.
[data_file.name for data_file in DATA_FILE_NAMES]

The list of files should be as follows:
```python
['FruitVegCulturalItems.json',
 'GardenControlsPestItems.json',
 'GardenControlsPesticideItems.json',
 'PestNotes.json',
 'QuickTips.json',
 'Videos.json',
 'WeedIdItems.json']
```

The corresponding ETL for these sources (links):
* [`FruitVegCulturalItems.json`](#fruitvegculturalitemsjson)
* [`GardenControlsPestItems.json`](#gardencontrolspestitemsjson)
* [`GardenControlsPesticideItems.json`](#gardencontrolspesticideitemsjson)
* [`PestNotes.json`](#pestnotesjson)
* [`QuickTips.json`](#quicktipsjson)
* [`Videos.json`](#videosjson)
* [`WeedIdItems.json`](#weediditemsjson)

## ETL of data

In [None]:
def transform_table(row):
    '''
    Replace the row's 'tips_table' list with a single "<title> - <header>" string.

    Only the header of the first table entry is used; an empty list becomes ''.

    Args:
        row: mapping with a 'title' string and a 'tips_table' list of dicts.

    Returns:
        The same `row` object, modified in place. Returning it lets callers use
        the result directly instead of depending on the in-place change being
        visible to them, which `DataFrame.apply` does not guarantee.
    '''
    items = row['tips_table']
    if len(items) > 0:
        assert 'header' in items[0]
        row['tips_table'] = row['title'] + ' - ' + items[0]['header']
    else:
        row['tips_table'] = ''
    return row

def transform_pesticide(row):
    '''
    Flatten the first entry of the 'information' list into one sentence-like string.

    Every key/value pair becomes "<Key label>: <value>"; the parts are joined
    with ". " and closed with a single full stop. (Each part used to get its
    own ". " in addition to the join separator, which doubled the punctuation.)

    Args:
        row: mapping whose 'information' entry is a list with one dict of
            pesticide properties. Anything else is treated as no information.

    Returns:
        The same `row` object, modified in place ('information' is '' when
        there is nothing to flatten).
    '''
    entries = row['information']
    information = entries[0] if isinstance(entries, list) and entries else {}

    parts = []
    for key, value in information.items():
        label = key.replace('_', ' ').capitalize()
        # A trailing full stop of the value is removed because the separator adds one.
        parts.append(f'{label}: {str(value).strip().rstrip(".")}')

    row['information'] = ('. '.join(parts) + '.') if parts else ''
    return row

In [None]:
def _apply_rowwise(frame, row_transform):
    '''
    Run an in-place row transformer over a frame and return the transformed frame.

    Every row is copied, handed to `row_transform` (which modifies it in place)
    and collected again. This does not depend on `DataFrame.apply` propagating
    mutations of the row it passes to the callback.
    '''
    rows = []
    for _, row in frame.iterrows():
        row = row.copy()
        row_transform(row)
        rows.append(row)
    return pd.DataFrame(rows, columns=frame.columns)


def _prepare_fruit_veg(frame):
    '''Rename 'name' to 'title' and turn 'tips_table' into a plain string column.'''
    frame = frame.rename(columns = {'name': 'title'})
    frame['tips_table'] = frame['tips_table'].apply(lambda d: d if isinstance(d, list) else [])
    return _apply_rowwise(frame, transform_table)


def _prepare_pesticide(frame):
    '''Build the title from ingredient and type, then flatten 'information'.'''
    frame = frame.copy()
    frame['title'] = frame[['active_ingredient', 'pesticide_type']].agg(' - '.join, axis=1)
    frame = frame.drop(['active_ingredient', 'pesticide_type'], axis=1)
    return _apply_rowwise(frame, transform_pesticide)


def _prepare_pest_notes(frame):
    '''Map the PestNotes column names to the common ones and drop the table column.'''
    frame = frame.rename(columns = {
        'urlPestNote'           : 'url'         ,
        'name'                  : 'title'       ,
        'descriptionPestNote'   : 'description' ,
        'lifecyclePestNote'     : 'lifecycle'   ,
        'damagePestNote'        : 'damage'      ,
        'managementPestNote'    : 'management'  ,
        'imagePestNote'         : 'images'      ,
    })
    return frame.drop('tablePestNote', axis=1)


def _prepare_quick_tips(frame):
    '''Map the QuickTips column names to the common ones.'''
    return frame.rename(columns = {
        'urlQuickTip'           : 'url'     ,
        'name'                  : 'title'   ,
        'contentQuickTips'      : 'content' ,
        'imageQuickTips'        : 'images'  ,
    })


def _rename_name_to_title(frame):
    '''Rename the 'name' column to 'title'.'''
    return frame.rename(columns = {'name': 'title'})


# Arguments for files whose only list field is 'images' (captions embedded with their image).
_APR_WITH_IMAGES = {
    'list_fields': ['images'],
    'list_text_fields': [('images', 'caption', True, 'src')],
    'image_fields': [('images', 'src')],
}
# Arguments for files without list fields.
_APR_PLAIN = {'list_fields': [], 'list_text_fields': [], 'image_fields': []}

# (file name, preparation step, transform_data arguments, row limit)
APR2022_SOURCES = [
    ('FruitVegCulturalItems.json',          _prepare_fruit_veg,     _APR_WITH_IMAGES,   30),
    ('GardenControlsPestItems.json',        _rename_name_to_title,  _APR_WITH_IMAGES,   10),
    ('GardenControlsPesticideItems.json',   _prepare_pesticide,     _APR_PLAIN,         10),
    ('PestNotes.json',                      _prepare_pest_notes,    _APR_WITH_IMAGES,   30),
    ('QuickTips.json',                      _prepare_quick_tips,    _APR_WITH_IMAGES,   30),
    # Videos.json is used as is (no columns are renamed).
    ('Videos.json',                         lambda frame: frame,    _APR_PLAIN,         30),
    ('WeedIdItems.json',                    _rename_name_to_title,  _APR_WITH_IMAGES,   10),
]

# Transform every file and concatenate once at the end.
transformed_frames = []
for FILE_NAME, prepare, transform_args, limit in APR2022_SOURCES:
    print(f'Transforming "{FILE_NAME}"...')
    df = prepare(pd.read_json(_PATH / FILE_NAME))
    df = transform_data(df, limit=limit, **transform_args)
    transformed_frames.append(df)

final_df = pd.concat(transformed_frames, axis=0, ignore_index=True)

print(f'Final shape is :{final_df.shape}')
# df.to_json('test.json', orient='records')

## Embedding data

In [None]:
# Chunk-size limit taken from the embedding model object exposed by `config`.
MAX_SEQ_SIZE = config.embed.max_seq_length

# Chunk and embed the April 2022 documents; `df` is what the ingestion cell below uploads.
df = embed_data(final_df, MAX_SEQ_SIZE)

## Ingesting data into ES

In [None]:
from elasticsearch import Elasticsearch
from elasticsearch.helpers import parallel_bulk

from collections import deque

# increase the timeout if necessary
es_client = Elasticsearch([config.es_host], http_auth=(config.es_username, config.es_password), timeout = 20)

# play with chunk size parameter for timed out problem
# The documents are added to the existing index; the index is not recreated here.
final_json = df.to_dict(orient = 'records')
# parallel_bulk is lazy; draining it into a zero-length deque runs the upload without keeping results.
deque(parallel_bulk(es_client, actions = final_json, index = config.es_combined_index, max_chunk_bytes = 5 * 1024 * 1024), maxlen = 0)

# Refresh only the index that was written instead of every index on the cluster.
es_client.indices.refresh(index = config.es_combined_index)

# AskExtension Data

In [None]:
# Directory with the AskExtension knowledge-base exports.
_PATH = Path('../data/askextension_kb/')
DATA_FILE_NAMES = sorted(_PATH.iterdir())

print(f'List of files:\n{[data_file.name for data_file in DATA_FILE_NAMES]}')

# Preview the first record of the first file. The encoding is given explicitly
# (JSON interchange is UTF-8) so the result does not depend on the platform
# default, and the parsed data gets its own name instead of rebinding the file handle.
with open(DATA_FILE_NAMES[0], encoding='utf-8') as json_file:
    first_file_records = json.load(json_file)
print(json.dumps(first_file_records[0], indent = 2))

The list of files should be as follows:
```python
['2012-2013.json', '2014-2015.json', '2016-2017.json', '2018-2019.json', '2020-1.json', '2020-2.json', '2021-1.json', '2021-2.json']
```

__NB__: We will only use tickets from the state of California.

## ETL

In [None]:
# Read every data file and combine them into one frame with a single concat
# (a concat per file copies the accumulated rows again on every iteration).
df = pd.concat(
    [pd.read_json(data_file) for data_file in DATA_FILE_NAMES],
    ignore_index = True,
    axis = 0,
)
# Cell output: five random rows as a sanity check.
df.sample(5)

In [None]:
from string import punctuation as pn

def transform_answer(answer_dict):
    '''
    Convert the answer field from a dictionary to a list.

    Args:
        answer_dict: dict that maps an answer number (as a string, e.g. '1')
            to a dict with a 'response' string. Any other value (e.g. a
            missing answer) is treated as "no answers".

    Returns:
        List of {'response': <cleaned text>} dicts in ascending answer number.
        The list holds exactly one entry per answer: gaps in the numbering no
        longer leave empty `{}` placeholders or raise an IndexError.
    '''
    if not isinstance(answer_dict, dict):
        return []

    ordered = sorted(answer_dict.items(), key=lambda pair: int(pair[0]))
    # Only the response text is kept, cleaned up.
    return [{'response': clean(answer['response'])} for _, answer in ordered]

def transform_title(title):
    '''
    Remove the question ID from a title and make sure it ends with punctuation.

    The ID is whatever follows the LAST '#'. A title without '#' is kept as it
    is (it used to be reduced to '.'), and '#' characters inside the title are
    preserved. Dots at either end (e.g. a trailing '...') are removed, and a
    '.' is appended unless the title already ends with a punctuation character.

    Example with '#' - 437259
    Example with '...' - 437264

    Args:
        title: raw ticket title.

    Returns:
        The cleaned title; '.' for a title that is empty after cleaning.
    '''
    head, separator, _ = title.rpartition('#')
    if separator:
        title = head
    title = title.strip().strip('.').strip()

    # add a '.' if it does not yet end with a punctuation
    title = title if (title and title[-1] in pn) else title + '.'

    return title

def merge_title_question(df):
    '''
    Combine each title with its question into one text.

    The title is put in front of the question, separated by a space, unless
    the question already starts with the title (compared without the title's
    last character, i.e. its closing punctuation).

    Args:
        df: frame with 'title' and 'question' string columns.

    Returns:
        List with one merged string per row, in row order.
    '''
    merged = []
    for title, question in zip(df['title'].tolist(), df['question'].tolist()):
        already_included = bool(title) and question.startswith(title[:-1])
        if already_included:
            merged.append(question)
        else:
            merged.append(title + " " + question)
    return merged

In [None]:
# Modify STATE_FILTER and MIN_WORD_COUNT variables accordingly
# STATE_FILTER    = ['California', 'Oklahoma', 'Oregon']
STATE_FILTER    = ['California']
MIN_WORD_COUNT  = 3

ASKEXTENSION_QUESTION_URL = 'https://ask2.extension.org/kb/faq.php?id='
ASKEXTENSION_SOURCE = 'ae-kb'

# Filter first and take a copy, so the column assignments below act on an
# independent frame and not on a slice of the combined frame.
df = df[df['state'].isin(STATE_FILTER)].copy()
df['source'] = ASKEXTENSION_SOURCE

# The ticket number is the text after the last '#' of the raw title; only
# six-character values are accepted as ticket numbers.
ticket_numbers = [
    ticket_no if len(ticket_no) == 6 else ""
    for ticket_no in df['title'].str.split('#').str[-1]
]
df['url'] = [
    f"{ASKEXTENSION_QUESTION_URL}{ticket_no}" if ticket_no else ""
    for ticket_no in ticket_numbers
]
# Attachments become a list of {'src': link} dicts; anything that is not a list becomes [].
df['attachments'] = df['attachments'].apply(
    lambda d: [{'src': link} for link in d] if isinstance(d, list) else []
)

df['answers'] = df['answer'].apply(transform_answer)
df['title'] = df['title'].apply(transform_title)
df['question'] = merge_title_question(df)

if MIN_WORD_COUNT:
    df = df[df['question'].str.split().str.len() > MIN_WORD_COUNT]

# The former 'faq_id' and 'ticket_no' columns were discarded by this selection
# and are therefore no longer created.
df = df.loc[:, ['source', 'url', 'title', 'question', 'answers', 'attachments']]
df = df.reset_index(drop=True)

df = transform_data(
    df,
    list_fields=['answers',],
    list_text_fields=[
        ('answers', 'response', False, None),
    ],
    image_fields=[('attachments', 'src')],
    limit=30
)
# Set the source again after transform_data so that the documents are
# labelled 'ae-kb' whatever value transform_data wrote to the column.
df['source'] = ASKEXTENSION_SOURCE

print(f'Shape of data: {df.shape}')

## Embedding data

In [None]:
# Chunk-size limit taken from the embedding model object exposed by `config`.
MAX_SEQ_SIZE = config.embed.max_seq_length

# Chunk and embed the AskExtension documents; rebinds `df` to the embedded frame.
df = embed_data(df, MAX_SEQ_SIZE)

## Ingesting data into ES

In [None]:
from elasticsearch import Elasticsearch
from elasticsearch.helpers import parallel_bulk

from collections import deque

# increase the timeout if necessary
es_client = Elasticsearch([config.es_host], http_auth=(config.es_username, config.es_password), timeout = 20)

# play with chunk size parameter for timed out problem
# The documents are added to the existing index; the index is not recreated here.
final_json = df.to_dict(orient = 'records')
# parallel_bulk is lazy; draining it into a zero-length deque runs the upload without keeping results.
deque(parallel_bulk(es_client, actions = final_json, index = config.es_combined_index, max_chunk_bytes = 5 * 1024 * 1024), maxlen = 0)

# Refresh only the index that was written instead of every index on the cluster.
es_client.indices.refresh(index = config.es_combined_index)

# Oklahoma State University data

In [None]:
# Oklahoma State fact sheets: a single JSON file (here `_PATH` is a file, not a directory).
_PATH = Path('../data/okstate/fact-sheets-out-cleaner.json')
df = pd.read_json(_PATH)
# Column names, dtypes and non-null counts of the raw frame.
df.info()

In [None]:
# Five random raw rows as a visual sanity check (unseeded, so they differ between runs).
df.sample(5)

## ETL

In [None]:
from copy import deepcopy

def get_title_and_description(row, thumbnail = False):
    '''
    Build the text items for a row's title and, if present, its description.

    Args:
        row: mapping with 'title' and 'description' strings (and 'thumbnail'
            when `thumbnail` is true).
        thumbnail: when true, both items carry the row's thumbnail as 'im_src';
            otherwise 'im_src' is ''.

    Returns:
        List with the title item followed by the description item; the latter
        is left out when the cleaned description is empty.
    '''
    title = clean(row['title'])
    description = clean(row['description'])
    im_src = row['thumbnail'] if thumbnail else ''

    texts = [{
        'text': title,
        'field': 'title',
        'name': 'Title',
        'im_src': im_src
    }]
    if description:
        texts.append({
            'text': description,
            'field': 'description',
            'name': 'Description',
            'im_src': im_src
        })
    return texts

def get_contents_and_images(row, thumbnail=False):
    '''
    Turn the 'content' sections of a row into text items and collect its images.

    For every section this yields, in order: a header item ("<title> - <header>",
    or the title alone for a section without a real header), an item with the
    section text if it is non-empty, and one item per captioned image.

    Args:
        row: mapping with a 'title' string and a 'content' list; every section
            is a dict with 'header', 'text' and 'images' (a dict with the
            parallel lists 'image_urls' and 'image_captions').
        thumbnail: when true, the row's 'thumbnail' is the first collected
            image and the fallback image of sections without images of their own.

    Returns:
        Tuple (texts, images): the list of text items and the list of image sources.
    '''
    title = clean(row['title'])
    # Fallback image for sections without images; '' when thumbnails are not used.
    fallback_src = row['thumbnail'] if thumbnail else ''

    texts = []
    images = [fallback_src] if thumbnail else []

    for section in row['content']:
        urls = section['images']['image_urls']
        captions = section['images']['image_captions']

        header = clean(section['header'])
        if header in ('', 'Introduction-w/o-header'):
            header = clean(title)
        else:
            header = clean(title + ' - ' + header)

        header_item = {
            'text': header,
            'field': 'content',
            'name': 'Paragraph',
            'im_src': urls[0] if len(urls) > 0 else fallback_src,
        }
        texts.append(header_item)

        body = clean(section['text'])
        if body:
            # Same metadata as the header item, carrying the section text instead.
            texts.append(dict(header_item, text=body))

        for url, caption in zip(urls, captions):
            if len(caption) > 0:
                texts.append({
                    'text': clean(header + ' - ' + caption),
                    'field': 'image',
                    'name': 'Image',
                    'im_src': url,
                })
            if len(url) > 0:
                images.append(url)

    return texts, images

In [None]:
df['source'] = 'okstate'

# Fill missing titles BEFORE cleaning: clean() only handles strings, so a
# fillna placed after it could never take effect.
df['title'] = df['title'].fillna('Auxilary').apply(clean)
df.rename(columns={'link': 'url'}, inplace=True)
df.drop(columns=['author', 'pubdate', 'category', 'displaydate'], inplace=True)

texts = []
images = []
for _, row in df.iterrows():
    # Title and description items first, followed by the content items.
    row_texts = get_title_and_description(row, thumbnail=True)
    content_texts, row_images = get_contents_and_images(row, thumbnail=True)
    row_texts.extend(content_texts)

    # Store the combined list. Previously only the content items were stored,
    # so the title and description items were built and then thrown away.
    texts.append(row_texts)
    images.append(row_images)

df['texts'] = texts
df['images'] = images

df = df.loc[:, ['source', 'url', 'title', 'texts', 'images']]
# Random subset of at most 50 documents; sample() raises when asked for more rows than exist.
df = df.sample(min(50, len(df))).reset_index(drop=True)

print(f'The data shape: {df.shape}')
# Dump of the transformed documents for manual inspection.
df.to_json('test.json', orient='records')

In [None]:
# Five random transformed rows as a visual sanity check.
df.sample(5)

## Embedding data

In [None]:

# Chunk-size limit taken from the embedding model object exposed by `config`.
MAX_SEQ_SIZE = config.embed.max_seq_length

# Chunk and embed the Oklahoma State documents; rebinds `df` to the embedded frame.
df = embed_data(df, MAX_SEQ_SIZE)

## Ingesting data into ES

In [None]:
from elasticsearch import Elasticsearch
from elasticsearch.helpers import parallel_bulk

from collections import deque

# increase the timeout if necessary
es_client = Elasticsearch([config.es_host], http_auth=(config.es_username, config.es_password), timeout = 20)

# play with chunk size parameter for timed out problem
# The documents are added to the existing index; the index is not recreated here.
final_json = df.to_dict(orient = 'records')
# parallel_bulk is lazy; draining it into a zero-length deque runs the upload without keeping results.
deque(parallel_bulk(es_client, actions = final_json, index = config.es_combined_index, max_chunk_bytes = 5 * 1024 * 1024), maxlen = 0)

# Refresh only the index that was written instead of every index on the cluster.
es_client.indices.refresh(index = config.es_combined_index)

# Oregon State University data

In [None]:
# Oregon State data: a single JSON file (here `_PATH` is a file, not a directory).
_PATH = Path('../data/orstate/OSU-Out-Cleaner.json')
df = pd.read_json(_PATH)
# Column names, dtypes and non-null counts of the raw frame.
df.info()

In [None]:
# Five random raw rows as a visual sanity check (unseeded, so they differ between runs).
df.sample(5)

## ETL

In [None]:
df['source'] = 'orstate'

# Fill missing titles BEFORE cleaning: clean() only handles strings, so a
# fillna placed after it could never take effect.
df['title'] = df['title'].fillna('Auxilary').apply(clean)
df.rename(columns={'link': 'url'}, inplace=True)
df.drop(columns=['author', 'pubdate', 'category', 'displaydate'], inplace=True)

texts = []
images = []
for _, row in df.iterrows():
    # Title and description items first, followed by the content items
    # (no thumbnails are used for this source).
    row_texts = get_title_and_description(row)
    content_texts, row_images = get_contents_and_images(row)
    row_texts.extend(content_texts)

    # Store the combined list. Previously only the content items were stored,
    # so the title and description items were built and then thrown away.
    texts.append(row_texts)
    images.append(row_images)

df['texts'] = texts
df['images'] = images

df = df.loc[:, ['source', 'url', 'title', 'texts', 'images']]
# Random subset of at most 50 documents; sample() raises when asked for more rows than exist.
df = df.sample(min(50, len(df))).reset_index(drop=True)

print(f'The data shape: {df.shape}')
# Dump of the transformed documents for manual inspection.
df.to_json('test.json', orient='records')

In [None]:
# Five random transformed rows as a visual sanity check.
df.sample(5)

## Embedding data

In [None]:
# Chunk-size limit taken from the embedding model object exposed by `config`.
MAX_SEQ_SIZE = config.embed.max_seq_length

# Chunk and embed the Oregon State documents; rebinds `df` to the embedded frame.
df = embed_data(df, MAX_SEQ_SIZE)

## Ingesting data into ES

In [None]:
from elasticsearch import Elasticsearch
from elasticsearch.helpers import parallel_bulk

from collections import deque

# increase the timeout if necessary
es_client = Elasticsearch([config.es_host], http_auth=(config.es_username, config.es_password), timeout = 20)

# play with chunk size parameter for timed out problem
# The documents are added to the existing index; the index is not recreated here.
final_json = df.to_dict(orient = 'records')
# parallel_bulk is lazy; draining it into a zero-length deque runs the upload without keeping results.
deque(parallel_bulk(es_client, actions = final_json, index = config.es_combined_index, max_chunk_bytes = 5 * 1024 * 1024), maxlen = 0)

# Refresh only the index that was written instead of every index on the cluster.
es_client.indices.refresh(index = config.es_combined_index)

#  Jeff's method

In [None]:
# NOTE(review): CHUNK_SIZE, ROLLING_SIZE and MAX_SEQ_SIZE are not used in this cell.
CHUNK_SIZE      = 1
ROLLING_SIZE    = 3
MAX_SEQ_SIZE    = config.embed.max_seq_length
FINAL_COLS      = ['source', 'url', 'title', "vector"]
BATCH_SIZE      = 64

# NOTE(review): `data` is not defined anywhere in the visible notebook. It is
# iterated as (file_name, DataFrame) pairs and has to exist before this cell
# runs — confirm where it comes from.
embedded = []
for file_name, df in data:

    # Only the title of every document is embedded.
    texts = df['title'].tolist()

    print(f'STARTING EMBEDDING - BATCH_SIZE = {BATCH_SIZE}')
    print(f'Number of texts to be embedded = {len(texts)}')

    # TF HUB model
    # vectors   = config.embed(texts_modified).numpy().tolist()

    # Sentence Encoder model
    vectors = config.embed.encode(
        sentences           = texts     ,
        batch_size          = BATCH_SIZE,
        show_progress_bar   = True
    ).tolist()

    print(f'FINISHED EMBEDDING')

    # Build the reduced frame on a copy and keep it. Assigning the projection
    # to the loop variable alone never reached `data`, so the ingestion cell
    # uploaded the unprojected frames.
    df = df.copy()
    df['vector'] = vectors
    df = df.loc[:, FINAL_COLS]
    embedded.append((file_name, df))
    print(f'The number of vectors to be ingested: {len(df["vector"])}', end='\n\n')

# Hand the embedded, projected frames to the ingestion cell below.
data = embedded

__Final mapping__
```json
{
    # mandatory fields
    "url"       : "url",                            # Main URL
    "source"    : "ucipm|aekb|okstate|orstate",     # Source Dataset
    "title"     : "title",                          # Title of data item
    "vector"    : "dense_vector",                   # Embedding vector of the title
}
```

In [None]:
# Different embedding sizes depending on the models
# VECTOR_SIZE     = 384
# VECTOR_SIZE     = 512
VECTOR_SIZE     = 768
MAX_STRING_SIZE = 32766


def _keyword_field(indexed):
    '''Mapping of a keyword field; `indexed` is the string "true" or "false".'''
    return {"type": "keyword", "index": indexed, "ignore_above": MAX_STRING_SIZE}


# Index definition for the title-only variant: one dense vector per document.
mapping = {
    "settings": {"number_of_shards": 2, "number_of_replicas": 1},
    "mappings": {
        "dynamic"       : "false",
        "date_detection": "false",
        "_source"       : {"enabled": "true"},
        "properties"    : {
            "source": _keyword_field("true"),
            "url"   : _keyword_field("true"),
            "title" : _keyword_field("false"),
            "vector": {"type": "dense_vector", "dims": VECTOR_SIZE},
        },
    },
}

In [None]:
from elasticsearch import Elasticsearch
from elasticsearch.helpers import parallel_bulk

from collections import deque

# increase the timeout if necessary
es_client = Elasticsearch([config.es_host], http_auth=(config.es_username, config.es_password), timeout = 20)

# Start from an empty index: drop it if it exists (a 404 is ignored) and recreate it
# with the mapping defined above. This cell is destructive for the target index.
es_client.indices.delete(
    index   = config.es_combined_index,
    ignore  = 404)
es_client.indices.create(
    index       = config.es_combined_index  ,
    settings    = mapping['settings']       ,
    mappings    = mapping['mappings']       )
# play with chunk size parameter for timed out problem
# NOTE(review): `data` is not defined in the visible notebook; it is iterated as
# (file_name, DataFrame) pairs — confirm where it comes from.
for file_name, df in data:
    print(f'Ingesting "{file_name}"...')
    final_json = df.to_dict(orient = 'records')
    # parallel_bulk is lazy; draining it into a zero-length deque runs the upload without keeping results.
    deque(parallel_bulk(es_client, actions = final_json, index = config.es_combined_index, max_chunk_bytes = 5 * 1024 * 1024), maxlen = 0)

# Refresh only the index that was written instead of every index on the cluster.
es_client.indices.refresh(index = config.es_combined_index)