# Scoring script playground

## Main script for running queries

In [None]:
import requests
import random

# Smoke test: send one hard-coded question to the webhook and show the reply.
# URL     = 'https://dev.api.chat.ask.eduworks.com/webhooks/rest/webhook'
URL     = 'http://localhost:5005/webhooks/rest/webhook'

# A random sender id is generated for this single request.
DATA    = {
    "message"   : "How do you treat peach leaf curl?",
    "sender"    : str(random.randint(0, 100000))
}

try:
    # Bound the wait so a hung service cannot block the notebook forever.
    r = requests.post(URL, json = DATA, timeout = 60)
except requests.exceptions.RequestException as e:
    # A service that is down refuses the connection (an exception) instead of
    # answering with a non-200 status, so report that case here as well.
    r = None
    print(f'Service unavailable ({type(e).__name__}: {e})')
else:
    if r.status_code == 200:
        print(r.json())
    else:
        print('Service unavailable')

In [None]:
import random

# One random sender id, drawn once and reused for every request in the batch.
sender = random.randint(0, 100000)

# URL     = 'https://dev.api.chat.ask.eduworks.com/webhooks/rest/webhook'
URL     = 'http://localhost:5005/webhooks/rest/webhook'

# Payload template; the "message" entry starts empty and is filled in per request.
DATA    = {"message": "", "sender": f"{sender}"}
def _abort(message):
    """Print *message* and stop the whole run by raising ``SystemExit``."""
    print(message)
    # Raised directly rather than via the interactive-only ``exit()`` helper,
    # and with a non-zero status because every caller reports a failure.
    raise SystemExit(1)


def _post_or_abort(payload, what, timeout):
    """POST *payload* to ``URL`` and return the response; abort on any failure.

    *what* names the message being sent and is only used in the error text.
    """
    try:
        response = requests.post(URL, json = payload, timeout = timeout)
    except Exception as e:
        _abort(f'Error: Exception at posting {what}, exit. {type(e).__name__}: "{e}".')

    if response.status_code != 200:
        _abort(f'Error: Service at {URL} is unavailable, exit.')
    return response


def run_tests(questions, *, timeout = None):
    """Send every question to the webhook and collect the returned result metadata.

    For each question the module-level ``DATA`` payload gets its ``'message'``
    entry overwritten and is POSTed to the module-level ``URL``. The first
    response item that has a ``'custom'`` key is used: the ``'meta'`` entry of
    every element of its ``['custom']['data']`` is collected. When such an item
    was found, ``'/intent_affirm'`` is sent afterwards and the first reply's
    text must be ``'Anything else I can help with?'``.

    Args:
        questions: Sized iterable of question texts.
        timeout: Forwarded to ``requests.post``. ``None`` (the default) keeps
            the previous behaviour of waiting without a limit.

    Returns:
        A list with one entry per question, in input order. Each entry is the
        list of ``'meta'`` values for that question, or an empty list when no
        response item had a ``'custom'`` key.

    Raises:
        SystemExit: After printing an error, when a request fails, the service
            answers with a non-200 status, a response cannot be parsed, or the
            reply to the affirmation is not the expected text.
    """
    results = []
    for i, q in enumerate(questions):

        DATA['message'] = q
        response = _post_or_abort(DATA, f'question - "{q}"', timeout)

        # Locate the first response item that carries a 'custom' payload.
        try:
            custom_item = next((item for item in response.json() if 'custom' in item), None)
            data = None if custom_item is None else custom_item['custom']['data']
        except Exception as e:
            _abort(f'Error: Failed on parsing response on question - "{q}", exit. {type(e).__name__}: "{e}".')

        result = []
        if custom_item is not None:
            if not data:
                # NOTE(review): the previous message here said "exit" but the run
                # always carried on; the continue behaviour is kept and the text
                # now says so. Confirm whether this case should abort instead.
                print(f'Warning: Empty result list in response on question - "{q}", continuing.')

            result = [entry['meta'] for entry in data]

            # Acknowledge the answer so the conversation is ready for the next question.
            DATA['message'] = '/intent_affirm'
            response = _post_or_abort(DATA, f'affirmative message on question - "{q}"', timeout)

            try:
                reply = response.json()[0]['text']
                if reply != 'Anything else I can help with?':
                    # Carry the offending text so the error report is not empty.
                    raise ValueError(f'unexpected reply {reply!r}')
            except Exception as e:
                _abort(f'Error: Failed on parsing response of affirmative message on question - "{q}", exit. {type(e).__name__}: "{e}".')
        else:
            print(f'No results for question - "{q}"')

        # Progress report every five questions.
        if (i + 1) % 5 == 0:
            print(f'Finished {i+1} questions...')

        results.append(result)

    print(f'Finished querying all ({len(questions)}) questions for scoring')

    return results


## Loading questions

### Valid questions stats

In [None]:
import pandas as pd

VALID_DATA  = './data/transformed/valid_questions.pkl'

# NOTE(review): unpickling can execute arbitrary code; only load files from a trusted source.
df = pd.read_pickle(VALID_DATA)
# This cell loads the valid questions, so label the shape accordingly.
print(f'Shape of valid questions: {df.shape}')
# sample(10) raises ValueError on frames with fewer than 10 rows; cap the sample size.
df.sample(min(10, len(df)))

In [None]:
# Pull the expected titles, expected URLs and question texts out of the frame
# as plain lists, then query the service with the questions.
answers_pest, answers_url, questions = (
    df[column].values.tolist()
    for column in ('CorrectTitle', 'URL', 'Question')
)
results = run_tests(questions)

In [None]:
# Rank cut-offs behind the four flags of every score entry: Top 1, 3, 5 and 10.
cutoffs = (1, 3, 5, 10)

scores = []
for result, answer in zip(results, answers_url):
    # 0-based rank of the best-placed result whose URL (query string stripped)
    # occurs in the expected answer; None when no result matches.
    best_rank = next(
        (rank for rank, meta in enumerate(result) if meta['url'].split('?')[0] in answer),
        None,
    )
    # Every flag, including Top 10, is bounded by its cut-off so a hit beyond
    # rank 10 is not counted as a Top 10 success.
    scores.append([best_rank is not None and best_rank < k for k in cutoffs])

In [None]:
# Count, for each of the four flag positions, how many questions had it set.
top1, top3, top5, top10 = (
    sum(1 for flags in scores if flags[position])
    for position in range(4)
)

# Percentages are relative to the number of queried questions.
total = len(results)
print(f'Out of {total} results, following correct:')
print(f'Top 1 : {top1:<3d} ({top1 / total * 100:<.2f}%)')
print(f'Top 3 : {top3:<3d} ({top3 / total * 100:<.2f}%)')
print(f'Top 5 : {top5:<3d} ({top5 / total * 100:<.2f}%)')
print(f'Top 10: {top10:<3d} ({top10 / total * 100:<.2f}%)')

### Valid questions errors

In [None]:
# Print a report for every question where none of the top-N flags was set.
for idx, flags in enumerate(scores):
    if any(flags):
        continue

    print(f'------------------------------------------------------------------')
    print(f'Question number     : {idx + 1}')
    print(f'Question            : {questions[idx]}')
    print(f'The title answers   : {",".join(answers_pest[idx])}')
    print(f'The URL answers     : {answers_url[idx]}')

    ranked = results[idx]
    if not ranked:
        print(f'No results found for this query...')
        continue

    # One line for the result itself, one for its best-scoring field.
    for rank, meta in enumerate(ranked, start = 1):
        title_cell = meta["title"][:30] + "..."
        url_cell = "(URL: " + meta["url"]
        print(f'Results ranked {rank:<2}   : {title_cell:<33} {url_cell:>10s})')
        best = meta["scores"]["top_score_1"]
        print(f'Top 1 score: {best["score"]:.3f} at field "{best["field"]}" with text: "{best["text"]}"')
            

In [None]:
import re

# Compiled once, as a raw string so that "\s" is a regex class rather than an
# invalid string escape; used to collapse whitespace runs into a single space.
whitespace_run = re.compile(r'\s+')

# Column-oriented store for the missed questions. 'result_urls' and
# 'top_scores' each hold ten inner lists, one per result rank.
error_results = {
    'question_number'   : [],
    'question_text'     : [],
    'answer_title'      : [],
    'answer_url'        : [],
    'result_urls'       : [[] for _ in range(10)],
    'top_scores'        : [[] for _ in range(10)]
}

for i, score in enumerate(scores):
    # Only questions without any top-N hit are recorded.
    if any(score):
        continue

    error_results['question_number' ].append(i+1            )
    error_results['question_text'   ].append(questions[i]   )
    error_results['answer_title'    ].append(whitespace_run.sub(' ', ', '.join(answers_pest[i])))
    error_results['answer_url'      ].append(whitespace_run.sub(' ', ', '.join(answers_url[i])))

    # Ranks without a returned result keep these placeholders.
    r_temp = ['No answer'   ] * 10
    s_temp = ['No score'    ] * 10
    for i1, res in enumerate(results[i]):
        r_temp[i1] = f'{res["title"][:30]} (URL: {res["url"]:>10s})'

        top_score = res["scores"]["top_score_1"]
        s_temp[i1] = f'{top_score["score"]:.3f} - "{top_score["field"]}" - "{top_score["text"]}"'

    # Append this question's cell to each per-rank column.
    for column, cell in zip(error_results['result_urls'], r_temp):
        column.append(cell)
    for column, cell in zip(error_results['top_scores'], s_temp):
        column.append(cell)


In [None]:
# Fixed leading columns of the error table.
data = {
    'QuestionNumber': error_results['question_number'   ],
    'QuestionText'  : error_results['question_text'     ],
    'AnswerTitle'   : error_results['answer_title'      ],
    'AnswerURL'     : error_results['answer_url'        ],
}

# Followed by interleaved ResultRankN / TopScoreN columns for ranks 1 to 10.
for rank in range(1, 11):
    data[f'ResultRank{rank}'] = error_results['result_urls' ][rank - 1]
    data[f'TopScore{rank}'  ] = error_results['top_scores'  ][rank - 1]

df_errors = pd.DataFrame(data = data).set_index('QuestionNumber')
print(f'Shape of error answers dataframe: {df_errors.shape}')
# sample(10) raises ValueError when fewer than 10 errors were recorded; cap the sample size.
df_errors.sample(min(10, len(df_errors)))

In [None]:
# Write the error table for the valid questions to disk as CSV.
ERRORS_VALID_DATA = './data/errors/errors_valid.csv'
df_errors.to_csv(path_or_buf = ERRORS_VALID_DATA)

### NA questions stats

In [None]:
import pandas as pd

NA_DATA = './data/transformed/na_questions.pkl'

# NOTE(review): unpickling can execute arbitrary code; only load files from a trusted source.
df = pd.read_pickle(NA_DATA)
print(f'Shape of NA questions: {df.shape}')
# sample(10) raises ValueError on frames with fewer than 10 rows; cap the sample size.
df.sample(min(10, len(df)))

In [None]:
# Query the service with the question texts of the currently loaded frame.
question_column = df['Question']
questions       = question_column.values.tolist()
results         = run_tests(questions)

In [None]:
# A question counts as handled correctly when its result list is empty.
no_results = sum(1 for answers in results if len(answers) == 0)
total_questions = len(questions)

print(f'Out of {total_questions} NA questions {no_results} have correctly returned 0 results')
print(f'Recall: {no_results / total_questions * 100:.2f}%')

### NA questions errors

In [None]:
# Report every question that returned at least one result.
for idx, hits in enumerate(results):
    if len(hits) == 0:
        continue

    print(f'------------------------------------------------------------------')
    print(f'Question number     : {idx + 1}')
    print(f'Question            : {questions[idx]}')
    print(f'Number of results   : {len(hits)}')
    for rank, meta in enumerate(hits, start = 1):
        title_cell = meta["title"][:30] + "..."
        url_cell = "(URL: " + meta["url"]
        print(f'Results ranked {rank:<2}   : {title_cell:<33} {url_cell:>10s})')

In [None]:
# Column-oriented store for the questions that returned results;
# 'result_urls' holds ten inner lists, one per result rank.
error_results = {
    'question_number'   : [],
    'question_text'     : [],
    'result_urls'       : [[] for _ in range(10)]
}

for idx, hits in enumerate(results):
    if len(hits) == 0:
        continue

    error_results['question_number'].append(idx + 1)
    error_results['question_text'].append(questions[idx])

    # Ranks without a returned result keep the placeholder.
    cells = ['No answer'] * 10
    for rank, meta in enumerate(hits):
        title_cell = meta["title"][:30] + "..."
        url_cell = "(URL: " + meta["url"]
        cells[rank] = f'{title_cell:<33} {url_cell:>10s})'

    # Append this question's cell to each per-rank column.
    for column, cell in zip(error_results['result_urls'], cells):
        column.append(cell)


In [None]:
# Fixed leading columns of the error table.
data = {
    'QuestionNumber': error_results['question_number'   ],
    'QuestionText'  : error_results['question_text'     ],
}

# Followed by one ResultRankN column for each of the ranks 1 to 10.
for rank in range(1, 11):
    data[f'ResultRank{rank}'] = error_results['result_urls'][rank - 1]

df_errors = pd.DataFrame(data = data).set_index('QuestionNumber')
print(f'Shape of error answers dataframe: {df_errors.shape}')
# sample(10) raises ValueError when fewer than 10 errors were recorded; cap the sample size.
df_errors.sample(min(10, len(df_errors)))

In [None]:
# Write the error table for the NA questions to disk as CSV.
ERRORS_NA_DATA = './data/errors/errors_na.csv'
df_errors.to_csv(path_or_buf = ERRORS_NA_DATA)