# Watson Attempt 2

### Imports

In [3]:
import os
import sys
import math
import pandas as pd
import numpy as np
import json

from whoosh.fields import Schema, TEXT, ID
from whoosh.index import create_in, open_dir, exists_in
from whoosh.qparser import QueryParser, OrGroup
from whoosh.scoring import BM25F
from whoosh.analysis import RegexTokenizer, LowercaseFilter, Filter
from whoosh.analysis import Filter                                

from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import CountVectorizer

In [4]:
api_key = os.getenv("OPENAI_API_KEY")
if not api_key:
    api_key = input("Enter your OpenAI API Key: ")

In [5]:
def query_ChatGPT(query):
    import requests

    model = "gpt-3.5-turbo-0301" # TODO

    headers = {
        "Content-Type": "application/json",
        "Authorization": "Bearer " + api_key,
    }

    json_data = {
            "model": model,
            "temperature": 0,
            "messages": [
                {
                    "role": "user",
                    "content": query
                }
            ]
        }

    response = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json=json_data).json()
    assert "choices" in response, response
    assert len(response["choices"]) > 0, response
    assert "message" in response["choices"][0], response
    assert "content" in response["choices"][0]["message"], response
    
    return response["choices"][0]["message"]["content"]

### Load the data

In [6]:
wiki_df = pd.read_pickle("./data/wiki.pkl")
wiki_redirects_df = pd.read_pickle("./data/wiki_redirects.pkl")
questions_df = pd.read_pickle("./data/questions.pkl")

### Build the lemmatize filter

In [7]:
lemmatizer = WordNetLemmatizer()

class LemmatizeFilter(Filter):
    def __call__(self, tokens):
        for token in tokens:
            token.text = lemmatizer.lemmatize(token.text)
            yield token

### Organize the redirect table

Organize the redirect table into a more efficient lookup table, where the keys are the index of the page redirected to in wiki_df, and the values are the titles of the redirect pages

In [8]:
redirect_lookups = {}
for _, row in wiki_redirects_df.iterrows():
    if row.redirect_index in redirect_lookups:
        redirect_lookups[row.redirect_index].append(row.title)
    else:
        redirect_lookups[row.redirect_index] = [row.title]

### Build the query filter

Build the query filter by getting a count of each term in the collection, and dropping terms with high frequency from the query

In [9]:
def normalize_term(term):
    return lemmatizer.lemmatize(term.lower())

In [10]:
count_vectorizer = CountVectorizer()
count_matrix = count_vectorizer.fit_transform(wiki_df.text)
terms = count_vectorizer.get_feature_names_out()
total_counts = np.asarray(np.sum(count_matrix, axis=0)).reshape(-1)
term_counts = {}
for term, count in zip(terms, total_counts):
    term = lemmatizer.lemmatize(term)
    if term in term_counts:
        term_counts[term] += count
    else:
        term_counts[term] = count

del count_matrix, total_counts, terms, count_vectorizer # clean up memory

In [11]:
def get_term_count(term):
    if len(term) <= 1:
        return float("inf")
    if term.isnumeric():
        return 0
    if term in term_counts:
        return term_counts[term]
    else:
        return 0

In [31]:
query_subset_p = 0.75 # the percentage of the query to keep

def filter_query(query):
    querys = query.split()
    if len(querys) <= 3:
        return query

    query_counts = sorted([(get_term_count(normalize_term(t)), i) for i, t in enumerate(querys)])
    n = math.ceil(len(querys) * query_subset_p)
    query_indices = sorted([i for _, i in query_counts[:n]])
    return " ".join([querys[i] for i in query_indices])

In [25]:
with open("ChatGPT_template1.txt", "r") as file:
    template1 = file.read()
def pass_query_through_ChatGPT(query):
    gptq = template1 + query
    try:
        result = query_ChatGPT(gptq)
        data = json.loads(result)
    except:
        print("JSON ERROR", result)
        return query
    return data["question"]

In [14]:
with open("ChatGPT_template2.txt", "r") as file:
    template2 = file.read()
def boost_important_terms(query):
    if len(query.split()) <= 3:
        return query
    
    gptq = template2 + query

    try:
        terms = json.loads(query_ChatGPT(gptq))
    except:
        return query
    
    if len(terms) < 3:
        return query
    
    try:
        term1 = terms["term1"]
        term2 = terms["term2"]
        term3 = terms["term3"]
    except:
        return query
    
    # boost the importance of the 3 least frequent terms TODO tune the boost level
    query = query.replace(term1, term1 + "^2")
    query = query.replace(term2, term2 + "^2")
    query = query.replace(term3, term3 + "^2")
    return query

In [15]:
def query_pipeline(query):
    query = pass_query_through_ChatGPT(query)
    query = filter_query(query)
    #query = boost_important_terms(query)
    return query

### Define the Watson class

In [27]:
class Watson:
    def __init__(self):
        self.Q = len(questions_df.index)
        self._analyzer = self._build_analyzer()
        self._index = self._build_index()
        self._title_index = self._build_title_index()
        self._parser = self._build_parser()

    def _build_analyzer(self):
        return RegexTokenizer() | LowercaseFilter() | LemmatizeFilter()
    
    def _build_index(self):
        if exists_in(".index"):
            ix = open_dir(".index")
        else:
            os.mkdir(".index")
            schema = Schema(title=ID(stored=True),  
                    titles=TEXT(analyzer=self._analyzer), 
                    categories=TEXT(analyzer=self._analyzer), 
                    content=TEXT(analyzer=self._analyzer))
            ix = create_in(".index", schema)
            with ix.writer() as writer:
                for _, row in wiki_df.iterrows():
                    writer.add_document(title=row.title, content=row.text)
                writer.commit()
        return ix
    
    def _build_title_index(self):
        if exists_in(".title_index"):
            ix = open_dir(".title_index")
        else:
            os.mkdir(".title_index")
            schema = Schema(title=ID(stored=True),  content=TEXT(analyzer=self._analyzer))
            ix = create_in(".title_index", schema)
            with ix.writer() as writer:
                for i, row in wiki_df.iterrows():
                    titles = [row.title]
                    if i in redirect_lookups:
                        titles += redirect_lookups[i]
                    writer.add_document(title=row.title, content=" ".join(titles))
                writer.commit()
        return ix
    
    def _build_parser(self):
        og = OrGroup.factory(0.9)
        return QueryParser("content", schema=self._index.schema, group=og)
    
    def search(self, category, question, scorer=BM25F):
        try:
            query = self._parser.parse(f"{category}^0.5 " + query_pipeline(question))
        except TypeError as e:
            print(query_pipeline(question))
            raise e
        with self._index.searcher(weighting=scorer()) as searcher:
            results = searcher.search(query, limit=None)
            if results.scored_length() == 0:
                return None
            return [(r["title"], r.rank+1) for r in results]

    def test(self, scorer=BM25F, eval="mrr"):
        if eval == "mrr":
            mrr = 0.0
            for _, row in questions_df.iterrows():
                results = self.search(row.category, row.question, scorer)
                rank = Watson.get_rank(results, row.answer)
                if rank > 0:
                    mrr += 1 / rank
            mrr /= self.Q
            return mrr
        elif eval == "p@1":
            correct = 0
            for _, row in questions_df.iterrows():
                results = self.search(row.category, row.question, scorer)
                if Watson.is_correct(results, row.answer):
                    correct += 1
            return correct / self.Q
        else:
            raise Exception(f"unrecognized evaluation type: {eval}")
        
    @staticmethod
    def get_rank(results, answer):
        for answer_variant in answer.split("|"):
            for (doc_title, rank) in results:
                if doc_title.lower() == answer_variant.lower():
                    return rank
        return 0
    
    @staticmethod
    def is_correct(results, answer):
        guess, _ = results[0]
        for answer_variant in answer.split("|"):
            if answer_variant.lower() == guess.lower():
                return True
        return False
    
    @staticmethod
    def get_guess(results):
        guess, _ = results[0]
        return guess
    

### Instantiate Watson

In [28]:
watson = Watson()

### Test Watson

In [32]:
mrr_score = watson.test(eval="mrr")
mrr_score

JSON ERROR is located in this city (Cincinnati, Ohio).

{
    "question": "The Taft Museum of Art is located in this city (Cincinnati, Ohio)."
}
JSON ERROR ("Santa Fe, New Mexico") is dedicated to the works of this American artist known for her paintings of enlarged flowers and New Mexico landscapes.

{
    "question": "The Georgia O'Keeffe Museum (located in Santa Fe, New Mexico) is dedicated to the works of this American artist known for her paintings of enlarged flowers and New Mexico landscapes."
}
JSON ERROR is a brand name for this gelatin dessert (a dessert made with a sweetened and flavored processed collagen product). 

{
    "question": "Jell-O is a brand name for this gelatin dessert (a dessert made with a sweetened and flavored processed collagen product)."
}
JSON ERROR , dominates the skyline. (Helsinki) 

{
    "question": "What Finnish city is dominated by the Lutheran Cathedral, also known as Tuomiokirkko? (The city is Helsinki)"
}
JSON ERROR (protagonist of the book se

0.30372436112296486

In [47]:
pa1_score = watson.test(eval="p@1")
pa1_score

KeyboardInterrupt: 