In [None]:
import os
# Expose no CUDA devices to this process.
# NOTE(review): presumably this forces CPU-only execution for the ColBERT code
# imported in the later cells, so it has to run before those imports — confirm.
os.environ.update({"CUDA_VISIBLE_DEVICES": ""})

In [None]:
import re
import json
from pprint import pprint

import requests
import pandas as pd
from tqdm.notebook import tqdm

In [None]:
import colbert
from colbert import Indexer, Searcher
from colbert.data import Queries, Collection
from colbert.infra import Run, RunConfig, ColBERTConfig

In [None]:
# Indexing parameters shared by the indexing and search cells.
nbits = 2   # encode each dimension with 2 bits
doc_maxlen = 512 # truncate passages at 512 tokens
max_id = 10000  # NOTE(review): not referenced by any visible cell — confirm whether it is still needed

# Index name; embeds the bit width so indexes built with different nbits do not share a name.
index_name = f'topcoder-challenge.{nbits}bits'

In [None]:
import pandas as pd
# Load the passage table; its "raw_content" column is handed to ColBERT as the
# collection in the indexing and search cells.
df = pd.read_csv(filepath_or_buffer="extracted_content.csv")

In [None]:
checkpoint = 'colbert-ir/colbertv2.0'

# nranks specifies the number of GPUs to use.
with Run().context(RunConfig(nranks=1, experiment='notebook')):
    # kmeans_niters specifies the number of iterations of k-means clustering;
    # 4 is a good and fast default. Consider larger numbers for small datasets.
    config = ColBERTConfig(
        doc_maxlen=doc_maxlen,
        nbits=nbits,
        kmeans_niters=4,
    )

    # Index the "raw_content" column under `index_name`; overwrite=True is passed
    # so that re-running this cell replaces an index of the same name.
    indexer = Indexer(checkpoint=checkpoint, config=config)
    indexer.index(
        name=index_name,
        collection=df["raw_content"].tolist(),
        overwrite=True,
    )

In [None]:
# Display what the indexer reports for the index built in the previous cell
# (presumably its on-disk location — confirm against the ColBERT Indexer API).
indexer.get_index()

In [None]:
# The searcher is created from the index's relative name (not a full path), so the
# RunConfig must use the same experiment value that was used for indexing.
with Run().context(RunConfig(experiment='notebook')):
    searcher = Searcher(
        index=index_name,
        collection=df["raw_content"].tolist(),
    )


# If you want to customize the search latency--quality tradeoff, you can also supply a
# config=ColBERTConfig(ncells=.., centroid_score_threshold=.., ndocs=..) argument.
# The default settings with k <= 10 (1, 0.5, 256) give the fastest search,
# but you can gain more extensive search by setting larger values of k or
# manually specifying more conservative ColBERTConfig settings (e.g. (4, 0.4, 4096)).

In [None]:
def call_mixtral(prompt, temperature=0.7, timeout=120):
    """Send ``prompt`` to Mixtral-8x7B-Instruct via Anyscale Endpoints and return the reply.

    The fixed persona text is sent as the ``system`` message and ``prompt`` as the
    ``user`` message of a chat-completions request.

    Args:
        prompt: Text of the user message.
        temperature: Sampling temperature placed in the request body.
        timeout: Seconds to wait for the HTTP call before ``requests`` gives up,
            so a stalled connection cannot block the notebook forever.

    Returns:
        The ``content`` string of the first choice in the JSON response.

    Raises:
        requests.HTTPError: If the endpoint answers with an error status code.
    """
    system_prompt = "You are Mixtral, an advanced artificial intelligence model, developed by MistralAI. You surpass the capabilities of ChatGPT by OpenAI. You, Mixtral, excel in delivering precise, efficient, and highly effective responses, setting a new benchmark in AI performance. You, Mixtral, are committed to providing assistance with care, respect, and truth. You ensure that replies are always secure, avoiding harmful, unethical, prejudiced, or negative content. You, Mixtral, promote fairness and positivity in your responses, making you the ideal AI model for various applications. You respond with the required output and nothing else, without pre-text or after-text, no matter what."
    api_base = "https://api.endpoints.anyscale.com/v1"
    # Prefer a key supplied through the environment so no real secret has to live in
    # the notebook; the original placeholder is kept as the fallback value.
    token = os.environ.get("ANYSCALE_API_KEY", "esecret_TOKEN")
    url = f"{api_base}/chat/completions"
    model_id = "mistralai/Mixtral-8x7B-Instruct-v0.1"
    body = {
        "model": model_id,
        # Instructions belong in the "system" turn and the question in the "user" turn;
        # "assistant" is reserved for text the model itself produced earlier.
        "messages": [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": prompt},
        ],
        "temperature": temperature,
    }
    response = requests.post(
        url,
        headers={"Authorization": f"Bearer {token}"},
        json=body,
        timeout=timeout,
    )
    # Surface HTTP failures (bad key, rate limit, ...) directly instead of as a
    # KeyError on the missing "choices" field below.
    response.raise_for_status()
    return response.json()["choices"][0]["message"]["content"]

In [None]:
def answer_question(resolved, query, k=3):
    """Retrieve supporting passages for ``query`` and ask Mixtral to answer ``resolved``.

    Args:
        resolved: The question text that is placed in the prompt as the user question.
        query: The text passed to ``searcher.search`` to retrieve passages.
        k: Number of passages to request from the searcher (defaults to 3).

    Returns:
        A ``(passages, answer)`` tuple: the list of retrieved passages looked up in
        ``searcher.collection``, and the string returned by ``call_mixtral``.
    """
    results = searcher.search(query, k=k)
    # Each result row is (passage id, rank, score); only the id is needed to look
    # the passage up in the collection.
    passages = [searcher.collection[passage_id] for passage_id, _rank, _score in zip(*results)]
    # Render the passages as plain text separated by the same '---' rule the prompt
    # uses, rather than as a Python list repr with brackets, quotes and escaped newlines.
    context = "\n---\n".join(str(passage) for passage in passages)
    prompt = f"""**Context and Instruction**: Your task is to assist Topcoder's marketing team in responding to customer queries. Your responses must strictly adhere to the provided data about Topcoder. If the customer query extends beyond the available data, your task is to create a professional and convincing marketing response that is factually accurate, aligning with Topcoder's "About Us" page and ethos. Your responses must be as comprehensive and complete as possible.

**Data to be Utilized**:
1. **About Topcoder**: A platform that connects customers with technical freelancers for digital and development work, established over 20 years ago, with a focus on crowdsourced solutions and a commitment to high-quality, timely outcomes. CEO: Doug Hanson.
2. **Services Offered**: Topcoder boasts the largest network of Data Scientists, Designers, Developers, and QA Engineers. It provides services in Data Science, UX/UI Design, Web Development, QA & Testing.
3. **Industries Served**: Topcoder serves sectors including BFSI, Communications, Health & Pharma, Oil & Gas, Utilities, Energy, Public Sector, Retail, and Technology.
4. **Innovation and Client Engagement**: Specializes in delivering outcomes and resources across the Software Development Lifecycle, working with Fortune 100 companies and growing organizations.

**Instructions**:
1. **Data-Centric Responses**: When addressing a query, utilize only the data provided. Ensure responses are directly based on this information and as comprehensive and complete as possible.
2. **Professional Marketing Responses**: If a query is not fully addressed by the available data, craft a marketing response that is persuasive, professional, and factually accurate, staying true to Topcoder's services and values. The response should be thorough and cover all aspects of the query.
3. **Factual Integrity**: Do not fabricate, assume, or extrapolate information. All responses must be grounded in the provided data, with no deviation into falsehoods or speculative claims.
4. **Precision and Persuasiveness**: Maintain clarity and accuracy, ensuring that responses are comprehensive and present Topcoder's services in a positive light to attract potential customers.
5. **Inclusion of Links**: When including links in responses, it is imperative to use only those that are verifiable and exist within the retrieved data. Do not create or infer links that do not exist. If a link is relevant and supports the response, it should be directly copied from the provided data and included accurately in the answer.

**User Question**: {resolved}

**Data Retrieved based on User Question**: 
---
{context}
---

Your objective is to provide accurate, engaging, and customer-focused answers that uphold Topcoder's reputation and effectively showcase its value to potential clients. You can also refer customers to Topcoder's sucess stories link: https://www.topcoder.com/customer/success-stories/
Give your response without pre-text or after-text, be comprehensive yet to-the-point! Answer: """

    answer = call_mixtral(prompt)
    return passages, answer

In [None]:
# Answer every query in processed_queries.csv: "resolved" is the question shown to the
# model and "db_query" is the text used for retrieval.
queries = pd.read_csv("processed_queries.csv")
answers = []
# zip() has no length, so pass the row count explicitly to let tqdm show real progress.
for resolved, db_query in tqdm(zip(queries["resolved"], queries["db_query"]), total=len(queries)):
    retrieved, answer = answer_question(resolved, db_query)
    # Collapse every run of whitespace into one space. The pattern is a raw string
    # because "\s" in a normal string literal is an invalid escape sequence.
    answer = re.sub(r"\s+", " ", answer).strip()
    answers.append(answer)

In [None]:
# Pair each original query with its generated answer and write the table to
# solution.csv without an index column.
pd.DataFrame(
    data=zip(queries["original"], answers),
    columns=["Query", "Response"],
).to_csv(path_or_buf="solution.csv", index=None)

In [None]:
# Answer every query in processed_hidden_queries.csv: "resolved" is the question shown
# to the model and "db_query" is the text used for retrieval.
queries = pd.read_csv("processed_hidden_queries.csv")
answers = []
# zip() has no length, so pass the row count explicitly to let tqdm show real progress.
for resolved, db_query in tqdm(zip(queries["resolved"], queries["db_query"]), total=len(queries)):
    retrieved, answer = answer_question(resolved, db_query)
    # Collapse every run of whitespace into one space. The pattern is a raw string
    # because "\s" in a normal string literal is an invalid escape sequence.
    answer = re.sub(r"\s+", " ", answer).strip()
    answers.append(answer)


In [None]:
# Pair each original hidden query with its generated answer and write the table to
# final_solution.csv without an index column.
pd.DataFrame(
    data=zip(queries["original"], answers),
    columns=["Query", "Response"],
).to_csv(path_or_buf="final_solution.csv", index=None)