# Build Multilingual Financial Search Applications with Cohere - Code Walkthrough
In the following use case example, we’ll showcase how Cohere’s Embed model can search and
query across financial news in different languages in a single pipeline. Finally, we’ll see how
adding Rerank to our embeddings retrieval (or adding it to a legacy lexical search) can further
improve our results.

### Install Packages and Import Modules

In [None]:
!pip install --upgrade cohere-aws hnswlib
# If you upgrade the package, you need to restart the kernel

In [None]:
import pandas as pd
import cohere_aws
import hnswlib
import warnings
import os
import re
import boto3
warnings.filterwarnings('ignore')

### Import Documents 

Information about the MultiFIN paper and dataset can be found in its GitHub repo: https://github.com/RasmusKaer/MultiFin.

We will use a CSV file that contains the data plus Google translations of the articles.

In [None]:
# Load the MultiFIN training CSV from the cohere-aws GitHub repository into a DataFrame
url = "https://raw.githubusercontent.com/cohere-ai/cohere-aws/main/notebooks/bedrock/multiFIN_train.csv"
df = pd.read_csv(filepath_or_buffer=url)

In [None]:
# Inspect dataset: preview the first five rows
df.head(5)

In [None]:
# Check language distribution: count the rows for each value of the 'lang' column
df['lang'].value_counts()

### Select List of Documents to Query

We first do a quick clean-up of the text, and then select the articles we will query.

In [None]:
# We want to select the longest articles, but some are long only because of
# repeated text, which we clean up in the next cell.
# Display the text of the row at position 2215 (presumably an example of such repetition).
df['text'].iloc[2215]

In [None]:
# Ensure there is no duplicated text in the headers
def remove_duplicates(text):
    """Collapse a repeated phrase, and everything between its two copies, into one copy.

    A "phrase" is one or more word pairs (word, one or two separator
    characters, word). When the same phrase occurs again later on the same
    line (compared case-insensitively), the whole span from the first
    occurrence through the second is replaced by the first occurrence.
    Text without such a repetition is returned unchanged.
    """
    repeated_phrase = re.compile(r'((\b\w+\b.{1,2}\w+\b)+).+\1', flags=re.IGNORECASE)
    return repeated_phrase.sub(r'\1', text)

# Collapse repeated header text in every article
df['text'] = df['text'].apply(remove_duplicates)

# Keep only selected languages. Take an explicit copy so the column added
# below is written to an independent frame rather than to a filtered slice
# of the original (which would trigger pandas' SettingWithCopyWarning).
languages = ['English', 'Spanish', 'Danish']
df = df.loc[df['lang'].isin(languages)].copy()

# Pick the top 80 longest articles (by character count of the cleaned text)
df['text_length'] = df['text'].str.len()
df = df.sort_values(by='text_length', ascending=False)
top_80_df = df.head(80)

# Language distribution
top_80_df['lang'].value_counts()

In [None]:
# As an example, display one of our longest articles. top_80_df is sorted by
# descending text length, so .iloc[1] is the second row (the second-longest).
# NOTE(review): the comment here used to say "our longest article" — switch to
# .iloc[0] if the single longest article is what should be shown.
top_80_df['text'].iloc[1]

### Embed and Index Documents

In [None]:
# Create the Cohere client in Bedrock mode and choose the multilingual embedding model
co = cohere_aws.Client(mode=cohere_aws.Mode.BEDROCK)
model_id = "cohere.embed-multilingual-v3"

# Pull the article text, language and translation columns out as parallel lists
docs = top_80_df['text'].to_list()
docs_lang = top_80_df['lang'].to_list()
translated_docs = top_80_df['translation'].to_list()  # for reference when returning non-English results

# Embed documents
embed_response = co.embed(texts=docs, model_id=model_id, input_type='search_document')
doc_embs = embed_response.embeddings

# Create a search index with hnswlib, a library for fast approximate nearest neighbor search.
# Cohere.embed-multilingual-v3 outputs embeddings with 1024 dimensions.
# For more info on the index parameters: https://github.com/nmslib/hnswlib#api-description
doc_count = len(doc_embs)
index = hnswlib.Index(space='ip', dim=1024)
index.init_index(max_elements=doc_count, ef_construction=512, M=64)
index.add_items(doc_embs, list(range(doc_count)))  # item id == position in docs

### Build a Retrieval System

In [None]:
# Retrieval of the k closest docs to a query (50 by default)
def retrieval(query, k=50):
    """Embed *query*, fetch its nearest documents from the index, and print them.

    Args:
        query: Search text; embedded with input_type="search_query".
        k: Maximum number of neighbors to retrieve. Defaults to 50 and is
            capped at the number of items currently stored in the index.

    Returns:
        A ``(retrieved_docs, translated_retrieved_docs)`` tuple of parallel
        lists: the original article texts and their entries from
        ``translated_docs``, in the order returned by the index.
    """
    # Embed query and retrieve results
    query_emb = co.embed(texts=[query], model_id=model_id, input_type="search_query").embeddings

    # Never request more neighbors than the index holds, so a corpus smaller
    # than k does not make the nearest-neighbor query fail.
    k = min(k, index.get_current_count())
    doc_ids = index.knn_query(query_emb, k=k)[0][0]

    print(f"DOCUMENT IDs returned => {doc_ids} \n" )

    # Collect results
    retrieved_docs = [docs[doc_id] for doc_id in doc_ids]
    translated_retrieved_docs = [translated_docs[doc_id] for doc_id in doc_ids]

    # Print results
    print(f"-> QUERY: '{query.upper()}' \n")
    print("-> Results: \n")
    for doc_id in doc_ids:
        print(f"ORIGINAL ({docs_lang[doc_id]}): {docs[doc_id]}")
        # Only non-English articles get their translation printed
        if docs_lang[doc_id] != "English":
            print(f"TRANSLATION: {translated_docs[doc_id]} \n----")
        else:
            print("----")
    print("END OF RESULTS \n\n")
    return retrieved_docs, translated_retrieved_docs

### Query and Improve Results with Cohere Rerank

In [None]:
# Query without Rerank: embedding-based retrieval only.
# `query` and `retrieved_docs` are reused by the Rerank cell further down.

query = "Are companies ready for the next down market?"
retrieved_docs, translated_retrieved_docs = retrieval(query)

In [None]:
# Name of the Cohere Rerank model package
cohere_package = "cohere-rerank-multilingual-v3--96e1eea512fe31ae9c0639e56d40d853"

# Account id that appears in the model-package ARN for each supported region
_model_package_accounts = {
    "us-east-1": "865070037744",
    "us-east-2": "057799348421",
    "us-west-1": "382657785993",
    "us-west-2": "594846645681",
    "ca-central-1": "470592106596",
    "eu-central-1": "446921602837",
    "eu-west-1": "985815980388",
    "eu-west-2": "856760150666",
    "eu-west-3": "843114510376",
    "eu-north-1": "136758871317",
    "ap-southeast-1": "192199979996",
    "ap-southeast-2": "666831318237",
    "ap-northeast-2": "745090734665",
    "ap-northeast-1": "977537786026",
    "ap-south-1": "077584701553",
    "sa-east-1": "270155090741",
}

# Region -> full model-package ARN, built from a single template so the ARN
# format is written only once
model_package_map = {
    region_name: f"arn:aws:sagemaker:{region_name}:{account_id}:model-package/{cohere_package}"
    for region_name, account_id in _model_package_accounts.items()
}

# Pick the ARN that matches the region of the current boto3 session
region = boto3.Session().region_name
if region not in model_package_map:
    raise Exception(f"Current boto3 session region {region} is not supported.")

model_package_arn = model_package_map[region]

In [None]:
# Initialize and connect to Cohere Rerank endpoint

import cohere_aws

# Use the boto3 session region resolved when model_package_arn was selected:
# the ARN is region-specific, so the client must target that same region
# (a hard-coded 'us-east-1' here breaks every other supported region).
# NOTE: this rebinds `co`, replacing the Bedrock client that retrieval() uses
# for embedding, so run retrieval() before this cell.
co = cohere_aws.Client(region_name=region)
co.create_endpoint(arn=model_package_arn, endpoint_name="cohere-rerank-multilingual", instance_type="ml.g5.xlarge", n_instances=1)

# If the endpoint is already created, you just need to connect to it
co.connect_to_endpoint(endpoint_name="cohere-rerank-multilingual")

Once the endpoint has been created, you can perform real-time inference.

In [None]:
# Query with Rerank: reorder the retrieved documents by relevance to the query
# and keep the 10 best

results = co.rerank(query=query, documents=retrieved_docs, top_n=10)

# Print each hit as: new rank, its position in retrieved_docs ("was"), and
# its text flattened onto one line
for rank, hit in enumerate(results, start=1):
    text = hit.document["text"].replace("\n", " ")
    print(f"\t{rank} was({hit.index})\t{text}")

# Clean-up
Delete the model

Now that you have successfully performed real-time inference, you no longer need the endpoint. Delete it to avoid being charged.


In [None]:
# Delete the endpoint, and close the client even if the deletion fails
try:
    co.delete_endpoint()
finally:
    co.close()