In [8]:
from sentence_transformers import SentenceTransformer
from ganacheconnector import contractConnector
import json
from web3 import Web3, HTTPProvider
import pandas as pd
import numpy as numpy
import pandas as pd
import os
import scipy

def encode_questions(model_name, clean_df):
	"""Embed every entry of ``clean_df.question`` with the named model.

	Args:
		model_name: Identifier handed to ``SentenceTransformer``.
		clean_df: Object exposing a ``question`` attribute that supports
			``tolist()``. Presumably a pandas DataFrame; verify against caller.

	Returns:
		A ``(question_embeddings, model_name)`` tuple: whatever
		``model.encode`` produced for the question list, and the model name
		exactly as it was passed in.
	"""
	print("Instantiating Model")
	encoder = SentenceTransformer(model_name)
	print("Model Instantiated")
	# The model is built before the questions are read, as in the original.
	return encoder.encode(clean_df.question.tolist()), model_name

def format_embeddings(embeddings, model_name):
	"""Wrap raw embeddings in a DataFrame tagged with the model that made them.

	Args:
		embeddings: Anything ``pd.DataFrame`` accepts as tabular data.
		model_name: Value stored in every row of the ``model_name`` column.

	Returns:
		DataFrame holding the embedding columns followed by a trailing
		``model_name`` column.
	"""
	print("Formating Data")
	return pd.DataFrame(embeddings).assign(model_name=model_name)

def save_embeddings(df_embeddings, partner_id):
	"""Write an embeddings DataFrame to ``EMBEDDINGS/<partner_id>.csv``.

	The ``EMBEDDINGS`` directory is created relative to the current working
	directory when it does not exist yet. The row index is not written.

	Args:
		df_embeddings: DataFrame to persist; it is also printed to stdout.
		partner_id: Value formatted into the output file name.
	"""
	print("Saving Embeddings")
	direc = "EMBEDDINGS"
	# exist_ok=True replaces the exists()/makedirs() pair, which could raise
	# if the directory appeared between the check and the creation.
	os.makedirs(direc, exist_ok=True)
	# Build the path from ``direc`` so the directory name lives in one place.
	fp = os.path.join(direc, "{}.csv".format(partner_id))
	print(df_embeddings)
	# ``index=False`` states the intent explicitly; the previous ``index=None``
	# only suppressed the index because None is falsy.
	df_embeddings.to_csv(fp, index=False)

def run_all_train(model_name, clean_df, partner_id):
	"""Encode the questions, tabulate the embeddings and save them to disk.

	Chains ``encode_questions`` -> ``format_embeddings`` -> ``save_embeddings``
	and prints a marker before and after the pipeline.

	Args:
		model_name: Model identifier forwarded to ``encode_questions``.
		clean_df: Question data forwarded to ``encode_questions``.
		partner_id: Identifier forwarded to ``save_embeddings``.
	"""
	print("Starting Run All")
	vectors, resolved_name = encode_questions(model_name, clean_df)
	save_embeddings(format_embeddings(vectors, resolved_name), partner_id)
	print("Run all finished")

def encode_query(model_name, query):
	"""Load the named model and embed a single query with it.

	A later cell of this notebook redefines ``encode_query`` to take an
	already-instantiated model instead of a model name.

	Args:
		model_name: Identifier handed to ``SentenceTransformer``.
		query: The query to embed; it is printed as-is and as a one-item list.

	Returns:
		A ``(queries, query_embeddings)`` tuple where ``queries`` is
		``[query]`` and ``query_embeddings`` is the result of encoding it.
	"""
	encoder = SentenceTransformer(model_name)
	print(query)
	batch = [query]
	print(batch)
	return batch, encoder.encode(batch)

def semantic_search(queries, query_embeddings, question_embeddings, number_top_matches):
	"""Rank stored embeddings by cosine distance to the first query.

	Only the first ``(query, embedding)`` pair produced by zipping ``queries``
	with ``query_embeddings`` is scored. The previous implementation returned
	from inside its loop on the first iteration, so this keeps the same
	result for existing callers while making the behaviour explicit.

	Args:
		queries: Sequence of queries, paired positionally with
			``query_embeddings``.
		query_embeddings: Sequence of query vectors.
		question_embeddings: Sequence of stored vectors to rank.
		number_top_matches: Upper bound of the slice applied to the ranking.

	Returns:
		DataFrame with columns ``idx`` (position in ``question_embeddings``),
		``distance`` (cosine distance) and ``cosine_score`` (``1 - distance``),
		ordered by ascending distance. The frame is empty, with the same
		columns, when there is no query, nothing to rank, or the slice selects
		no rows.
	"""
	# Imported here because a bare ``import scipy`` does not load
	# ``scipy.spatial.distance`` on every SciPy release.
	from scipy.spatial.distance import cdist

	first_pair = next(iter(zip(queries, query_embeddings)), None)
	if first_pair is None or len(question_embeddings) == 0:
		ranked = []
	else:
		_, query_embedding = first_pair
		distances = cdist([query_embedding], question_embeddings, "cosine")[0]
		# sorted() is stable, so equal distances keep their original order.
		ranked = sorted(enumerate(distances), key=lambda pair: pair[1])

	# Naming the columns up front keeps the schema intact for empty results.
	results = pd.DataFrame(ranked[0:number_top_matches], columns=["idx", "distance"])
	results = results.astype({"idx": "int64", "distance": "float64"})
	results["cosine_score"] = 1 - results["distance"]
	return results

In [9]:
# Connection settings for the chain node and the deployed contract.
# NOTE(review): the endpoint is localhost:8545 and the helper module is named
# ganacheconnector, so this presumably targets a local Ganache node — confirm.
blockchain_address = 'http://127.0.0.1:8545'
compiled_contract_path = '../build/contracts/P2PTest.json'
deployed_contract_address = '0x6d72dC5be82840419bde6758D04Cc37102eB0473'
defaultAccount = 0  # position in web3.eth.accounts, not an address

web3 = Web3(HTTPProvider(blockchain_address))
# Select the sending account by position. The accounts list is fetched once
# here; a bare ``web3.eth.accounts[defaultAccount]`` expression that discarded
# its result was dropped.
web3.eth.defaultAccount = web3.eth.accounts[defaultAccount]
# Project-local wrapper; later cells call c.get_all_loan_struct_info() on it.
c = contractConnector(blockchain_address,compiled_contract_path,deployed_contract_address,defaultAccount)

In [15]:
# Model used to embed loan descriptions and search queries.
# NOTE(review): the load warning below lists unused lm_head weights, i.e. this
# is a language-model checkpoint, and the two top cosine scores printed later
# are nearly identical (0.9708 vs 0.9683). Consider whether a model trained
# for sentence similarity would separate matches better — confirm.
loan_search_model = SentenceTransformer("roberta-base")

Downloading:   0%|          | 0.00/391 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/9.08k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/603k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/501M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/899k [00:00<?, ?B/s]

Some weights of the model checkpoint at /Users/adamlieberman/.cache/torch/sentence_transformers/roberta-base were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [86]:
# Same instantiation as the earlier cell; running this cell loads the
# "roberta-base" model again and rebinds loan_search_model.
loan_search_model = SentenceTransformer("roberta-base")

def encode_descriptions(model, df):
	"""Embed the ``shortDescription`` column of ``df`` with ``model``.

	Args:
		model: Object exposing ``encode(list)``.
		df: DataFrame containing a ``shortDescription`` column.

	Returns:
		Whatever ``model.encode`` returns for the list of short descriptions.
	"""
	return model.encode(df["shortDescription"].tolist())

def encode_query(model, query):
	"""Embed a single query with an already-loaded model.

	Args:
		model: Object exposing ``encode(list)``.
		query: The query to embed.

	Returns:
		A ``(queries, query_embeddings)`` tuple where ``queries`` is
		``[query]`` and ``query_embeddings`` is ``model.encode(queries)``.
	"""
	batch = [query]
	return batch, model.encode(batch)

def semantic_search(queries, query_embeddings, question_embeddings, number_top_matches):
	"""Rank stored embeddings by cosine distance to the first query.

	Only the first ``(query, embedding)`` pair produced by zipping ``queries``
	with ``query_embeddings`` is scored. The previous implementation returned
	from inside its loop on the first iteration, so this keeps the same
	result for existing callers while making the behaviour explicit.

	Args:
		queries: Sequence of queries, paired positionally with
			``query_embeddings``.
		query_embeddings: Sequence of query vectors.
		question_embeddings: Sequence of stored vectors to rank.
		number_top_matches: Upper bound of the slice applied to the ranking.

	Returns:
		DataFrame with columns ``idx`` (position in ``question_embeddings``),
		``distance`` (cosine distance) and ``cosine_score`` (``1 - distance``),
		ordered by ascending distance. The frame is empty, with the same
		columns, when there is no query, nothing to rank, or the slice selects
		no rows.
	"""
	# Imported here because a bare ``import scipy`` does not load
	# ``scipy.spatial.distance`` on every SciPy release.
	from scipy.spatial.distance import cdist

	first_pair = next(iter(zip(queries, query_embeddings)), None)
	if first_pair is None or len(question_embeddings) == 0:
		ranked = []
	else:
		_, query_embedding = first_pair
		distances = cdist([query_embedding], question_embeddings, "cosine")[0]
		# sorted() is stable, so equal distances keep their original order.
		ranked = sorted(enumerate(distances), key=lambda pair: pair[1])

	# Naming the columns up front keeps the schema intact for empty results.
	results = pd.DataFrame(ranked[0:number_top_matches], columns=["idx", "distance"])
	results = results.astype({"idx": "int64", "distance": "float64"})
	results["cosine_score"] = 1 - results["distance"]
	return results

Some weights of the model checkpoint at /Users/adamlieberman/.cache/torch/sentence_transformers/roberta-base were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [87]:
# Fetch the loan records through the contract connector and embed each
# record's shortDescription (the column encode_descriptions reads).
# NOTE(review): presumably a pandas DataFrame, since later cells use .iloc on
# it — confirm against contractConnector.
df_all_loans = c.get_all_loan_struct_info()
embeddings = encode_descriptions(loan_search_model,df_all_loans)

# Embed the search text with the same model that produced the loan embeddings.
query = "Help someone with groceries"
queries,query_embeddings = encode_query(loan_search_model,query)

In [88]:
# Rank the loan embeddings by cosine distance to the query and keep the top 2.
results = semantic_search(queries, query_embeddings, embeddings, 2)

In [89]:
# Show the ranked matches: idx is the position in the embeddings passed to
# semantic_search, cosine_score is 1 - distance.
results

Unnamed: 0,idx,distance,cosine_score
0,0,0.029157,0.970843
1,2,0.031657,0.968343


In [90]:
# Map the matched positions back to loan rows and show their descriptions.
# NOTE(review): the embeddings were built from shortDescription, but this
# displays loanDescription — confirm the two columns are meant to differ.
df_all_loans.iloc[results.idx.tolist()].loanDescription.tolist()

['I have not yet received my paycheck and I need a spot and some assistance getting groceries to feed my 3 small children.',
 'The next billing cycle for my credit card debt starts in 3 days. Although I have been diligently paying part of it off every month, the snowball gets bigger and bigger. I really don’t want to pay the high interest rate anymore. It will be helpful if I can get a small loan to pay it all off, then my monthly payment will be much more affordable.']

In [85]:
# List every loan description, for comparison with the matched subset.
# NOTE(review): this cell's execution count (85) is lower than the cells
# before it, so its output may predate the current df_all_loans — re-run in
# order to confirm.
df_all_loans.loanDescription.tolist()

['I have not yet received my paycheck and I need a spot and some assistance getting groceries to feed my 3 small children.',
 'I have just moved into the neighborhood and am planning to rennovate my kitchen. Currently do not have enough funds to pay the job in full.',
 'The next billing cycle for my credit card debt starts in 3 days. Although I have been diligently paying part of it off every month, the snowball gets bigger and bigger. I really don’t want to pay the high interest rate anymore. It will be helpful if I can get a small loan to pay it all off, then my monthly payment will be much more affordable.',
 'I was a cleaning staff at the local shopping mall and lost my job during the pandemic. I have 2 children to take care of but paying bills on time becomes so difficult without a job. I hope to get a loan to get through this difficult time for my family.',
 'I got into a car accident last month and my car needs major repair. The person who hit me fled the scene and has not been 