In [1]:
import os
# Both settings are required: indexing os.environ raises KeyError if either
# variable is not set.
proxy, api_key = os.environ["HTTPS_PROXY"], os.environ["OPENAI_API_KEY"]

# Echo the proxy, but only confirm that the API key exists; its value is
# never printed.
print(f"HTTPS_PROXY: {proxy}")
print("OPENAI_API_KEY: configured")

HTTPS_PROXY: 127.0.0.1:7078
OPENAI_API_KEY: configured


In [2]:
import json
import pandas as pd
import numpy as np

In [3]:
from openai import OpenAI
# Module-level client shared by get_embedding() and ask().
# It is constructed without arguments; presumably the SDK reads OPENAI_API_KEY
# (and proxy settings) from the environment — confirm against the installed
# openai package version.
client = OpenAI()

# 词向量测试

In [4]:
# Chat model passed to client.chat.completions.create() inside ask().
COMPLETIONS_MODEL = 'gpt-3.5-turbo'
# Default model for get_embedding().
EMBEDDING_MODEL = "text-embedding-ada-002"

In [5]:
def get_embedding(text: str, model: str=EMBEDDING_MODEL) -> list[float]:
    """Request an embedding for ``text`` through the shared ``client``.

    Args:
        text: The string to embed.
        model: Embedding model name; defaults to ``EMBEDDING_MODEL``.

    Returns:
        The ``embedding`` attribute of the first entry in the response's
        ``data`` list.
    """
    response = client.embeddings.create(input=text, model=model)
    first_entry = response.data[0]
    return first_entry.embedding

def vector_similarity(x: list[float], y: list[float]) -> float:
    """
    Return the dot product of two vectors as their similarity score.

    No normalization is performed here. The score equals the cosine
    similarity only when both inputs have length 1, which (per the original
    author's note) holds for OpenAI embeddings.
    """
    left = np.array(x)
    right = np.array(y)
    return left.dot(right)

def calc_embeddings(df, output_path: str = '../data/embedded.csv'):
    """Embed every entry of ``df['Questions']`` and save the table as CSV.

    Side effects:
        * ``df`` is modified in place: an ``embedding`` column is added
          (or overwritten).
        * One ``get_embedding`` call is made per row.
        * The whole frame is written to ``output_path`` without the index.

    Args:
        df: DataFrame with a ``Questions`` column.
        output_path: Destination CSV path. Defaults to the location that was
            previously hard-coded, so existing callers are unaffected.

    Returns:
        ``df.head()`` as a quick preview of the result.
    """
    # Pass the function directly; wrapping it in a lambda adds nothing.
    df['embedding'] = df['Questions'].apply(get_embedding)
    df.to_csv(output_path, index=False)
    return df.head()

def ask(prompt: str, q: str, a: str):
    """Ask the chat model ``prompt`` after one example question/answer turn.

    The conversation sent to the model is, in order: ``q`` as a user message,
    ``a`` as the assistant's reply, then ``prompt`` as the final user message.

    Args:
        prompt: The question to be answered.
        q: Example question placed first in the conversation.
        a: Example answer presented as the assistant's earlier reply.

    Returns:
        The message content of the first choice in the completion response.
    """
    conversation = [
        {"role": "user", "content": q},
        {"role": "assistant", "content": a},
        {"role": "user", "content": prompt},
    ]
    completion = client.chat.completions.create(
        model=COMPLETIONS_MODEL,
        messages=conversation,
        max_tokens=1000,
        temperature=0,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

In [6]:
# Load the Q/A table with its stored embeddings.
df = pd.read_csv('../data/md_QA_embedded.csv')
# Each embedding is read from the CSV as a string; decode the JSON text into
# a list of numbers.
df['embedding'] = df['embedding'].apply(json.loads)
df.head()

Unnamed: 0,ID,Questions,Answers,Attachment,RegBy,RegTime,Submit,SubmitBy,SubmitTime,embedding
0,1,上海市居住证业务-持证人居住地址发生变化怎么办？,持证人居住地住址等信息发生变化的，请在发生变化的30日内到现居住地街道（乡镇）社区事务受理服...,,1,2021-09-01 12:11:17.830,,,,"[0.017005516216158867, 0.013673987239599228, 0..."
1,2,上海市居住证业务-如何查询居住证办理信息？,居住证办理过程的状态信息，如受理时间、签发时间、有效期限、受理状态、审核状态、审核未通过原因...,,1,2021-09-01 12:11:17.830,,,,"[0.021357620134949684, 0.02255536988377571, -0..."
2,3,上海市居住证业务-办证时未提供手机号码或提供的号码有错误的，如何变更？,如需变更手机号码，请及时前往居住地街道（乡镇）社区事务受理服务中心办理信息变更手续。,,1,2021-09-01 12:11:17.830,,,,"[0.011813518591225147, 0.015791472047567368, -..."
3,4,上海市居住证业务-居住证有效期届满前30日内持证人可否自行办理签注手续？,可以。持证人自行办理签注手续成功后，系统不再自动签注。,,1,2021-09-01 12:11:17.830,,,,"[0.01432208251208067, 0.016711320728063583, 0...."
4,5,上海市居住证业务-收到居住证签注未通过或不符合自动签注条件的提醒短信，持证人应如何办理签注？,持证人可持证前往居住地街道（乡镇）社区事务受理服务中心咨询办理。,,1,2021-09-01 12:11:17.830,,,,"[0.0040360321290791035, 0.011765829287469387, ..."


In [7]:
# Sample user question (Chinese): "My residence permit is about to expire; how do I renew it?"
q = "我的居住证快到期了，怎么续办？"

In [8]:
# Embed the user question with get_embedding() and its default model.
# Presumably the stored embeddings in df were produced with the same model,
# otherwise the similarity scores are not comparable — confirm.
q_embedding = get_embedding(q)
# NOTE(review): echoing the whole vector produces a very long cell output;
# consider len(q_embedding) or q_embedding[:5] instead.
q_embedding

[-0.0006315940408967435,
 0.008527767844498158,
 -0.006783603224903345,
 -0.020077867433428764,
 -0.030489599332213402,
 0.009839219972491264,
 -0.03704020380973816,
 -0.00042501537245698273,
 -0.019465412944555283,
 -0.017814448103308678,
 0.03935687988996506,
 0.01165661308914423,
 0.001789100468158722,
 0.005179237574338913,
 -0.011250529438257217,
 0.0009178501204587519,
 0.014392688870429993,
 -0.010145447216928005,
 0.038611285388469696,
 -0.02290048636496067,
 -0.013726976700127125,
 -0.0019072642317041755,
 -0.014885315671563148,
 0.001980492612347007,
 0.006896774284541607,
 0.012655180878937244,
 0.0041440557688474655,
 0.0032603235449641943,
 -0.00026545257424004376,
 -0.003588186576962471,
 0.022980371490120888,
 0.00041731807868927717,
 -0.025163905695080757,
 -0.017894333228468895,
 -0.0006228566053323448,
 -0.014645659364759922,
 -0.0020071209874004126,
 -0.0017757861642166972,
 0.012834923341870308,
 -0.01459240261465311,
 0.0016692723147571087,
 0.013740290887653828,
 

In [9]:
# Score every stored question embedding against the query embedding.
df['similarity'] = df['embedding'].apply(
    lambda row_embedding: vector_similarity(row_embedding, q_embedding)
)
df.head()

Unnamed: 0,ID,Questions,Answers,Attachment,RegBy,RegTime,Submit,SubmitBy,SubmitTime,embedding,similarity
0,1,上海市居住证业务-持证人居住地址发生变化怎么办？,持证人居住地住址等信息发生变化的，请在发生变化的30日内到现居住地街道（乡镇）社区事务受理服...,,1,2021-09-01 12:11:17.830,,,,"[0.017005516216158867, 0.013673987239599228, 0...",0.867331
1,2,上海市居住证业务-如何查询居住证办理信息？,居住证办理过程的状态信息，如受理时间、签发时间、有效期限、受理状态、审核状态、审核未通过原因...,,1,2021-09-01 12:11:17.830,,,,"[0.021357620134949684, 0.02255536988377571, -0...",0.868317
2,3,上海市居住证业务-办证时未提供手机号码或提供的号码有错误的，如何变更？,如需变更手机号码，请及时前往居住地街道（乡镇）社区事务受理服务中心办理信息变更手续。,,1,2021-09-01 12:11:17.830,,,,"[0.011813518591225147, 0.015791472047567368, -...",0.85094
3,4,上海市居住证业务-居住证有效期届满前30日内持证人可否自行办理签注手续？,可以。持证人自行办理签注手续成功后，系统不再自动签注。,,1,2021-09-01 12:11:17.830,,,,"[0.01432208251208067, 0.016711320728063583, 0....",0.881844
4,5,上海市居住证业务-收到居住证签注未通过或不符合自动签注条件的提醒短信，持证人应如何办理签注？,持证人可持证前往居住地街道（乡镇）社区事务受理服务中心咨询办理。,,1,2021-09-01 12:11:17.830,,,,"[0.0040360321290791035, 0.011765829287469387, ...",0.862049


In [10]:
# Rank the rows from most to least similar to the query.
sorted_df = df.sort_values('similarity', ascending=False)
sorted_df.head()

Unnamed: 0,ID,Questions,Answers,Attachment,RegBy,RegTime,Submit,SubmitBy,SubmitTime,embedding,similarity
7,8,上海市居住证业务-怎么办理临时居住证？,您好！《上海市临时居住证》须本人前往现居住证地的居住证受理点办理，须提供《来沪人员居住登记表...,,1,2021-09-01 12:11:17.830,,,,"[0.025090347975492477, 0.01781642436981201, -0...",0.888536
39,40,积分到期忘记续办了，已经过期半年了，还能续办吗？,根据目前政策可以续办，从系统中修改，提交完成后告知HR审批，审批完成后会告知需提供的纸质材料...,,1,2021-09-01 12:11:17.830,,,,"[0.012294166721403599, -0.0032590138725936413,...",0.884549
5,6,上海市居住证业务-收到居住证签注已通过的提醒短信，持证人是否需要办理卡面有效期限更新手续？,持证人如需更新卡面有效期限，可持证到就近街道（乡镇）社区事务受理服务中心办理。,,1,2021-09-01 12:11:17.830,,,,"[-0.0037496446166187525, 0.01798757165670395, ...",0.881984
3,4,上海市居住证业务-居住证有效期届满前30日内持证人可否自行办理签注手续？,可以。持证人自行办理签注手续成功后，系统不再自动签注。,,1,2021-09-01 12:11:17.830,,,,"[0.01432208251208067, 0.016711320728063583, 0....",0.881844
1,2,上海市居住证业务-如何查询居住证办理信息？,居住证办理过程的状态信息，如受理时间、签发时间、有效期限、受理状态、审核状态、审核未通过原因...,,1,2021-09-01 12:11:17.830,,,,"[0.021357620134949684, 0.02255536988377571, -0...",0.868317


In [11]:
# The first row after sorting is the closest match; pull out its Q/A pair.
top_match = sorted_df.iloc[0]
best_q, best_a = top_match['Questions'], top_match['Answers']
best_q, best_a

('上海市居住证业务-怎么办理临时居住证？',
 '您好！《上海市临时居住证》须本人前往现居住证地的居住证受理点办理，须提供《来沪人员居住登记表》、居民身份证、在本市的住所证明等相关材料办理。')

In [12]:
# ask() takes (prompt, q, a): the user's question is the prompt, and the
# retrieved best-matching Q/A pair is supplied as the preceding
# user/assistant turn.
a = ask(q, best_q, best_a)
# Show the question, the generated answer, and the retrieved pair side by side.
q, a, best_q, best_a

('我的居住证快到期了，怎么续办？',
 '您好！如果您的上海市居住证即将到期，您可以按照以下步骤进行续办：\n\n1. 提前办理：建议您在居住证到期前30天内开始办理续办手续，以确保能够及时办理完成。\n\n2. 准备材料：准备好以下材料：居民身份证、居住证原件、居住证到期前30天内的住所证明（例如租房合同、房产证明等）。\n\n3. 前往办理点：持上述材料前往您所在居住证受理点，办理续办手续。您可以在上海市公安局出入境管理局官网查询您所在区域的受理点。\n\n4. 填写申请表：在受理点填写《上海市居住证申请表》，并提交所需材料。\n\n5. 缴费：根据规定，续办居住证需要缴纳一定的费用，您需要按照要求缴纳相应费用。\n\n6. 等待审批：提交申请后，您需要等待相关部门的审批。一般情况下，审批时间为7个工作日。\n\n7. 领取居住证：审批通过后，您可以前往受理点领取新的居住证。\n\n请注意，以上步骤仅供参考，具体办理流程可能会因个人情况和政策变化而有所不同，建议您在办理前咨询当地公安局或相关部门，以确保能够顺利办理续办手续。',
 '上海市居住证业务-怎么办理临时居住证？',
 '您好！《上海市临时居住证》须本人前往现居住证地的居住证受理点办理，须提供《来沪人员居住登记表》、居民身份证、在本市的住所证明等相关材料办理。')

# 比较两句话的相似度

In [13]:
# Two Chinese sentences giving similar feedback in different wording:
# roughly "I think you did poorly here" vs. "I think there is room for
# improvement here".
# NOTE(review): `a` was bound to the model's answer in an earlier cell;
# this assignment rebinds it.
a = "我觉得你这里做的不好"
b = "我觉得你这里有待提高"

# One embedding request per sentence.
embedding_a = get_embedding(a)
embedding_b = get_embedding(b)

# Dot product of the two embeddings, displayed as the similarity score.
vector_similarity(embedding_a, embedding_b)

0.8964884928457343

# 生成知识库格式的文件

In [9]:
# Destination of the plain-text knowledge-base dump.
output_file = 'output.txt'

# Write one "Q: ... / A: ..." entry per row of df, each followed by a blank line.
with open(output_file, 'w', encoding='utf-8') as file:
    file.writelines(
        f'Q: {row["Questions"]}\nA: {row["Answers"]}\n\n'
        for _row_label, row in df.iterrows()
    )