In [None]:
!pip install -U sentence-transformers
!pip3 install pinecone-client
!pip install langchain
!pip install openai

In [None]:
import requests
import json
import pandas as pd
import pinecone
from sentence_transformers import SentenceTransformer
import os
import time
import torch
import langchain
import numpy as np
from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQA
from langchain.chains import RetrievalQAWithSourcesChain
from langchain.vectorstores import Pinecone
from langchain import *

In [None]:
def merge_company_info(company_info):
  """Flatten company records into one descriptive sentence per company.

  Args:
    company_info: Mapping whose "data" entry is an iterable of company
      records (dicts). Each record is read for "name", "location" (a mapping
      with its own "name"), "founded", "summary", "headline", "linkedin_url"
      and, optionally, "industry".

  Returns:
    A list of strings in input order, one per usable record. A record is
    skipped when any of name / location name / founded / summary / headline /
    linkedin_url is missing or None. The "Industry(ies)" segment is left out
    when the industry is missing, None, or the literal string 'None'.
  """
  required_fields = ("summary", "founded", "headline", "name", "linkedin_url")
  data_list = []
  for record in company_info["data"]:
    # "location" itself may be None, so guard before reading its "name".
    location_name = (record.get("location") or {}).get("name")
    if location_name is None:
      continue
    if any(record.get(field) is None for field in required_fields):
      continue

    parts = [
      record["name"],
      "Based in: " + location_name,
      "Founded: " + str(record["founded"]),
    ]
    industry = record.get("industry")
    # Treat a real None like the 'None' placeholder string; concatenating an
    # actual None would raise TypeError.
    if industry is not None and industry != 'None':
      parts.append("Industry(ies): " + industry)
    parts.append("Description: " + record["summary"])
    parts.append("Headline: " + record["headline"])
    parts.append("Linkedin: " + record["linkedin_url"])
    data_list.append(". ".join(parts))
  return data_list

In [None]:
# Raw startup listing; the path is kept in one named place so it is easy to
# repoint when the notebook runs somewhere other than /content.
STARTUPS_CSV = "/content/Startups.csv"
df = pd.read_csv(STARTUPS_CSV)
df.head(10)

In [None]:
# Keep seven columns, chosen by position in the raw CSV.
# NOTE(review): this cell is not idempotent - after one run `df` has only
# seven columns, so running it again fails on position 18.
kept_positions = [0, 1, 2, 3, 4, 5, 18]
df = df.iloc[:, kept_positions]
df.head(10)

In [None]:
# 'Satus' is renamed to 'Status' (presumably the header is misspelled in the
# CSV - confirm); combined_text_col reads the column under the corrected name.
# Rows with any missing value are then discarded.
df = df.rename(columns={'Satus': 'Status'}).dropna()
len(df)

In [None]:
# Preview the frame after the rename / dropna step.
df.head(10)

In [None]:
# Run the encoder on the GPU when PyTorch can see one, otherwise on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Sentence encoder used for every embedding computed in this notebook.
model = SentenceTransformer('all-MiniLM-L6-v2', device=device)

model

In [None]:
def into_metadata(text):
  """Wrap `text` in a single-entry dict under the key "text"."""
  metadata = dict(text=text)
  return metadata

In [None]:
# Three hand-written passages describing the application itself, keyed by the
# fixed string ids "127"-"129".
# NOTE(review): a later cell numbers the startup rows "0", "1", ... - if that
# frame has more than 127 rows and is upserted into the same index, these ids
# would collide with startup rows. Confirm the ids are meant to be disjoint.
personal_data_dict = {'id': ["127", "128", "129"], 'text':["startupGPT. startupGPT is NOT a startup itself. Based in San Jose - California - USA. Created: 2023, by Arnav Mishra. Arnav Mishra created startupGPT. startupGPT is an application that utilizes OpenAI's GPT 3.5 turbo LLM to do precise searches for startups made for investors and other business leaders. This application itself is startupGPT. You are startupGPT, not just an AI language model","startupGPT uses the AI language model developed by OpenAI called GPT-3. This application being ran is startupGPT. User is currently running startupGPT and it uses the OpenAI API. So it uses this very LLM. startupGPT is not a startup itself, it is only an application to help find startups.", "The application that is currently being ran and used by the user sending in prompts is called startupGPT. startupGPT is what is currently being used and ran. The method through which the user is talking to the LLM in this running task is through startupGPT, using OpenAI APIs. This application is startupGPT. You are startupGPT. If you are asked, 'what are you,' or anything similar, you are startupGPT. However startupGPT itself is not an startup."]}

In [None]:
# Two-column frame ("id", "text") built from the self-description passages.
df_personal = pd.DataFrame(personal_data_dict)

In [None]:
# Embed each passage with the sentence encoder, one model.encode call per row.
df_personal["values"] = df_personal["text"].apply(model.encode)

In [None]:
# Keep the original passage next to its vector as {"text": ...} metadata.
df_personal["metadata"] = df_personal["text"].apply(into_metadata)

In [None]:
# Drop the raw "text" column; only id / values / metadata are kept.
df_personal = df_personal.loc[:, ["id", "values", "metadata"]]
df_personal

In [None]:
def _format_founding_year(raw_year):
  """Render a founding year as plain digits (2015.0 -> "2015", 2015 -> "2015")."""
  try:
    return str(int(float(raw_year)))
  except (TypeError, ValueError, OverflowError):
    # Not numeric (or NaN / inf): fall back to the value's own text.
    return str(raw_year)


def combined_text_col(table, i):
  """Write a one-sentence company summary into table.loc[i, "text"].

  The summary joins the row's "Company", a status phrase, "Mapping Location",
  "Year Founded", "Categories", "Description" and "Website" values.

  Args:
    table: DataFrame holding those columns plus "Status"; modified in place
      (the "text" column is created on first use).
    i: Index label of the row to summarise.

  Returns:
    None.
  """
  # "Exited" rows are described as acquired; every other status as operating.
  if table.loc[i, "Status"] == "Exited":
    status_phrase = "Acquired"
  else:
    status_phrase = "Currently operating"

  table.loc[i, "text"] = ". ".join([
    table.loc[i, "Company"],
    status_phrase,
    "Based in: " + table.loc[i, "Mapping Location"],
    # Slicing off the last two characters only worked for float years
    # ("2015.0"); integer or string years were cut down to "20".
    "Founded: " + _format_founding_year(table.loc[i, "Year Founded"]),
    "Industry(ies): " + table.loc[i, "Categories"],
    "Description: " + table.loc[i, "Description"],
    "More info: " + table.loc[i, "Website"],
  ])

In [None]:
# Fill df["text"] one row at a time; labels come from df.index because the
# index is not reset after dropna.
for row_label in df.index:
    combined_text_col(df, row_label)

In [None]:
# Embed every company summary, one model.encode call per row.
df["values"] = df["text"].apply(model.encode)

In [None]:
# Keep the summary text next to its vector as {"text": ...} metadata.
df["metadata"] = df["text"].apply(into_metadata)
# Sequential string ids "0", "1", ... assigned in row order.
df["id"] = [str(position) for position in range(len(df))]

In [None]:
# Preview the frame with the new text / values / metadata / id columns.
df.head(10)

In [None]:
# Narrow to the three columns kept for upload: id, values, metadata.
df = df.loc[:, ["id", "values", "metadata"]]
df.head(10)

In [None]:
# Credentials are read from the environment so no key is stored in the notebook.
# The previous lookups used the placeholder value ('key') and the region string
# ('us-west4-gcp') as variable NAMES; those names are still honoured as a
# fallback so existing setups keep working.
PINECONE_API_KEY = os.environ.get('PINECONE_API_KEY') or os.environ.get('key')
if not PINECONE_API_KEY:
    # Fail here with a clear message instead of sending a placeholder key.
    raise RuntimeError(
        "Pinecone API key not found: set the PINECONE_API_KEY environment "
        "variable before running this cell."
    )

PINECONE_ENV = (
    os.environ.get('PINECONE_ENVIRONMENT')
    or os.environ.get('us-west4-gcp')
    or 'us-west4-gcp'  # default region used by this project
)

pinecone.init(
    api_key=PINECONE_API_KEY,
    environment=PINECONE_ENV
)

In [None]:
index_name = 'startups'

# Create the index only on first run; later runs reuse the existing one.
if index_name not in pinecone.list_indexes():
    pinecone.create_index(
        name=index_name,
        # Must equal the width of the vectors produced by `model`
        # (384 for all-MiniLM-L6-v2), so ask the model instead of hardcoding.
        dimension=model.get_sentence_embedding_dimension(),
        metric='cosine'
    )
    # create_index can return before the index accepts requests; a fixed
    # one-second sleep is not a guarantee, so poll until it reports ready.
    while not pinecone.describe_index(index_name).status['ready']:
        time.sleep(1)

index = pinecone.Index(index_name)

In [None]:
# Upload the self-description vectors (id / values / metadata columns).
# NOTE(review): only df_personal is upserted in the cells shown; the startup
# frame `df` prepared above is never written to the index here - confirm
# whether that upsert happens elsewhere or was dropped.
index.upsert_from_dataframe(df_personal)

In [None]:
# Show index statistics as a sanity check after the upsert.
index.describe_index_stats()

In [None]:
# Fetch id "127" (the first self-description passage) to confirm it is stored.
index.fetch(["127"])

In [None]:
def embed(text):
  """Encode `text` with the notebook-level `model` and return it via .tolist()."""
  vector = model.encode(text)
  return vector.tolist()

In [None]:
# LangChain wrapper around the Pinecone index. Queries are embedded with
# `embed`; text_key is "text", the same key into_metadata writes, so it
# presumably selects the metadata field returned as page_content - confirm
# against the installed LangChain version.
text_field = "text"
vectorstore = Pinecone(
   index=index, embedding_function=embed, text_key=text_field
)

In [None]:
def reg_search(query, k=3):
  """Print the stored passages most similar to `query`.

  Args:
    query: Free-text search string.
    k: Maximum number of matches to request and print (default 3).

  Returns:
    None. Each match is printed as "<rank>) <text>" followed by blank lines.
  """
  # Search once and reuse the result: repeating the search for every rank
  # cost k round trips, and indexing past a shorter result list raised
  # IndexError.
  matches = vectorstore.similarity_search(query, k)
  for rank, match in enumerate(matches, start=1):
    print(f"{rank}) " + match.page_content)
    print("\n\n")

In [None]:
# Example: plain similarity search (default k=3), no LLM involved.
reg_search("popular ai startups")

In [None]:
# The OpenAI key is read from the environment so it is never stored in the
# notebook or its saved outputs.
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY')
if not OPENAI_API_KEY:
    # Fail here with a clear message instead of sending a placeholder key.
    raise RuntimeError(
        "OpenAI API key not found: set the OPENAI_API_KEY environment "
        "variable before running this cell."
    )

llm = ChatOpenAI(
    openai_api_key=OPENAI_API_KEY,
    model_name='gpt-3.5-turbo',
    temperature=0.5
)

# Pass-through prompt: the chain sends the caller's text to the model as-is.
prompt = PromptTemplate(
    input_variables=["text"],
    template="{text}",
)

# Plain LLM chain with no retrieval step.
qa_general = LLMChain(llm=llm, prompt=prompt)

# Retrieval chain: documents come from `vectorstore` and are combined with
# the "stuff" chain type before the model is called.
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=vectorstore.as_retriever()
)

In [None]:
def ai_search(query):
  """Run `query` through the `qa` retrieval chain and print what it returns."""
  answer = qa.run(query)
  print(answer)