In [None]:
# Install the OpenAI library quietly (without output)
!pip install -q openai

# Import necessary libraries
import openai  # For accessing OpenAI's APIs
import os      # For managing environment variables
import pandas as pd  # For handling data in tabular format

# Read the OpenAI API key from the environment instead of hardcoding it in the
# notebook: a key written in a cell is leaked as soon as the file is shared or
# committed. If the variable is not set, ask for it with a hidden prompt.
if not os.getenv('OPENAI_API_KEY'):
    from getpass import getpass  # only needed for the interactive fallback
    os.environ['OPENAI_API_KEY'] = getpass('OpenAI API key: ')
openai.api_key = os.getenv('OPENAI_API_KEY')

# Load the unicorn startup dataset into a DataFrame
df = pd.read_csv("unicorns.csv")

# Display the column names in the dataset
print(df.columns)

# Show the first 5 rows to get a quick overview
print(df.head())

# Display detailed info about the dataset: column types, null values, memory usage, etc.
# DataFrame.info() prints its report itself and returns None, so it must not be
# wrapped in print() (that would emit a stray "None" after the report).
df.info()


In [None]:
import ast  # For safely evaluating string representations of Python data structures

# Function to generate a summary text for each company based on its metadata
def summary(company, crunchbase_url, city, country, industry, investor_list):
    """Build a short natural-language description of one company.

    Args:
        company: Company name.
        crunchbase_url: URL appended to the text as the "more info" link.
        city: Headquarters city.
        country: Headquarters country.
        industry: Industry / field of the company.
        investor_list: The investors, given as a stringified Python list
            (e.g. "['A', 'B']"), a plain comma-separated string, an actual
            list/tuple/set, or a missing value (None or NaN).

    Returns:
        A single string describing the company, its investors and its URL.
    """
    # Turn the raw cell value into a Python object. Strings are first tried as
    # Python literals; anything that is not a valid literal is treated as
    # plain comma-separated text instead of raising.
    if isinstance(investor_list, str):
        try:
            parsed = ast.literal_eval(investor_list)
        except (ValueError, SyntaxError):
            parsed = investor_list.split(",")
    else:
        parsed = investor_list

    # Normalize to a sequence of candidate names.
    if isinstance(parsed, (list, tuple, set)):
        candidates = parsed
    elif parsed is None or (isinstance(parsed, float) and parsed != parsed):
        # None or NaN (NaN is the only float not equal to itself): no investor data
        candidates = []
    else:
        # A single scalar (e.g. the literal "'Sequoia'") is one investor,
        # not an iterable of characters.
        candidates = [parsed]

    names = [str(candidate).strip() for candidate in candidates]
    names = [name for name in names if name]

    # Join with ", " so the sentence does not end in a dangling comma.
    if names:
        investors = "The investors in the company are " + ", ".join(names)
    else:
        investors = "The investors in the company are not listed"

    # Build a full descriptive sentence about the company
    text = (
        f"{company} has headquarters in {city} in {country} and is in the field of {industry}. "
        f"{investors}. More info at {crunchbase_url}"
    )
    return text

import textwrap


def _summarize_row(row):
    """Pass one DataFrame row's metadata fields to summary() in positional order."""
    return summary(
        row['Company'],
        row['Crunchbase Url'],
        row['City'],
        row['Country'],
        row['Industry'],
        row['Investors'],
    )


# Create the 'summary' column by describing every company row in turn
df['summary'] = df.apply(_summarize_row, axis=1)

# Optional preview: print the summary stored under index label 1,
# wrapped so no output line exceeds 100 characters
text = df['summary'][1]
wrapped_text = textwrap.fill(text, width=100)
print(wrapped_text)


In [None]:
# Install the tiktoken library (used for token counting, compatible with OpenAI models)
!pip install tiktoken

import tiktoken  # Tokenizer library for OpenAI models

# Token counter: how many tokens a string occupies under a given tiktoken encoding
def num_tokens_from_string(string, encoding_name="cl100k_base"):
    """Return the number of tokens `string` encodes to.

    Args:
        string: Text to tokenize.
        encoding_name: Name of the tiktoken encoding to look up
            (defaults to "cl100k_base").

    Returns:
        The length of the encoded token sequence.
    """
    tokens = tiktoken.get_encoding(encoding_name).encode(string)
    return len(tokens)

# Count the tokens of every summary and keep the counts in their own column
df["token_count"] = df['summary'].map(num_tokens_from_string)

# Total token volume across all summaries
total_tokens = int(df["token_count"].sum())

# Estimated embedding cost, priced at $0.0001 per 1,000 tokens
price_per_1k_tokens = 0.0001
cost = total_tokens * price_per_1k_tokens / 1000

# Report the total token count and the estimated cost
print("Total tokens: {}, Estimated cost: ${:.4f}".format(total_tokens, cost))


In [None]:
# Function to get an embedding vector for a given text using OpenAI's embedding model
def get_embedding(text, model='text-embedding-ada-002'):
    """Request an embedding for `text` from the OpenAI embeddings endpoint.

    Args:
        text: The text to embed.
        model: Name of the embedding model to use. Defaults to
            'text-embedding-ada-002', the model this notebook was written for.
            Vectors are only comparable when produced by the same model, so
            use one model for both the stored summaries and the queries.

    Returns:
        The embedding of the first (and only) input, taken from
        `response.data[0].embedding`.
    """
    response = openai.embeddings.create(
        input=text,
        model=model,
    )
    return response.data[0].embedding  # Return the embedding vector from the response

# Embed every summary (one get_embedding call per row) and keep the vectors in a new column
summary_embeddings = df['summary'].map(get_embedding)
df['embedding'] = summary_embeddings

# Persist the DataFrame, including the embeddings, to CSV without the index column
df.to_csv('my_embedding.csv', index=False)


In [None]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity  # For measuring similarity between vectors

# Cosine similarity of two individual vectors
def vector_similarity(vec1, vec2):
    """Return the cosine similarity between `vec1` and `vec2` as a single value.

    Each vector is wrapped in a one-element list so it is passed to
    cosine_similarity as a one-row batch; the only entry of the resulting
    matrix is the similarity of the pair.
    """
    similarity_matrix = cosine_similarity([vec1], [vec2])
    return similarity_matrix[0][0]

# Question to answer from the company summaries
prompt = "What does the company Minio do and where is the HQ?"

# Embed the question through the same get_embedding helper used for the summaries
prompt_embedding = get_embedding(prompt)

# Score each stored company embedding against the question embedding
df['prompt_similarity'] = df['embedding'].apply(
    vector_similarity, args=(prompt_embedding,)
)

# Keep the single row with the highest similarity score
top_match = df.nlargest(1, 'prompt_similarity')
most_similar = top_match.iloc[0]

# Show the summary of the best-matching company
print(most_similar['summary'])


In [None]:
# Prepare the context and question prompt for the chat completion model.
# The context is the summary retrieved for `prompt`, so the question sent to
# the model must be that same `prompt`; asking about a different company than
# the one retrieved would leave the model without relevant context.
context = most_similar['summary']

follow_up_prompt = f'''Only answer the question below if you have 100% certainty of the facts.
Context: {context}
Q: {prompt}
A:
'''

# Send the prompt to OpenAI's chat completion API (GPT-3.5 Turbo)
response = openai.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[
        {"role": "user", "content": follow_up_prompt}
    ],
    temperature=0,    # Set temperature to 0 for deterministic and precise answers
    max_tokens=512    # Limit response length to 512 tokens
)

# Print the model's reply
print(response.choices[0].message.content)
