In [32]:
!pip install pinecone-client
!pip install openai
!pip install tiktoken

import collections
import json
import os
import re
import urllib.parse
import urllib.request
from itertools import product

import chardet
import numpy as np
import openai
import pandas as pd
import pinecone
import tiktoken
import tqdm
from openai.embeddings_utils import get_embedding

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [33]:
# Credentials are taken from the environment when available so that real keys
# never have to be typed into (and saved with) the notebook. The original
# placeholders remain as the fallback values.
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY', '<enter your openai api key>')
PINECONE_API_KEY = os.environ.get('PINECONE_API_KEY', '<enter your pinecone api key>')
# INDEX_DIMENSION must equal the length of the embedding vectors upserted into the index.
INDEX_NAME, INDEX_DIMENSION = 'aaltobot-search', 1536
openai.api_key = OPENAI_API_KEY

In [34]:
# Authenticate against Pinecone; the environment name is shown in the
# Pinecone console next to the API key.
pinecone.init(api_key=PINECONE_API_KEY, environment='us-west1-gcp')

# Create the index on first use only, then open a handle to it.
if not (INDEX_NAME in pinecone.list_indexes()):
    pinecone.create_index(
        name=INDEX_NAME,
        dimension=INDEX_DIMENSION,
    )

index = pinecone.Index(index_name=INDEX_NAME)

In [35]:
# Used below as the Pinecone namespace for both the upsert and the query.
clientURL = "https://aalto.fi"
# Path of the source CSV that is loaded in the next cell.
CSV_Filepath = 'Aalto Admissions Data.csv'

In [36]:
# First pass: read the raw bytes and let chardet guess the text encoding.
with open(CSV_Filepath, 'rb') as file:
    encoding_guess = chardet.detect(file.read())
detected_encoding = encoding_guess['encoding']

# Second pass: parse the CSV as text using the detected encoding.
with open(CSV_Filepath, 'r', encoding=detected_encoding) as file:
    df = pd.read_csv(file)

In [37]:
def get_num_of_tokens(message, model="gpt-3.5-turbo-0301"):
    """Estimate the number of chat tokens used by a single message string.

    The count is the encoded length of ``message`` plus the fixed per-message
    and reply-priming overhead of the chat format for ``model``.

    Args:
        message: The text to measure.
        model: Chat model name. ``"gpt-3.5-turbo"`` and ``"gpt-4"`` are
            resolved to ``"gpt-3.5-turbo-0301"`` and ``"gpt-4-0314"``.

    Returns:
        int: Estimated token count.

    Raises:
        NotImplementedError: If ``model`` is not one of the supported names.
    """
    # Resolve the floating aliases to their pinned snapshots by recursing into
    # this function (previously this called an undefined helper and raised
    # NameError).
    if model == "gpt-3.5-turbo":
        print("Warning: gpt-3.5-turbo may change over time. Returning num tokens assuming gpt-3.5-turbo-0301.")
        return get_num_of_tokens(message, model="gpt-3.5-turbo-0301")
    if model == "gpt-4":
        print("Warning: gpt-4 may change over time. Returning num tokens assuming gpt-4-0314.")
        return get_num_of_tokens(message, model="gpt-4-0314")

    try:
        encoding = tiktoken.encoding_for_model(model)
    except KeyError:
        print("Warning: model not found. Using cl100k_base encoding.")
        encoding = tiktoken.get_encoding("cl100k_base")

    if model == "gpt-3.5-turbo-0301":
        tokens_per_message = 4  # every message follows <|start|>{role/name}\n{content}<|end|>\n
        tokens_per_name = -1  # if there's a name, the role is omitted
    elif model == "gpt-4-0314":
        tokens_per_message = 3
        tokens_per_name = 1
    else:
        raise NotImplementedError(f"""get_num_of_tokens() is not implemented for model {model}.""")

    # NOTE(review): tokens_per_name is applied even though no name is passed in;
    # kept as-is so existing token counts do not change -- confirm this is intended.
    num_tokens = tokens_per_message
    num_tokens += len(encoding.encode(message))
    num_tokens += tokens_per_name
    num_tokens += 3  # every reply is primed with <|start|>assistant<|message|>
    return num_tokens

In [38]:
def split_sentences(text):
    """Split ``text`` into sentences, keeping each sentence's end punctuation.

    A boundary is whitespace that directly follows ``.``, ``!`` or ``?``.
    Punctuation that is not followed by whitespace (inside URLs, or numbers
    such as ``15.00``) does not split, so those tokens stay intact.

    Args:
        text: The text to split.

    Returns:
        list[str]: Non-empty sentences with surrounding whitespace stripped.
    """
    sentence_boundary = re.compile(r'(?<=[.!?])\s+')
    pieces = sentence_boundary.split(text)
    return [piece.strip() for piece in pieces if piece.strip()]

def split_text_by_tokens(df, column='Text', max_tokens=600):
    """Split rows whose text exceeds ``max_tokens`` into sentence-aligned chunks.

    Rows that already fit are passed through unchanged. An oversized row is
    replaced by one row per chunk; each chunk row is a copy of the source row
    with ``column`` set to the chunk text and ``'Title'`` suffixed with a
    1-based chunk counter. Chunk rows keep the index label of their source row.

    A chunk is only emitted when it contains text, so a first sentence that is
    longer than ``max_tokens`` no longer produces an empty leading chunk. A
    single sentence longer than ``max_tokens`` becomes its own oversized chunk.

    Args:
        df: DataFrame with a ``'Title'`` column and the text column.
        column: Name of the text column to split.
        max_tokens: Token budget per chunk, measured with ``get_num_of_tokens``.

    Returns:
        pandas.DataFrame: The pass-through and chunk rows, in input order.
    """
    new_rows = []

    for _, row in df.iterrows():
        text = row[column]

        if get_num_of_tokens(text) <= max_tokens:
            new_rows.append(row)
            continue

        sentences = split_sentences(text)
        if not sentences:
            # Nothing to split on: keep the row as-is rather than emit an empty chunk.
            new_rows.append(row)
            continue

        chunk_sentences = []
        chunk_tokens = 0
        title_counter = 1

        for sentence in sentences:
            sentence_tokens = get_num_of_tokens(sentence)

            # Flush only a non-empty chunk; this guard is what prevents the
            # empty first chunk when the very first sentence is over budget.
            if chunk_sentences and chunk_tokens + sentence_tokens > max_tokens:
                new_rows.append(_make_chunk_row(row, column, chunk_sentences, title_counter))
                chunk_sentences = []
                chunk_tokens = 0
                title_counter += 1

            chunk_sentences.append(sentence)
            chunk_tokens += sentence_tokens

        # The loop always leaves at least one sentence pending here.
        new_rows.append(_make_chunk_row(row, column, chunk_sentences, title_counter))

    return pd.DataFrame(new_rows)


def _make_chunk_row(row, column, chunk_sentences, title_counter):
    """Return a copy of ``row`` holding one chunk's text and a numbered title."""
    new_row = row.copy()
    new_row[column] = ' '.join(chunk_sentences).strip()
    new_row['Title'] = f"{row['Title']} {title_counter}"
    return new_row

In [39]:
def add_num_of_tokens(df, column='Text'):
    """Attach a ``'Tokens'`` column holding the token count of each ``column`` entry.

    The column is assigned on ``df`` itself, and that same frame is returned.
    """
    df['Tokens'] = [get_num_of_tokens(entry) for entry in df[column]]
    return df

In [42]:
# Drop the 'ID' column, split over-long 'Text' entries into chunks, then record token counts.
df_without_ID = df.drop(columns=['ID'])
df_chunked = split_text_by_tokens(df_without_ID)
# add_num_of_tokens assigns the 'Tokens' column on the frame it receives and returns that
# same frame, so df_chunked_tokens and df_chunked refer to one object.
df_chunked_tokens = add_num_of_tokens(df_chunked, column='Text')
df_chunked_tokens.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 118 entries, 0 to 117
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Title   118 non-null    object
 1   Source  118 non-null    object
 2   Text    118 non-null    object
 3   Tokens  118 non-null    int64 
dtypes: int64(1), object(3)
memory usage: 4.6+ KB


In [52]:
def get_metadata_from_df_row(df_row):
    """Build the metadata dict for one row: Title, Text, Source and a token count of Text."""
    metadata = {field: df_row[field] for field in ('Title', 'Text', 'Source')}
    metadata['Tokens'] = get_num_of_tokens(metadata['Text'])
    return metadata

def get_all_metadata_from_df(df):
    """Return one metadata dict per row of ``df``, in row order."""
    return [get_metadata_from_df_row(row) for _, row in df.iterrows()]

def get_processed_df_for_pinecone(df):
    """Return a frame with ``ID`` (from Title), ``Text`` and per-row ``Metadata`` columns."""
    columns = {}
    columns["ID"] = df['Title']
    columns["Text"] = df['Text']
    columns["Metadata"] = get_all_metadata_from_df(df)
    return pd.DataFrame(columns)

In [53]:
# Reshape the chunked frame into ID / Text / Metadata columns for the upload helpers.
df_processed = get_processed_df_for_pinecone(df_chunked_tokens)

In [54]:
# Preview the first rows of the processed frame.
df_processed.head()

Unnamed: 0,ID,Text,Metadata
0,Admission Services 1,How to apply to Aalto? Aalto University has an...,"{'Title': 'Admission Services 1', 'Text': 'How..."
1,Admission Services 2,Admission to the Bachelor's Programme in Inter...,"{'Title': 'Admission Services 2', 'Text': 'Adm..."
2,Admission Services Contact Information,Contact Information for Aalto University Admis...,{'Title': 'Admission Services Contact Informat...
3,Admission Services Applying to Bachelors Progr...,Applying to Bachelor's Programmes: On this pag...,{'Title': 'Admission Services Applying to Bach...
4,Admission Services Applying to Bachelors Progr...,Field of Science and Technology: Aalto Bachelo...,{'Title': 'Admission Services Applying to Bach...


In [55]:
def get_metadatas_from_df(df, column="Metadata"):
    """Return the values of the metadata column as a plain list."""
    metadata_series = df[column]
    return metadata_series.tolist()

def get_embeddings_from_df(df, column="Text"):
    """Embed every entry of ``column`` with text-embedding-ada-002, one call per entry.

    Returns the embeddings as a list, in row order.
    """
    return [
        get_embedding(entry, engine="text-embedding-ada-002")
        for entry in df[column]
    ]

def get_ids_from_df(df, column="ID"):
    """Return the values of the ID column as a plain list."""
    id_series = df[column]
    return id_series.tolist()

In [56]:
def get_vectors_to_upload_to_pinecone(df):
    """Return a list of ``(vector_id, vector_values, vector_metadata)`` tuples for ``df``.

    Ids, embeddings and metadata are gathered in that order and zipped row by row.
    """
    return list(
        zip(
            get_ids_from_df(df),          # vector ids
            get_embeddings_from_df(df),   # embedding values
            get_metadatas_from_df(df),    # vector metadata
        )
    )

In [57]:
# Embeds every row of df_processed (one get_embedding call per row) and pairs each
# embedding with its ID and metadata.
pinecone_vectors = get_vectors_to_upload_to_pinecone(df_processed)

In [59]:
# Inspect the first (id, values, metadata) tuple; note this prints the whole embedding vector.
print(pinecone_vectors[0])
type(pinecone_vectors[0])

('Admission Services 1', [0.00721688661724329, 0.0013942444929853082, 0.012835748493671417, -0.032811060547828674, -0.02143157459795475, 0.010142303071916103, -0.03203782066702843, -0.0007325623300857842, -0.02382861264050007, -0.014549758285284042, 0.007551955990493298, -0.00532567547634244, -0.006630514282733202, -0.021380025893449783, 0.0025516848545521498, -0.03567203879356384, 0.017436513677239418, 0.005087260622531176, 0.004082051571458578, 0.002517855726182461, -0.03425443544983864, 0.007094457279890776, -0.020413478836417198, -0.028068533167243004, -0.017140105366706848, 0.005857276730239391, 0.006591852754354477, -0.027553042396903038, -0.0011550240451470017, -0.019691789522767067, 0.017526723444461823, 0.012378249317407608, -0.028532475233078003, -0.004726416431367397, -0.012520009651780128, -0.002579070394858718, -0.014820392243564129, -0.001212211442179978, 0.011050857603549957, -0.010219627059996128, 0.013828069902956486, 0.0133254649117589, 0.021972842514514923, -0.007017

tuple

In [60]:
def upload_vectors_to_pinecone(
    dataframe,
    pinecone_index,
    clientURL,
    chunk_size=20000,
    upsert_size=500):
    """Embed ``dataframe`` and upsert the vectors into ``pinecone_index``.

    The frame is processed ``chunk_size`` rows at a time. The vectors built for
    each chunk are sent as asynchronous upsert requests of at most
    ``upsert_size`` vectors each, and all requests of a chunk are awaited
    before its status line is printed. With the defaults, a frame of up to 500
    rows is sent as a single request.

    Args:
        dataframe: DataFrame accepted by ``get_vectors_to_upload_to_pinecone``.
        pinecone_index: Index handle exposing ``upsert(..., async_req=True)``.
        clientURL: Namespace the vectors are written to.
        chunk_size: Number of rows embedded per chunk.
        upsert_size: Maximum number of vectors per upsert request.
            NOTE(review): confirm the default against Pinecone's request-size limits.

    Returns:
        list: The async result objects of every upsert request, in send order.

    Raises:
        ValueError: If ``chunk_size`` or ``upsert_size`` is smaller than 1.
    """
    if chunk_size < 1 or upsert_size < 1:
        raise ValueError("chunk_size and upsert_size must be positive integers")

    async_results = []
    for chunk_start in range(0, len(dataframe), chunk_size):
        chunk = dataframe.iloc[chunk_start:chunk_start + chunk_size]
        vectors = get_vectors_to_upload_to_pinecone(chunk)

        # Send this chunk's vectors in batches of at most upsert_size.
        chunk_results = [
            pinecone_index.upsert(
                vectors[batch_start:batch_start + upsert_size],
                namespace=clientURL,
                async_req=True,
            )
            for batch_start in range(0, len(vectors), upsert_size)
        ]

        # wait for results
        for chunk_result in chunk_results:
            chunk_result.get()
        is_all_successful = all(result.successful() for result in chunk_results)

        # report chunk upload status
        print(
            f'All upserts in chunk successful: {is_all_successful}. '
            f'Vectors uploaded: {len(vectors):>3}.'
        )
        async_results.extend(chunk_results)

    return async_results

In [61]:
# NOTE: upload_vectors_to_pinecone builds the vectors from the dataframe itself, so the
# embeddings are computed again here rather than reusing pinecone_vectors.
async_results = upload_vectors_to_pinecone(df_processed, index, clientURL) 

All upserts in chunk successful with indexTrue. Vectors uploaded: 118.


In [62]:
# Embed a sample question with the same embedding model used for the documents, then ask
# the index for the top 2 matches (including their metadata) in the clientURL namespace.
query = "when can I apply for bachelors?"
query_embedding = get_embedding(query, engine="text-embedding-ada-002")
response = index.query([query_embedding], top_k=2, namespace=clientURL, include_metadata=True)

In [63]:
from IPython.display import display, Markdown


def printmd(x):
    """Render the Markdown string ``x`` as rich notebook output."""
    return display(Markdown(x))


# Show the raw Pinecone response inside a fenced code block.
printmd(f"#### A sample response from Pinecone \n ==============\n \n ```python\n{response}\n```")

#### A sample response from Pinecone 
 ==============
 
 ```python
{'matches': [{'id': 'Masters Admissions Are you applying with an incomplete '
                    'degree?',
              'metadata': {'Source': 'https://www.aalto.fi/en/study-at-aalto/applying-to-masters-programmes',
                           'Text': 'Are you applying with an incomplete '
                                   'degree? You can apply to a master’s '
                                   'programme with an incomplete bachelor’s '
                                   'degree, provided that you will graduate by '
                                   '31 July 2023. If you are accepted to the '
                                   'university, you must deliver a certified '
                                   'copy of your degree certificate by 17 '
                                   'August 2023 at 15.00 (3 pm, GMT +3). In '
                                   'case you are not able to graduate or fail '
                                   'to deliver your degree certificate by the '
                                   'deadline, your conditionally granted study '
                                   'right will be cancelled. If you apply with '
                                   'an incomplete degree, note that it may not '
                                   'be possible to graduate during the summer '
                                   'months. You should look into the '
                                   'graduation schedules and requirements of '
                                   'your own faculty or school in good time. '
                                   'Only studies included in an official '
                                   'transcript of records submitted with the '
                                   'online application form by the deadline (9 '
                                   'January 2023, 3 pm) can be taken into '
                                   'consideration in the evaluation. Updated '
                                   'transcripts or any studies completed after '
                                   'the deadline cannot be taken into '
                                   'consideration.',
                           'Title': 'Masters Admissions Are you applying with '
                                    'an incomplete degree?',
                           'Tokens': 203.0},
              'score': 0.85665977,
              'values': []},
             {'id': 'Admission Services Applying to Bachelors Programmes 1',
              'metadata': {'Source': 'https://www.aalto.fi/en/admission-services/applying-to-bachelors-programmes',
                           'Text': "Applying to Bachelor's Programmes: On this "
                                   'page (link: '
                                   'https://www.aalto.fi/en/admission-services/applying-to-bachelors-programmes '
                                   ") are listed all Aalto University's "
                                   "bachelor's programmes taught in English. "
                                   'Applying is possible once a year during '
                                   'the national spring joint application to '
                                   'higher education. You can access the study '
                                   'option descriptions and applying '
                                   'guidelines through the links below. The '
                                   "application period for bachelor's "
                                   'programmes in English is 4–18 January 2023 '
                                   'for studies starting in autumn 2023. The '
                                   'admission requirements and applying '
                                   'guidelines are published on the Aalto '
                                   'website and in the national Studyinfo '
                                   'service. Study option descriptions are '
                                   'available on our website. The application '
                                   'form is filled out in the Studyinfo '
                                   'service. Below you can find links to the '
                                   'study options both on the Aalto website '
                                   'and in the Studyinfo service. All Aalto '
                                   "University's study options and programmes "
                                   'can be found at: '
                                   'https://www.aalto.fi/en/study-options. '
                                   "Bachelor's programmes in English "
                                   '(application period: 4–18 January 2023) '
                                   "Field of Art and Design: Bachelor's "
                                   'Programme in Design (Link: '
                                   'https://www.aalto.fi/en/study-options/design-bachelor-of-arts-master-of-arts-art-and-design '
                                   '). Field of Business and Economics: '
                                   "Bachelor's Programme in Economics (Link: "
                                   'https://www.aalto.fi/en/study-options/bachelors-programme-in-economics '
                                   "). Bachelor's Programme in International "
                                   'Business (Link: '
                                   'https://www.aalto.fi/en/study-at-aalto/admission-to-the-bachelors-programme-in-international-business-bachelor-mikkeli-and-master-otaniemi '
                                   '). ',
                           'Title': 'Admission Services Applying to Bachelors '
                                    'Programmes 1',
                           'Tokens': 336.0},
              'score': 0.833929479,
              'values': []}],
 'namespace': 'https://aalto.fi'}
```

In [68]:
def get_context_string(response):
    """Format the matches of a query response as a delimited context block.

    The block lists every match's metadata ``Text`` (one per line), then a
    ``SOURCES:`` section with every match's metadata ``Source`` (one per line).
    """
    pieces = ["--- Start Context ---\n"]
    pieces.extend(str(match['metadata']['Text']) + "\n" for match in response['matches'])
    pieces.append("SOURCES:\n")
    pieces.extend(str(match['metadata']['Source']) + "\n" for match in response['matches'])
    pieces.append("--- End Context ---")
    return "".join(pieces)

# Show the context block assembled from the sample query's matches.
print(get_context_string(response))

--- Start Context ---
Are you applying with an incomplete degree? You can apply to a master’s programme with an incomplete bachelor’s degree, provided that you will graduate by 31 July 2023. If you are accepted to the university, you must deliver a certified copy of your degree certificate by 17 August 2023 at 15.00 (3 pm, GMT +3). In case you are not able to graduate or fail to deliver your degree certificate by the deadline, your conditionally granted study right will be cancelled. If you apply with an incomplete degree, note that it may not be possible to graduate during the summer months. You should look into the graduation schedules and requirements of your own faculty or school in good time. Only studies included in an official transcript of records submitted with the online application form by the deadline (9 January 2023, 3 pm) can be taken into consideration in the evaluation. Updated transcripts or any studies completed after the deadline cannot be taken into consideration.
A