#### Installing Needed Package

In [46]:
#!pip install openai

#### Importing Necessary Packages

In [47]:
import pandas as pd
import openai, numpy as np
from openai.embeddings_utils import get_embedding, cosine_similarity
from sklearn.metrics import classification_report, accuracy_score

#### Retrieving the API Key

In [48]:
api_key = 'fake-api-key'
openai.api_key = api_key

#### Quick side example on how EMbeddings work

In [49]:
resp = openai.Embedding.create(
    input=["eating food", "I am hungry", "I am traveling", "exploring new places"],
    engine="text-similarity-davinci-001" 
)

embedding_a = resp['data'][0]['embedding']
embedding_b = resp['data'][1]['embedding']
embedding_c = resp['data'][2]['embedding']
embedding_d = resp['data'][3]['embedding']

np.dot(embedding_a, embedding_b) # both vectors are related so dot prod is close to 1

0.8743983922527889

#### Loading Dataset that contains Embeddings

In [50]:
data_path = "https://cdn.openai.com/API/examples/data/fine_food_reviews_with_embeddings_1k.csv"
df = pd.read_csv(data_path)
# Changing from str to Array
df["babbage_similarity"] = df["babbage_similarity"].apply(eval).apply(np.array)
df["babbage_search"] = df["babbage_search"].apply(eval).apply(np.array)

#### Implementing Semantic Search

In [51]:
def search_reviews(df, search_query, n=3):
  embedding = get_embedding(
      search_query,
      engine="text-search-babbage-query-001"
  )
  df['similarities'] = df.babbage_search.apply(lambda x: cosine_similarity(x, embedding))
  top_n = df.sort_values("similarities", ascending=False).head(n)
  return top_n

In [52]:
# embeding has to be created in real time for each search
res = search_reviews(df, "sweet rice", n=3)
res['combined'].to_list()

['Title: I love this stuff; Content: Hard to find in the grocery. I buy it by the case online. One box makes four lunches with some blackened chicken for protein. There is something about the sweet pineapple and curry that makes this rice mix delicious.',
 'Title: Whole Grain Food for great Health Benefit; Content: I love the taste and ease of cooking this rice.  The reason for making the purchase was because of the fact that is is a whole grain food item with a great deal of health benefits.  Made the puchase for a diabetic because it is a low glycemic food item.<br /><br />I must say that everyone who I have shared the Black Rice with enjoyed it the most when it was cooked in a broth.<br /><br />I like the texture and taste with or without broth.  I must say that the health benefits of the rice is what I was seeking and have found in this rice.  Worth every penny that I spent on it.',
 'Title: It\'s FANTASTIC! Mixes with Many kinds of Soups, etc. Yes.; Content: I just love it, and I 

#### Splitting dataset into Training and Testing

In [53]:
from sklearn.model_selection import train_test_split

X, y = list(df.babbage_similarity.values), df.Score
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#### Implementing Classification by Logistic Regression

In [54]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)
print(f"Accuracy is: {accuracy_score(y_pred, y_test)}")
print(f"{classification_report(y_pred, y_test)}")

Accuracy is: 0.755
              precision    recall  f1-score   support

           1       0.72      0.45      0.55        29
           2       0.00      0.00      0.00         0
           3       0.12      1.00      0.22         1
           4       0.31      0.50      0.38        16
           5       0.98      0.84      0.91       154

    accuracy                           0.76       200
   macro avg       0.43      0.56      0.41       200
weighted avg       0.89      0.76      0.81       200



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


#### Implementing Classification by K-Nearest Neighbors Method

In [55]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
print(f"Accuracy is: {accuracy_score(y_pred, y_test)}")
print(f"{classification_report(y_pred, y_test)}")

Accuracy is: 0.73
              precision    recall  f1-score   support

           1       0.89      0.48      0.63        33
           2       0.12      0.40      0.18         5
           3       0.12      0.08      0.10        13
           4       0.19      0.45      0.27        11
           5       0.93      0.88      0.91       138

    accuracy                           0.73       200
   macro avg       0.45      0.46      0.42       200
weighted avg       0.81      0.73      0.76       200



Implementing Clustering by KMeans

In [56]:
from sklearn.cluster import KMeans
dataset = ['A man is eating food.',
          'A man is eating a piece of bread.',
          'Horse is eating grass',
          'A man is eating pasta.',
          'A woman is eating Biryani.',
          'The girl is carrying a baby.',
          'The baby is carried by the woman.',
          'A man is riding a horse.',
          'A man is riding a white horse on an enclosed ground.',
          'A monkey is playing drums.',
          'Someone in a gorilla suit is playing a set of drums.',
          'A cheetah is running behind its prey.',
          'A cheetah chases its prey across a field.',
          'The cheetah is chasing a man who is riding a horse.',
          'man and woman with their baby are watching a cheetah in the zoo.'
]

# Embedding dataset for similarity
resp = openai.Embedding.create(
    input=dataset,
    engine="text-similarity-davinci-001" 
)

resp_embeddings = [d['embedding'] for d in resp['data']] 
# Normalizing
resp_embeddings = resp_embeddings / np.linalg.norm(resp_embeddings, axis=1, keepdims=True)

km_cl = KMeans(n_clusters=3)
km_cl.fit(resp_embeddings)
clusters = km_cl.labels_
print(clusters)

# Seperating dataset into clusters
clustered_dataset = {}
for sentence_id, cluster_id in enumerate(clusters):
  if cluster_id not in clustered_dataset:
    clustered_dataset[cluster_id] = []
  clustered_dataset[cluster_id].append(dataset[sentence_id])

print(clustered_dataset)

[0 0 0 0 0 2 2 0 1 1 1 1 1 1 1]
{0: ['A man is eating food.', 'A man is eating a piece of bread.', 'Horse is eating grass', 'A man is eating pasta.', 'A woman is eating Biryani.', 'A man is riding a horse.'], 2: ['The girl is carrying a baby.', 'The baby is carried by the woman.'], 1: ['A man is riding a white horse on an enclosed ground.', 'A monkey is playing drums.', 'Someone in a gorilla suit is playing a set of drums.', 'A cheetah is running behind its prey.', 'A cheetah chases its prey across a field.', 'The cheetah is chasing a man who is riding a horse.', 'man and woman with their baby are watching a cheetah in the zoo.']}


###### credits: https://www.youtube.com/watch?v=ld3YbhoJz9w&list=PLAMHV77MSKJ4QOIS86OiXMtb3-4TUUzho&index=6