# ChatGPT API: Get Embeddings

## The Association for Computational Linguistics
## WASSA 2023 Shared Task on Multi-Label and Multi-Class Emotion Classification on Code-Mixed Text Messages
See the [competition page on CodaLab](https://codalab.lisn.upsaclay.fr/competitions/10864#learn_the_details) for more details about the task.

In [141]:
import openai
import numpy as np
import pandas as pd
import sklearn
import re, os
import time
import zipfile, pickle
from typing import List
from copy import deepcopy
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression, RidgeClassifier, RidgeClassifierCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, multilabel_confusion_matrix
from openai.embeddings_utils import cosine_similarity
from tqdm.autonotebook import tqdm
import random
import tiktoken
import backoff
# Register tqdm's pandas integration (adds the `progress_*` methods such as
# `progress_apply` to pandas objects).
tqdm.pandas()

# Widen the notebook display limits: up to 100 columns and 400 rows are shown
# before pandas truncates the output.
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 400)
#os.path.join()

In [3]:
# Fixed seed value. It is not referenced in the cells visible here; presumably
# it is consumed by later modelling code for reproducibility (TODO confirm).
random_state = 47

# Load and Prepare Data

In [135]:
# Locations of the four inputs: the train and dev frames are stored as pickles,
# the test set and the sample submission as CSV files.
file1 = 'data/mcec_train_translated.pkl'
file2 = 'data/mcec_dev_translated.pkl'
file3 = 'data/mcec_test.csv'
file4 = 'data/sample_submission/predictions_MCEC.csv'

# Pickled frames.
df_train = pd.read_pickle(file1)
df_dev = pd.read_pickle(file2)

# CSV frames.
df_test = pd.read_csv(file3)
sample_submission = pd.read_csv(file4)

# Quick sanity check: print the (rows, columns) shape of every frame.
print(*(frame.shape for frame in (df_train, df_dev, df_test, sample_submission)))

(9530, 4) (1191, 11) (1191, 1) (1191, 1)


# OpenAI Embeddings

In [None]:
def get_embedding(text, model="text-embedding-ada-002"):
    """Request the embedding of a single text from the OpenAI Embedding endpoint.

    Args:
        text: The string to embed. Newline characters are replaced by spaces
            before the request is sent.
        model: Name of the embedding model passed to the API.

    Returns:
        The ``'embedding'`` entry of the first item in the response's
        ``'data'`` list.

    Raises:
        Whatever ``openai.Embedding.create`` raises; nothing is caught or
        retried here.
    """
    single_line = text.replace("\n", " ")
    response = openai.Embedding.create(input=[single_line], model=model)
    first_item = response['data'][0]
    return first_item['embedding']

# Build the `embeddings` cache (text -> embedding vector), requesting each
# distinct text only once.
#
# The cache covers the train, dev AND test texts: the cells further down map
# `df_dev['text']` and `df_test['Text']` through this same dictionary, and a
# text that is missing from it silently becomes NaN in the mapped column.
#df_train['embedding'] = df_train['text'].progress_apply(lambda x: get_embedding(x, model='text-embedding-ada-002'))
start  = time.time()
count1 = 0
embeddings = dict()
failed_texts = []  # texts whose request raised; reported once the loop is done

text_sources = (
    df_train['text'].values,
    df_dev['text'].values,
    df_test['Text'].values,
)
for texts in text_sources:
    for t in texts:
        if t in embeddings:
            continue
        try:
            embeddings[t] = get_embedding(t)
        except Exception as e:
            # Best effort: one bad text or transient API error must not abort
            # the whole run, but the failure is recorded instead of being
            # forgotten after the print.
            failed_texts.append(t)
            print(f'\nText: {t}. Error: {e}\n')

        # Progress counts attempted requests (successful or not).
        count1 += 1
        if count1 % 10 == 0:
            print(f'Processing text {count1}. Time elapsed: {round((time.time()-start)/60, 4)} min')
            # Optional checkpoint so an interrupted run does not lose its work:
            #with open('data/embeddings.pkl', 'wb') as f:
            #    pickle.dump(embeddings, f, protocol=pickle.HIGHEST_PROTOCOL)

# A text that failed once may have succeeded on a later duplicate; keep only
# the texts that are still missing, without repeats.
failed_texts = [t for t in dict.fromkeys(failed_texts) if t not in embeddings]

elapsed = (time.time() - start)/60
print(f'\nTime elapsed {round(elapsed, 4)} min')
if failed_texts:
    print(f'{len(failed_texts)} text(s) have no embedding and will map to NaN; see `failed_texts`.')

Processing text 10. Time elapsed: 0.1216 min
Processing text 20. Time elapsed: 0.2151 min
Processing text 30. Time elapsed: 0.312 min
Processing text 40. Time elapsed: 0.4083 min
Processing text 50. Time elapsed: 0.5004 min
Processing text 60. Time elapsed: 0.5927 min
Processing text 70. Time elapsed: 0.6933 min
Processing text 80. Time elapsed: 0.7939 min
Processing text 90. Time elapsed: 0.9227 min
Processing text 100. Time elapsed: 1.04 min
Processing text 110. Time elapsed: 1.1781 min
Processing text 120. Time elapsed: 1.3585 min
Processing text 130. Time elapsed: 1.5407 min
Processing text 140. Time elapsed: 1.6967 min
Processing text 150. Time elapsed: 1.8289 min
Processing text 160. Time elapsed: 1.9622 min
Processing text 170. Time elapsed: 2.0962 min
Processing text 180. Time elapsed: 2.2301 min
Processing text 190. Time elapsed: 2.3756 min
Processing text 200. Time elapsed: 2.5165 min
Processing text 210. Time elapsed: 2.7212 min


In [160]:
# Look up each training text in the `embeddings` cache and store the result in
# a new `gpt_embedding` column (texts absent from the cache become NaN).
df_train['gpt_embedding'] = df_train['text'].map(embeddings)

# Per-column count of missing values, to confirm every row got an embedding.
print(df_train.isnull().sum())

# Persist the enriched frame; this is the same path the training data was
# loaded from, so the file is overwritten.
file1 = 'data/mcec_train_translated.pkl'
pd.to_pickle(df_train, file1)

text                0
emotion             0
translated_hi       5
translated_ur       0
gpt_embedding       0
embedding        8803
dtype: int64


In [162]:
# Look up each dev text in the `embeddings` cache and store the result in a new
# `gpt_embedding` column (texts absent from the cache become NaN).
df_dev['gpt_embedding'] = df_dev['text'].map(embeddings)

# Per-column count of missing values, to confirm every row got an embedding.
print(df_dev.isnull().sum())

# Persist the enriched frame; this is the same path the dev data was loaded
# from, so the file is overwritten.
file2 = 'data/mcec_dev_translated.pkl'
pd.to_pickle(df_dev, file2)

text                         0
emotion                      0
target                       0
gtp_translated               0
translated_hi                0
translated_ur                0
text_clean                   0
gpt_pred                     0
gpt_pred_num                 0
gpt_translated2              0
gpt_translated2_corrected    0
gpt_embedding                0
dtype: int64


In [158]:
# Look up each test text in the `embeddings` cache and store the result in a
# new `gpt_embedding` column (texts absent from the cache become NaN). Note
# that the test frame names its text column `Text`, with a capital T.
df_test['gpt_embedding'] = df_test['Text'].map(embeddings)

# Per-column count of missing values, to confirm every row got an embedding.
print(df_test.isnull().sum())

# Persist the enriched test frame as a new pickle file.
file3 = 'data/mcec_test_embedded.pkl'
pd.to_pickle(df_test, file3)

Text             0
gpt_embedding    0
dtype: int64
