In [2]:
from dotenv import load_dotenv

import os

import openai
from openai.embeddings_utils import get_embedding
import pandas as pd
import tiktoken

# Pull variables from a local .env file into the process environment so the
# API key never has to be written into the notebook itself.
load_dotenv()
openai.api_key = os.environ.get("OPENAI_API_KEY")

# Name of the embedding model, and the largest token count a text may have
# to be kept for embedding (used as an upper bound when filtering rows).
embedding_model = "text-embedding-ada-002"
max_tokens = 8000

In [3]:
# Load olddata.parquet, discard rows that have no 'Replied To' value,
# and remove the 'Labels' column, in a single chained expression.
df = (
    pd.read_parquet('olddata.parquet')
    .dropna(subset=['Replied To'])
    .drop(columns=['Labels'])
)
df.head()

Unnamed: 0,Subject,Sender,Date,Body,Replied To
0,"VINEET, Remember To Purchase Your Getaway Star...",Hilton Grand Vacations <hgv@travel2.hiltongran...,1683869000.0,Your 3-Night Vacation Can Get You $100 Towards...,0
1,Spring Recital next Saturday May 20th at 5:00,Andrew Bushnell <fiddlersroof@outlook.com>,1683868000.0,,0
2,[Class of 2028] Save the date - 7th Grade End ...,Tushar Gupta <m@mail1.veracross.com>,1683867000.0,"Hello class of 2028 families, \r\nWe are plann...",0
3,[Mantri Celestia] Special Notice: Power Shutdo...,Mantri Celestia helpdesk <donotreply@apnacompl...,1683866000.0,,0
4,We're Giving You Up to 40% Off Sale Styles,Banana Republic <bananarepublic@email.bananare...,1683864000.0,\r\n\r\nWe're Giving You Up to 40% Off Sale St...,0


In [4]:
# Keep at most the first 870 rows of every distinct 'Replied To' value;
# rows keep their original relative order and index labels.
reply_groups = df.groupby('Replied To')
df = reply_groups.head(n=870)
df.head()

Unnamed: 0,Subject,Sender,Date,Body,Replied To
0,"VINEET, Remember To Purchase Your Getaway Star...",Hilton Grand Vacations <hgv@travel2.hiltongran...,1683869000.0,Your 3-Night Vacation Can Get You $100 Towards...,0
1,Spring Recital next Saturday May 20th at 5:00,Andrew Bushnell <fiddlersroof@outlook.com>,1683868000.0,,0
2,[Class of 2028] Save the date - 7th Grade End ...,Tushar Gupta <m@mail1.veracross.com>,1683867000.0,"Hello class of 2028 families, \r\nWe are plann...",0
3,[Mantri Celestia] Special Notice: Power Shutdo...,Mantri Celestia helpdesk <donotreply@apnacompl...,1683866000.0,,0
4,We're Giving You Up to 40% Off Sale Styles,Banana Republic <bananarepublic@email.bananare...,1683864000.0,\r\n\r\nWe're Giving You Up to 40% Off Sale St...,0


In [5]:
# Convert the Unix timestamp (seconds since the epoch) in 'Date' to a pandas
# datetime so it is human-readable.
#
# `df` is the result of a filtered selection (groupby(...).head(...)), so
# writing a column into it with `df['Date'] = ...` can raise
# SettingWithCopyWarning. `assign` returns a new frame with the replaced
# column instead of writing into the selection.
df = df.assign(Date=pd.to_datetime(df['Date'], unit='s'))

df.head()

Unnamed: 0,Subject,Sender,Date,Body,Replied To
0,"VINEET, Remember To Purchase Your Getaway Star...",Hilton Grand Vacations <hgv@travel2.hiltongran...,2023-05-12 05:19:17,Your 3-Night Vacation Can Get You $100 Towards...,0
1,Spring Recital next Saturday May 20th at 5:00,Andrew Bushnell <fiddlersroof@outlook.com>,2023-05-12 04:58:48,,0
2,[Class of 2028] Save the date - 7th Grade End ...,Tushar Gupta <m@mail1.veracross.com>,2023-05-12 04:47:09,"Hello class of 2028 families, \r\nWe are plann...",0
3,[Mantri Celestia] Special Notice: Power Shutdo...,Mantri Celestia helpdesk <donotreply@apnacompl...,2023-05-12 04:36:01,,0
4,We're Giving You Up to 40% Off Sale Styles,Banana Republic <bananarepublic@email.bananare...,2023-05-12 04:01:26,\r\n\r\nWe're Giving You Up to 40% Off Sale St...,0


In [6]:
# Keep only the columns used from here on. The explicit .copy() makes `df`
# an independent frame, so adding columns below does not write into a slice
# of the previous frame (which can raise SettingWithCopyWarning).
df = df[['Subject', 'Sender', 'Date', 'Body', 'Replied To']].copy()

# Build the text that will be embedded: subject, sender and body of each row
# joined into one string ('Date' is deliberately not part of it).
# Missing values are treated as empty strings so a row without a body (or
# subject/sender) no longer raises AttributeError on .strip(), and the
# column is created even when the frame is empty.
text_parts = {
    field: df[field].fillna('').astype(str).str.strip()
    for field in ('Subject', 'Sender', 'Body')
}
df['combined'] = (
    'Subject: ' + text_parts['Subject']
    + '; Sender: ' + text_parts['Sender']
    + '; Body: ' + text_parts['Body']
)

df.head()

Unnamed: 0,Subject,Sender,Date,Body,Replied To,combined
0,"VINEET, Remember To Purchase Your Getaway Star...",Hilton Grand Vacations <hgv@travel2.hiltongran...,2023-05-12 05:19:17,Your 3-Night Vacation Can Get You $100 Towards...,0,"Subject: VINEET, Remember To Purchase Your Get..."
1,Spring Recital next Saturday May 20th at 5:00,Andrew Bushnell <fiddlersroof@outlook.com>,2023-05-12 04:58:48,,0,Subject: Spring Recital next Saturday May 20th...
2,[Class of 2028] Save the date - 7th Grade End ...,Tushar Gupta <m@mail1.veracross.com>,2023-05-12 04:47:09,"Hello class of 2028 families, \r\nWe are plann...",0,Subject: [Class of 2028] Save the date - 7th G...
3,[Mantri Celestia] Special Notice: Power Shutdo...,Mantri Celestia helpdesk <donotreply@apnacompl...,2023-05-12 04:36:01,,0,Subject: [Mantri Celestia] Special Notice: Pow...
4,We're Giving You Up to 40% Off Sale Styles,Banana Republic <bananarepublic@email.bananare...,2023-05-12 04:01:26,\r\n\r\nWe're Giving You Up to 40% Off Sale St...,0,Subject: We're Giving You Up to 40% Off Sale S...


In [7]:
# Tokenizer that matches the embedding model, used to measure each text.
encoding = tiktoken.encoding_for_model(embedding_model)

# Count the tokens of every combined text. `assign` returns a new frame, so
# the count is not written into a slice of an earlier frame.
token_counts = df["combined"].map(lambda text: len(encoding.encode(text)))
df = df.assign(n_tokens=token_counts)

# Keep only rows whose text has at most `max_tokens` tokens. The .copy()
# makes the filtered result an independent frame, so columns added in later
# cells (e.g. the embeddings) do not trigger SettingWithCopyWarning.
df = df[df["n_tokens"] <= max_tokens].copy()
len(df)

1457

In [65]:
def _embed(text):
    """Return the embedding of ``text`` produced by ``embedding_model``."""
    return get_embedding(text, engine=embedding_model)


# One get_embedding call per row; the result is stored next to its text.
df["embedding"] = df["combined"].map(_embed)

In [66]:
# Show the first five rows to inspect the new 'embedding' column.
df.head(n=5)

Unnamed: 0,Subject,Sender,Date,Body,Replied To,combined,n_tokens,embedding
0,"VINEET, Remember To Purchase Your Getaway Star...",Hilton Grand Vacations <hgv@travel2.hiltongran...,1683869000.0,Your 3-Night Vacation Can Get You $100 Towards...,0,"Subject: VINEET, Remember To Purchase Your Get...",3705,"[-0.008335459977388382, -0.007523390464484692,..."
1,Spring Recital next Saturday May 20th at 5:00,Andrew Bushnell <fiddlersroof@outlook.com>,1683868000.0,,0,Subject: Spring Recital next Saturday May 20th...,35,"[-0.03298372030258179, 0.0026079996023327112, ..."
2,[Class of 2028] Save the date - 7th Grade End ...,Tushar Gupta <m@mail1.veracross.com>,1683867000.0,"Hello class of 2028 families, \r\nWe are plann...",0,Subject: [Class of 2028] Save the date - 7th G...,155,"[-0.008779299445450306, -0.004117605742067099,..."
3,[Mantri Celestia] Special Notice: Power Shutdo...,Mantri Celestia helpdesk <donotreply@apnacompl...,1683866000.0,,0,Subject: [Mantri Celestia] Special Notice: Pow...,48,"[-0.00916092749685049, -0.021061832085251808, ..."
4,We're Giving You Up to 40% Off Sale Styles,Banana Republic <bananarepublic@email.bananare...,1683864000.0,\r\n\r\nWe're Giving You Up to 40% Off Sale St...,0,Subject: We're Giving You Up to 40% Off Sale S...,5416,"[-0.046990904957056046, -0.01633710041642189, ..."


In [67]:
# 'combined' and 'n_tokens' were only needed to produce the embeddings;
# remove them from the frame.
helper_columns = ['combined', 'n_tokens']
df = df.drop(helper_columns, axis='columns')

In [68]:
# Show the first five rows of the current frame.
df.head(n=5)

Unnamed: 0,Subject,Sender,Date,Body,Replied To,embedding
0,"VINEET, Remember To Purchase Your Getaway Star...",Hilton Grand Vacations <hgv@travel2.hiltongran...,1683869000.0,Your 3-Night Vacation Can Get You $100 Towards...,0,"[-0.008335459977388382, -0.007523390464484692,..."
1,Spring Recital next Saturday May 20th at 5:00,Andrew Bushnell <fiddlersroof@outlook.com>,1683868000.0,,0,"[-0.03298372030258179, 0.0026079996023327112, ..."
2,[Class of 2028] Save the date - 7th Grade End ...,Tushar Gupta <m@mail1.veracross.com>,1683867000.0,"Hello class of 2028 families, \r\nWe are plann...",0,"[-0.008779299445450306, -0.004117605742067099,..."
3,[Mantri Celestia] Special Notice: Power Shutdo...,Mantri Celestia helpdesk <donotreply@apnacompl...,1683866000.0,,0,"[-0.00916092749685049, -0.021061832085251808, ..."
4,We're Giving You Up to 40% Off Sale Styles,Banana Republic <bananarepublic@email.bananare...,1683864000.0,\r\n\r\nWe're Giving You Up to 40% Off Sale St...,0,"[-0.046990904957056046, -0.01633710041642189, ..."


In [72]:
# Give the 'Replied To' column a name without a space: 'RepliedTo'.
column_renames = {'Replied To': 'RepliedTo'}
df = df.rename(column_renames, axis='columns')
df.head()

Unnamed: 0,Subject,Sender,Date,Body,RepliedTo,embedding
0,"VINEET, Remember To Purchase Your Getaway Star...",Hilton Grand Vacations <hgv@travel2.hiltongran...,1683869000.0,Your 3-Night Vacation Can Get You $100 Towards...,0,"[-0.008335459977388382, -0.007523390464484692,..."
1,Spring Recital next Saturday May 20th at 5:00,Andrew Bushnell <fiddlersroof@outlook.com>,1683868000.0,,0,"[-0.03298372030258179, 0.0026079996023327112, ..."
2,[Class of 2028] Save the date - 7th Grade End ...,Tushar Gupta <m@mail1.veracross.com>,1683867000.0,"Hello class of 2028 families, \r\nWe are plann...",0,"[-0.008779299445450306, -0.004117605742067099,..."
3,[Mantri Celestia] Special Notice: Power Shutdo...,Mantri Celestia helpdesk <donotreply@apnacompl...,1683866000.0,,0,"[-0.00916092749685049, -0.021061832085251808, ..."
4,We're Giving You Up to 40% Off Sale Styles,Banana Republic <bananarepublic@email.bananare...,1683864000.0,\r\n\r\nWe're Giving You Up to 40% Off Sale St...,0,"[-0.046990904957056046, -0.01633710041642189, ..."


In [73]:
# Write the current frame to data.parquet.
output_path = 'data.parquet'
df.to_parquet(output_path)