In [12]:
import pandas as pd
df = pd.read_parquet('olddata.parquet')

In [13]:
df.head()

Unnamed: 0,Subject,Sender,Date,Labels,Body,Replied To
0,"VINEET, Remember To Purchase Your Getaway Star...",Hilton Grand Vacations <hgv@travel2.hiltongran...,1683869000.0,"CATEGORY_PROMOTIONS, UNREAD, INBOX,",Your 3-Night Vacation Can Get You $100 Towards...,0
1,Spring Recital next Saturday May 20th at 5:00,Andrew Bushnell <fiddlersroof@outlook.com>,1683868000.0,"UNREAD, IMPORTANT, CATEGORY_PERSONAL, INBOX,",,0
2,[Class of 2028] Save the date - 7th Grade End ...,Tushar Gupta <m@mail1.veracross.com>,1683867000.0,"UNREAD, IMPORTANT, CATEGORY_UPDATES, INBOX,","Hello class of 2028 families, \r\nWe are plann...",0
3,[Mantri Celestia] Special Notice: Power Shutdo...,Mantri Celestia helpdesk <donotreply@apnacompl...,1683866000.0,"UNREAD, CATEGORY_UPDATES, INBOX,",,0
4,We're Giving You Up to 40% Off Sale Styles,Banana Republic <bananarepublic@email.bananare...,1683864000.0,"CATEGORY_PROMOTIONS, UNREAD, INBOX,",\r\n\r\nWe're Giving You Up to 40% Off Sale St...,0


In [14]:
#find all rows where the date is None
df[df['Date'].isnull()]

#delete these rows
df = df.dropna(subset=['Date'])

In [15]:
#print the number of Replied To emails
print(len(df[df['Replied To'] == True]))

870


In [17]:
df = df.groupby('Replied To').head(870)
df.value_counts('Replied To')

Replied To
0    870
1    870
Name: count, dtype: int64

In [18]:
#show 20 random emails
df.sample(20)

Unnamed: 0,Subject,Sender,Date,Labels,Body,Replied To
36120,"Fw: Khushboo - Inquiry Regarding Aryav, Sophomore",Khushboo Taneja <khushboo.taneja@live.com>,1655744000.0,"IMPORTANT, STARRED, CATEGORY_PERSONAL, INBOX,",\r\n________________________________\r\nFrom: ...,1
834,40% off all our favorite styles + an extra 20%,Banana Republic Factory <bananarepublicfactory...,1683195000.0,"CATEGORY_PROMOTIONS, UNREAD, INBOX,",\r\n\r\nBanana Republic Factory Store\r\nhttps...,0
36130,Fwd: Solomon Admissions Consulting - packages,Khushboo Taneja <khushboo.taneja@live.com>,1655738000.0,"IMPORTANT, CATEGORY_PERSONAL, INBOX,",,1
53789,Re: [Greystone]: Community Trip Saver App survey,Vibha Rathi <vibharathi@gmail.com>,1641589000.0,"IMPORTANT, CATEGORY_FORUMS, INBOX,","Dear neighbors,\r\nThanks for filling our our ...",1
493,There’s still time to ship for M-Day,DoorDash <no-reply@doordash.com>,1683473000.0,"CATEGORY_PROMOTIONS, UNREAD, INBOX,",DoorDash Get 50% off shipped flowers & sweets ...,0
7775,Fw: Permission form for March 12th MS Debate t...,Khushboo Taneja <khushboo.taneja@live.com>,1677642000.0,"IMPORTANT, CATEGORY_PERSONAL, INBOX,",,1
850,"Etsy, Inc. (ETSY) Q1 2023 Earnings Call Transc...",SA Analysis <account@seekingalpha.com>,1683165000.0,"UNREAD, CATEGORY_UPDATES, INBOX,","<!DOCTYPE html><html xmlns:v=""urn:schemas-micr...",0
520,,,1683451000.0,"CATEGORY_PROMOTIONS, UNREAD, INBOX,",Hidden bathroom remodeling costs to avoid. Plu...,1
53414,Professional Resume Services -- First Draft At...,Rene Hart <rene@professionalresumeservices.com>,1641944000.0,"IMPORTANT, CATEGORY_PERSONAL, INBOX,",,1
34355,"RE: GM at Taj Vivanta, Dwarka?",Anubhav Makhija <anubhav.makhija@tajhotels.com>,1657110000.0,"IMPORTANT, CATEGORY_PERSONAL, INBOX,",,1


In [19]:
#drop the labels column
df = df.drop(columns=['Labels'])

#show the datatypes
df.dtypes

Subject        object
Sender         object
Date          float64
Body           object
Replied To      int64
dtype: object

In [24]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers import TextVectorization, Normalization
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

numerical_data = df['Date']
labels = df['Replied To']

#concatenate all of the textual data into one string for each email
textual_data = 'Subject: ' + df['Subject'] + ' Body: ' + df['Body'] + ' Sender: ' + df['Sender']

#vectorize the textual data
vectorizer = TextVectorization(max_tokens=20000, output_sequence_length=200, output_mode='int')
vectorizer.adapt(textual_data)

#normalize the numerical data
normalizer = Normalization(axis=None)
normalizer.adapt(numerical_data)

features = tf.data.Dataset.from_tensor_slices((textual_data.values, numerical_data.values))
labels = tf.data.Dataset.from_tensor_slices(labels.values)
dataset = tf.data.Dataset.zip((features, labels))

dataset = dataset.shuffle(buffer_size=10000).batch(32)

#show the setup of the dataset
for batch in dataset.take(1):
    print(batch)

0        Subject: VINEET, Remember To Purchase Your Get...
1        Subject: Spring Recital next Saturday May 20th...
2        Subject: [Class of 2028] Save the date - 7th G...
3        Subject: [Mantri Celestia] Special Notice: Pow...
4        Subject: We're Giving You Up to 40% Off Sale S...
                               ...                        
57826    Subject: Re: ITR for AY 2021-22 Body: Dear Sir...
57888    Subject:  Body: Last chance to get up to 80% o...
57971    Subject:  Body: Up to 80% off ends soon! Shop ...
58063    Subject:  Body: On-sale upgrades for every roo...
58123    Subject: Re: ITR for AY 2021-22 Body:  Sender:...
Length: 1740, dtype: object


In [126]:
#build a model

text_input = keras.Input(shape=(1,), dtype=tf.string, name='text')
num_input = keras.Input(shape=(1,), dtype=tf.float32, name='num')

#make a large model, since the data in each email is large
embedding = layers.Embedding(input_dim=20000, output_dim=256, mask_zero=True)(vectorizer(text_input))
lstm = layers.LSTM(256)(embedding)
dense = layers.Dense(128, activation='relu')(num_input)
concat = layers.Concatenate()([lstm, dense])
output = layers.Dense(1, activation='sigmoid')(concat)

model = keras.Model(inputs=[text_input, num_input], outputs=output)

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

model.summary()

Model: "model_7"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 text (InputLayer)              [(None, 1)]          0           []                               
                                                                                                  
 text_vectorization_11 (TextVec  (None, 200)         0           ['text[0][0]']                   
 torization)                                                                                      
                                                                                                  
 embedding_14 (Embedding)       (None, 200, 256)     5120000     ['text_vectorization_11[4][0]']  
                                                                                                  
 num (InputLayer)               [(None, 1)]          0           []                         

In [21]:
model = keras.models.load_model('model.tf')

In [22]:
train = dataset.take(10000)
test = dataset.skip(10000)
val = test.skip(10000)

model.fit(
    train,
    validation_data=val,
    epochs=50,
    callbacks=[keras.callbacks.ReduceLROnPlateau(monitor='accuracy', patience=3, verbose=1, factor=.5)]
)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 8: ReduceLROnPlateau reducing learning rate to 1.907348723406699e-09.
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 11: ReduceLROnPlateau reducing learning rate to 9.536743617033494e-10.
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 14: ReduceLROnPlateau reducing learning rate to 4.768371808516747e-10.
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 17: ReduceLROnPlateau reducing learning rate to 2.3841859042583735e-10.
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 20: ReduceLROnPlateau reducing learning rate to 1.1920929521291868e-10.
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 23: ReduceLROnPlateau reducing learning rate to 5.960464760645934e-11.
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 26: ReduceLROnPlateau reducing learning rate to 2.980232380322967e-11.
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 29: ReduceLROnPlateau reducing learning rate to 1.4901161901614834e-11.
Epoch 30/50
Epoch 31/50
Epoch 32/50

KeyboardInterrupt: 

In [23]:
model.save('model')



INFO:tensorflow:Assets written to: model\assets


INFO:tensorflow:Assets written to: model\assets
