In [22]:
import ast
import json
import re

import numpy as np
import pandas as pd
import swifter

from src.ticket_messages import TicketMessage


In [2]:
# Load the classification dataset from its parquet location.
DATASET_PATH = "data/classification_dataset"
df = pd.read_parquet(DATASET_PATH)

In [3]:
# Peek at the last rows of the freshly loaded dataset.
df.tail()

Unnamed: 0,account_id,ticket_id,raw_body,channel,unix_timestamp,contact_reason,processed_body,email_sentence_embeddings
56167,47355,97704266,-7329393197999221857,email,1683747000.0,"""Pre-sale::Place an order""",7769862566240229096,"{""7076610397638084948"": [0.04829759523272514, ..."
56168,47355,97104492,-712526401448959206,email,1683821000.0,"""No Action Req::Others""",8225526187121867386,"{""-4707052099692602634"": [0.04418250918388367,..."
56169,47355,98114244,-6396575246795551664,email,1683823000.0,"""No Action Req::Others""",0,
56170,47355,98375668,8956876459394868202,email,1684164000.0,"""Tech Supp::Tech_Query""",4884952985295307657,"{""-6790423425691970518"": [-0.10473562777042389..."
56171,47355,98834917,3230166170796936168,email,1684271000.0,"""Shipping::Tracking""",1023372247625818182,"{""1824102601729972162"": [-0.02185063622891903,..."


In [4]:
# (rows, columns) of the loaded dataset.
df.shape

(56172, 8)

In [5]:
# Column dtypes and non-null counts per column.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 56172 entries, 0 to 56171
Data columns (total 8 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   account_id                 56172 non-null  object 
 1   ticket_id                  56172 non-null  object 
 2   raw_body                   56172 non-null  int64  
 3   channel                    56172 non-null  object 
 4   unix_timestamp             56172 non-null  float64
 5   contact_reason             56172 non-null  object 
 6   processed_body             56172 non-null  int64  
 7   email_sentence_embeddings  50719 non-null  object 
dtypes: float64(1), int64(2), object(5)
memory usage: 3.4+ MB


In [6]:
# Number of distinct contact-reason labels.
df.contact_reason.nunique()

1776

In [7]:
# Number of distinct ticket ids (compare with the row count to check for duplicates).
df.ticket_id.nunique()

56172

In [8]:
# Number of distinct accounts in the dataset.
df.account_id.nunique()

67

In [9]:
# Rows whose embeddings value is an empty string (as opposed to missing).
df[df["email_sentence_embeddings"] == ""]

Unnamed: 0,account_id,ticket_id,raw_body,channel,unix_timestamp,contact_reason,processed_body,email_sentence_embeddings


In [10]:
# Shape of the subset with missing embeddings; the first element is the number of such rows.
df[df["email_sentence_embeddings"].isna()].shape

(5453, 8)

In [11]:
# Number of distinct processed_body values among the rows with missing embeddings.
df[df["email_sentence_embeddings"].isna()].processed_body.nunique()

713

In [12]:
# For rows with missing embeddings: number of distinct contact reasons per
# processed_body value, sorted ascending (most ambiguous bodies appear last).
df[df["email_sentence_embeddings"].isna()].groupby("processed_body").contact_reason.nunique().sort_values()

processed_body
-9211444432197865303      1
 2506378290563214752      1
 2513959075871035925      1
 2542718104208950957      1
 2571804879116554174      1
                       ... 
 121320659668196706       9
-5442437062721334598     10
 8659924373827760716     33
 9200394132716385379    146
 0                      467
Name: contact_reason, Length: 713, dtype: int64

In [13]:
# Keep only the rows that have sentence embeddings.
# The explicit .copy() makes the result an independent frame: later cells assign
# columns on `df`, and doing that on a boolean-mask slice of the original frame
# triggers pandas' SettingWithCopyWarning.
df = df[df["email_sentence_embeddings"].notna()].copy()

In [14]:
def parse_embeddings(raw):
    """Decode one serialized embeddings value into a Python object.

    The stored strings look like JSON objects, so ``json.loads`` is tried first;
    it is typically far faster than ``ast.literal_eval`` (the literal_eval-only
    version of this cell took several minutes for ~50k rows). Anything JSON
    cannot handle falls back to ``ast.literal_eval``, so every input the
    previous implementation accepted is still accepted.

    Args:
        raw: Serialized value from the ``email_sentence_embeddings`` column,
            or ``None``.

    Returns:
        The decoded object, or ``None`` when ``raw`` is ``None``.

    Raises:
        ValueError, SyntaxError: If ``raw`` is neither valid JSON nor a valid
            Python literal (raised by ``ast.literal_eval``).
    """
    if raw is None:
        return raw
    try:
        return json.loads(raw)
    except (TypeError, ValueError):
        # json.JSONDecodeError is a ValueError; TypeError covers non-string input.
        return ast.literal_eval(raw)


df["email_sentence_embeddings"] = df["email_sentence_embeddings"].swifter.apply(parse_embeddings)

Pandas Apply: 100%|█████████████████████████████████████████████████████████████| 50719/50719 [06:25<00:00, 131.49it/s]


In [15]:
# Size of each row's decoded embeddings container, stored as the sentence count.
sentence_embeddings = df["email_sentence_embeddings"]
df["nb_sentences"] = sentence_embeddings.swifter.apply(len)

Pandas Apply: 100%|██████████████████████████████████████████████████████████| 50719/50719 [00:00<00:00, 539568.72it/s]


In [16]:
# Distribution of sentences per email; percentiles=[0.99] adds the 99th percentile to the summary.
df.nb_sentences.describe(percentiles=[0.99])

count    50719.000000
mean         2.306000
std          4.758009
min          1.000000
50%          1.000000
99%         16.000000
max        225.000000
Name: nb_sentences, dtype: float64

In [18]:
# Distribution of the number of distinct tickets per contact reason, including the 99th percentile.
df.groupby("contact_reason").ticket_id.nunique().describe(percentiles=[0.99])

count    1707.000000
mean       29.712361
std       105.859004
min         1.000000
50%         5.000000
99%       352.920000
max      1829.000000
Name: ticket_id, dtype: float64

In [29]:
# Scratch example: collapse any whitespace around the "::" level separator.
text = "Subscription:: Cancel"

separator_pattern = re.compile(r'\s*::\s*')
cleaned_text = separator_pattern.sub('::', text)

In [30]:
# Show the result of the separator clean-up on the example label.
print(cleaned_text)

Subscription::Cancel


In [38]:
# Normalise every contact-reason label: tighten whitespace around "::",
# drop double quotes, trim surrounding whitespace and lower-case.
separator_re = re.compile(r'\s*::\s*')
clean_classes = [
    separator_re.sub('::', reason).replace('"', '').strip().lower()
    for reason in df.contact_reason
]

In [39]:
# (number of distinct cleaned labels, total number of labels).
len(set(clean_classes)), len(clean_classes)

(1681, 50719)

In [41]:
# Print every distinct cleaned label, one per line.
# NOTE(review): a set has no guaranteed iteration order, so the listing order can
# differ between runs; consider iterating over sorted(set(clean_classes)) for a
# stable, scannable listing.
for c in set(clean_classes):
    print(c)

affiliate::wholesale::fit body boot camp
system::discount code issue
refunds & returns
order::i need to change my order
purchasing::international order::international quote request
rwp::disp
cancel request::too much
order arrived late
shipping::policy::international
store message::order question
received wrong product
damaged::defective
ambassador request
order confirmation/vat receipt
discount::newsletter
product information::product repair
wholesale::website/online account issue
post-purchase
amazon::bubbling mats
production::delayed order
sales support::product info::stk result query
site tech::complaints::other site/tech issues
other::thank you
hotjar::exit::price
sales support::social comment info only::social comment info only
problem::address change
place order::purchase order
order support::order status::query on product application after order
clean nutrition
fulfillment::wismo
wholesale::cancellation::full order
shipping - change my order (edit,sizes)
pre-sales::quote request

In [47]:
# Scratch check of how np.mean(axis=0) behaves on a dict of equal-length lists.
# numpy is imported in the notebook's import cell rather than here, so all
# dependencies are declared in one place.
data_dict = {
    'key1': [1, 2, 3, 4, 5],
}

# One row per dict key -> array of shape (number of keys, values per key).
values_array = np.array(list(data_dict.values()))

# axis=0 averages element-wise across the keys; with a single key the row comes
# back unchanged, converted to floats.
mean_value = np.mean(values_array, axis=0)

print(mean_value)

[1. 2. 3. 4. 5.]


In [50]:
# Confirm the array layout: (number of dict keys, values per key).
values_array.shape

(1, 5)

In [4]:
# Convert the dataframe into ticket messages via the project-local constructor;
# the result is indexed by position in the next cell.
# NOTE(review): the execution counter restarts at this cell, so it was presumably
# run in a fresh kernel against whatever `df` held at that point — confirm which
# preprocessing steps are expected to have been applied before this call.
ticket_message_list = TicketMessage.from_dataframe(dataframe=df)

In [5]:
# Inspect a single ticket message by position.
i = 13524
message = ticket_message_list[i]

print(f"Account id = {message.account_id}")
print(f"Ticket id = {message.ticket_id}")
print("Contact Reason = ", message.contact_reason)

# Empty or missing embeddings are reported as zero sentences.
embeddings = message.email_sentence_embeddings
print("Number of sentences in email = ", len(embeddings) if embeddings else 0)

Account id = 4453
Ticket id = 375526423
Contact Reason =  "Post-purchase::Other::Other"
Number of sentences in email =  3
