In [1]:
import pandas as pd
import json

In [2]:
tickets_df = pd.read_json("../data/technical_test_data.json", orient='records')
tickets_df = tickets_df.drop(columns="id")
tickets_df.head()

Unnamed: 0,Description,tags,created_at,cc_emails,fwd_emails,reply_cc_emails,ticket_cc_emails,fr_escalated,spam,email_config_id,...,priority,requester_id,responder_id,source,company_id,status,subject,association_type,support_email,to_emails
0,"- __EMAIL__ Hi , I have just ordered a pair of...","[category-1, missing-items, field-1456]",2020-03-15,[],[],[],[],False,False,750498,...,1,30,304,1,,5,Customer Request,,digitalgenius.com,['support@digitalgenius.com']
1,I am missing a pair of shoes from my order. Co...,"[missing-items, open-ticket]",2021-01-18,[],[],[],[],False,False,750498,...,1,30,304,1,,5,Customer Request,,digitalgenius.com,['support@digitalgenius.com']
2,I didn'tget a my order - __EMAIL__,"[category-1, missing-items]",2019-08-01,[],[],[],[],False,False,750498,...,1,30,304,1,,5,Customer Request,,digitalgenius.com,['support@digitalgenius.com']
3,"Hello, I ordered two __PRODUCTS_NAMES and one ...","[category-1, missing-items, open-ticket, field...",2020-08-27,[],[],[],[],False,False,750498,...,1,30,304,1,,5,Customer Request,,digitalgenius.com,['support@digitalgenius.com']
4,My shipment never was delivered. The tracking ...,"[category-1, missing-items, open-ticket]",2019-07-21,[],[],[],[],False,False,750498,...,1,30,304,1,,5,Customer Request,,digitalgenius.com,['support@digitalgenius.com']


In [3]:
# Convert list valued columns to tuples for the next preprocessing step
# In order to call pd.Dataframe.unique, values need to be hashable, and so we are converting lists into tuples
list_valued_columns = ['tags', 'cc_emails', 'fwd_emails', 'reply_cc_emails', 'ticket_cc_emails', 'to_emails']
for col in list_valued_columns:  
    tickets_df[col] = tickets_df[col].apply(tuple)
tickets_df.head()

Unnamed: 0,Description,tags,created_at,cc_emails,fwd_emails,reply_cc_emails,ticket_cc_emails,fr_escalated,spam,email_config_id,...,priority,requester_id,responder_id,source,company_id,status,subject,association_type,support_email,to_emails
0,"- __EMAIL__ Hi , I have just ordered a pair of...","(category-1, missing-items, field-1456)",2020-03-15,"([, ])","([, ])","([, ])","([, ])",False,False,750498,...,1,30,304,1,,5,Customer Request,,digitalgenius.com,"([, ', s, u, p, p, o, r, t, @, d, i, g, i, t, ..."
1,I am missing a pair of shoes from my order. Co...,"(missing-items, open-ticket)",2021-01-18,"([, ])","([, ])","([, ])","([, ])",False,False,750498,...,1,30,304,1,,5,Customer Request,,digitalgenius.com,"([, ', s, u, p, p, o, r, t, @, d, i, g, i, t, ..."
2,I didn'tget a my order - __EMAIL__,"(category-1, missing-items)",2019-08-01,"([, ])","([, ])","([, ])","([, ])",False,False,750498,...,1,30,304,1,,5,Customer Request,,digitalgenius.com,"([, ', s, u, p, p, o, r, t, @, d, i, g, i, t, ..."
3,"Hello, I ordered two __PRODUCTS_NAMES and one ...","(category-1, missing-items, open-ticket, field...",2020-08-27,"([, ])","([, ])","([, ])","([, ])",False,False,750498,...,1,30,304,1,,5,Customer Request,,digitalgenius.com,"([, ', s, u, p, p, o, r, t, @, d, i, g, i, t, ..."
4,My shipment never was delivered. The tracking ...,"(category-1, missing-items, open-ticket)",2019-07-21,"([, ])","([, ])","([, ])","([, ])",False,False,750498,...,1,30,304,1,,5,Customer Request,,digitalgenius.com,"([, ', s, u, p, p, o, r, t, @, d, i, g, i, t, ..."


In [4]:
# Drop all columns that are single-valued (i.e. only one value is present in that particular columns for all rows)
# These columns do not provide any additional information in our classification process
redundant_columns = []
# For each column in the dataframe, check if it is single-valued and if yes, append it to the redundant_columns list
for col in tickets_df.columns:
    if len(tickets_df[col].unique()) == 1:
        redundant_columns.append(col)
# Drop the columns
tickets_df = tickets_df.drop(columns=redundant_columns)
tickets_df.head()

Unnamed: 0,Description,tags,created_at
0,"- __EMAIL__ Hi , I have just ordered a pair of...","(category-1, missing-items, field-1456)",2020-03-15
1,I am missing a pair of shoes from my order. Co...,"(missing-items, open-ticket)",2021-01-18
2,I didn'tget a my order - __EMAIL__,"(category-1, missing-items)",2019-08-01
3,"Hello, I ordered two __PRODUCTS_NAMES and one ...","(category-1, missing-items, open-ticket, field...",2020-08-27
4,My shipment never was delivered. The tracking ...,"(category-1, missing-items, open-ticket)",2019-07-21


In [5]:
# The created_at column is not useful for what we need, so we remove it as well.
tickets_df = tickets_df.drop(columns='created_at')
tickets_df.head()

Unnamed: 0,Description,tags
0,"- __EMAIL__ Hi , I have just ordered a pair of...","(category-1, missing-items, field-1456)"
1,I am missing a pair of shoes from my order. Co...,"(missing-items, open-ticket)"
2,I didn'tget a my order - __EMAIL__,"(category-1, missing-items)"
3,"Hello, I ordered two __PRODUCTS_NAMES and one ...","(category-1, missing-items, open-ticket, field..."
4,My shipment never was delivered. The tracking ...,"(category-1, missing-items, open-ticket)"


### Finding the right target variable

We now have the most relevant columns: the actual ticket message and the assigned tags. \
What we need to do now is to find a way to extract the target variable from the tags column. \
I will split the tags column into multiple columns. I will actually one-hot-encode this column in order to manually check what these tags tell us about the messages.

In [6]:
# Obtain the list of all the tags
all_tags = tickets_df["tags"].apply(list).to_list()
all_tags = [element for sublist in all_tags for element in sublist]
all_tags

['category-1',
 'missing-items',
 'field-1456',
 'missing-items',
 'open-ticket',
 'category-1',
 'missing-items',
 'category-1',
 'missing-items',
 'open-ticket',
 'field-1456',
 'category-1',
 'missing-items',
 'open-ticket',
 'missing-items',
 'field-1456',
 'category-1',
 'missing-items',
 'field-1456',
 'ticket',
 'missing-items',
 'open-ticket',
 'category-1',
 'missing-items',
 'open-ticket',
 'field-1456',
 'category-1',
 'missing-items',
 'category-1',
 'missing-items',
 'field-1456',
 'missing-items',
 'field-1456',
 'category-1',
 'missing-items',
 'category-1',
 'ticket',
 'missing-items',
 'field-1456',
 'missing-items',
 'open-ticket',
 'field-1456',
 'missing-items',
 'open-ticket',
 'missing-items',
 'open-ticket',
 'field-1456',
 'category-1',
 'missing-items',
 'open-ticket',
 'ticket',
 'missing-items',
 'open-ticket',
 'category-1',
 'where-is-my-order',
 'field-1456',
 'category-1',
 'where-is-my-order',
 'open-ticket',
 'category-1',
 'where-is-my-order',
 'where-

In [7]:
# Get the unique tags
unique_tags = list(set(all_tags))
unique_tags

['add-item',
 'not-a-request',
 'open-ticket',
 'lost-package',
 'promocode-not-working',
 'exchange',
 'arrived-damaged',
 'adverse-effect',
 'order-confirmation-not-received',
 'discounts-questions',
 'wrong-item-delivered',
 'return-label',
 'remove-item',
 'change-delivery-address',
 'shipping-price',
 'where-is-my-order',
 'cancel-subscription',
 'field-1456',
 '?',
 'language',
 'how-to-return',
 'warranty-claim-status',
 'price-adjustment',
 'cancel-order',
 'order-confirmation-not-received,-return-questions-',
 'ticket',
 'missing-items',
 'warranty-policy-information',
 'category-1',
 'faulty-product',
 'change-delivery-date',
 'other',
 'change-items',
 'update-account-information',
 'donation-requests',
 'return-status']

In [8]:
# Create columns for all the unique tags, with an initial value 0
# Value 0 means that the message does not contain the specific tag
# Value 1 means that the message contains the specific tag
tickets_df[unique_tags] = 0
tickets_df.head()

Unnamed: 0,Description,tags,add-item,not-a-request,open-ticket,lost-package,promocode-not-working,exchange,arrived-damaged,adverse-effect,...,missing-items,warranty-policy-information,category-1,faulty-product,change-delivery-date,other,change-items,update-account-information,donation-requests,return-status
0,"- __EMAIL__ Hi , I have just ordered a pair of...","(category-1, missing-items, field-1456)",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,I am missing a pair of shoes from my order. Co...,"(missing-items, open-ticket)",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,I didn'tget a my order - __EMAIL__,"(category-1, missing-items)",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,"Hello, I ordered two __PRODUCTS_NAMES and one ...","(category-1, missing-items, open-ticket, field...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,My shipment never was delivered. The tracking ...,"(category-1, missing-items, open-ticket)",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
# For every unique tag, populate its column if present in the message
for tag in unique_tags:
    tickets_df[tag] = tickets_df['tags'].apply(lambda tags_tuple: 1 if tag in tags_tuple else 0)
tickets_df.head()

Unnamed: 0,Description,tags,add-item,not-a-request,open-ticket,lost-package,promocode-not-working,exchange,arrived-damaged,adverse-effect,...,missing-items,warranty-policy-information,category-1,faulty-product,change-delivery-date,other,change-items,update-account-information,donation-requests,return-status
0,"- __EMAIL__ Hi , I have just ordered a pair of...","(category-1, missing-items, field-1456)",0,0,0,0,0,0,0,0,...,1,0,1,0,0,0,0,0,0,0
1,I am missing a pair of shoes from my order. Co...,"(missing-items, open-ticket)",0,0,1,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
2,I didn'tget a my order - __EMAIL__,"(category-1, missing-items)",0,0,0,0,0,0,0,0,...,1,0,1,0,0,0,0,0,0,0
3,"Hello, I ordered two __PRODUCTS_NAMES and one ...","(category-1, missing-items, open-ticket, field...",0,0,1,0,0,0,0,0,...,1,0,1,0,0,0,0,0,0,0
4,My shipment never was delivered. The tracking ...,"(category-1, missing-items, open-ticket)",0,0,1,0,0,0,0,0,...,1,0,1,0,0,0,0,0,0,0


In [10]:
# Drop tags column, as it is not necessary anymore
tickets_df = tickets_df.drop(columns='tags')
tickets_df.head()

Unnamed: 0,Description,add-item,not-a-request,open-ticket,lost-package,promocode-not-working,exchange,arrived-damaged,adverse-effect,order-confirmation-not-received,...,missing-items,warranty-policy-information,category-1,faulty-product,change-delivery-date,other,change-items,update-account-information,donation-requests,return-status
0,"- __EMAIL__ Hi , I have just ordered a pair of...",0,0,0,0,0,0,0,0,0,...,1,0,1,0,0,0,0,0,0,0
1,I am missing a pair of shoes from my order. Co...,0,0,1,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
2,I didn'tget a my order - __EMAIL__,0,0,0,0,0,0,0,0,0,...,1,0,1,0,0,0,0,0,0,0
3,"Hello, I ordered two __PRODUCTS_NAMES and one ...",0,0,1,0,0,0,0,0,0,...,1,0,1,0,0,0,0,0,0,0
4,My shipment never was delivered. The tracking ...,0,0,1,0,0,0,0,0,0,...,1,0,1,0,0,0,0,0,0,0


In [11]:
# Let's see how many messages we have for each tag
tags_count = tickets_df.drop(columns='Description').sum()
tags_count

add-item                                                3
not-a-request                                          27
open-ticket                                           700
lost-package                                            9
promocode-not-working                                 159
exchange                                              150
arrived-damaged                                         1
adverse-effect                                          1
order-confirmation-not-received                        86
discounts-questions                                   150
wrong-item-delivered                                   23
return-label                                           57
remove-item                                             2
change-delivery-address                                19
shipping-price                                         11
where-is-my-order                                     150
cancel-subscription                                    12
field-1456    

For most of these tag categories, I can just take a look and label them manually. \
For categories that have more than 30 messages, we will sample at most 10% from them (to guarantee randomness) and again check manually. If the labeling for each tag will be consistent, I will label all the messages containing that tag the same. \
For clashing tags, if this will come to happen, I will take a closer look at the combination of tags.

In [12]:
# In order to see the whole message, we need to set the pandas column width to unlimited
pd.set_option('display.max_colwidth', None)

In [13]:
for tag in unique_tags:
    if tickets_df[tag].sum() < 30:
        print(tag)
        print(tickets_df.loc[tickets_df[tag] == 1]["Description"])
        print("-"*20)

add-item
1348                                                                                                                                                                                                                                                                                                            \n\nSo i placed an order for 2 pairs and now, i get an email for a free backpack. Is there a way that i can get it added to my order? \n\nMy code is MYWELCOMEPACK\n\nOrder #__ORDER_NUMBER__
1349    Hello,I just completed my order for the __COMPANY__ running shoes ( order# __ORDER_NUMBER__). As soon as I put my payment information in, it completed the order before I could enter the cyber Monday promo code. Could you please include the tote bag with my order please. Would it also be possible to include the elastic non tie bands with my shoes please? I ordered shoes from you in the past, and the non tie laces were not included.Thank you very much, __NAME__Cyber Monday Tote gift
135

By analyzing the above messages we can conclude that:
1. Messages having the tag missing-items have the following distribution (by id): (2, 4, 5, 11, 12, 17) -> label 1, (0, 1, 3, 6, 7, 8, 9, 10, 13, 14, 15, 16, 18) -> label 0
2. The message tagged with language should be translate, and labeled as 1.
3. The message tagged with order-confirmation-not-received,-return-questions- can be labeled with 1.
4. All messages having the tag return-status can be labeled with 0.
5. All messages having the tag change-items can be labeled with 0.
6. All messages having the tag shipping-price can be labeled with 0.
7. Messages having the tag ? have the following distribution (by id): (594, 597, 598, 599, 600, 601, 602, 603, 605, 606) -> label 1, (595, 596, 604) -> label 0
8. All messages having the tag change-delivery-date can be labeled with 0.
9. All messages having the tag donation-requests can be labeled with 0.
10. All messages having the tag add-item can be labeled with 0.
11. All messages having the tag cancel-order can be labeled with 0.
12. All messages having the tag lost-package can be labeled with 0.
13. The message tagged with arrived-damaged can be labeled with 0.
14. All messages having the tag warranty-policy-information can be labeled with 0.
15. All messages having the tag wrong-item-delivered can be labeled with 0.
16. All messages having the tag change-delivery-address can be labeled with 0.
17. All messages having the tag cancel-subscription can be labeled with 0.
18. All messages having the tag not-a-request can be labeled with 0. Except for message with id 417 that should be translated and labeled with 1.
19. All messages having the tag update-account-information can be labeled with 0.
20. The message tagged with adverse-effect can be labeled with 0.
21. All messages having the tag remove-item can be labeled with 0.

What we can observe from the few tickets we have read is that in our preprocessing, we will need to check if a request is in another language, and translate it accordingly if so.

Now I can label the messages based on what I have written above. For this I will define a new column in our dataframe named
is_about_order_status. Messages that are related to the status of the order will have a 1 in this column, the rest will have this value set to 0. this will be our target variable on which we will base our classification.

Also, since we manually checked each of these items, we can say that we will not change their target variable. Therefore, I will sample only from the unchecked tickets.

In [14]:
# Define target variable and initialize it to 0 for all rows.
tickets_df["is_about_order_status"] = 0
tickets_df.head()

Unnamed: 0,Description,add-item,not-a-request,open-ticket,lost-package,promocode-not-working,exchange,arrived-damaged,adverse-effect,order-confirmation-not-received,...,warranty-policy-information,category-1,faulty-product,change-delivery-date,other,change-items,update-account-information,donation-requests,return-status,is_about_order_status
0,"- __EMAIL__ Hi , I have just ordered a pair of __PRODUCT_NAME__ and put the relevant code for free __COMPANY__ bag as new customer . I have received trainers but no bag . Will this be sent separately? \n Seems to be an inefficient system, or was this an oversight?\n Kind regards\n __NAME__",0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
1,I am missing a pair of shoes from my order. Could someone please call me asap. I can't wait any longer. We leave this Sunday. Thanks. __NAME__,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,I didn'tget a my order - __EMAIL__,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
3,"Hello, I ordered two __PRODUCTS_NAMES and one __PRODUCTS_NAMES_, but only received the two __PRODUCTS_NAMES__ Could you please let me know why the __PRODUCTS_NAMES__ were not sent and when am i likely to received them. My order number is: __ORDER_NUMBER__\n \n\n Regards,\n __NAME__",0,0,1,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
4,My shipment never was delivered. The tracking says it was delivered at 10:23 on Friday but was never received. We have a front desk and should have been received.\n \n\n Please let me know what I need to do to proceed. I have a race in a week,0,0,1,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0


In [16]:
# Change target variable to 1 based on our findings above (stated in the MD cell)
tickets_df.loc[[2, 4, 5, 11, 12, 17, 594, 597, 598, 599, 600, 601, 602, 603, 605, 606, 417], "is_about_order_status"] = 1
tickets_df.loc[[2, 4, 5, 11, 12, 17, 594, 597, 598, 599, 600, 601, 602, 603, 605, 606, 417], "is_about_order_status"]

2      1
4      1
5      1
11     1
12     1
17     1
594    1
597    1
598    1
599    1
600    1
601    1
602    1
603    1
605    1
606    1
417    1
Name: is_about_order_status, dtype: int64

In [19]:
# Change target variable to 1 based on our findings above (stated in the MD cell)
tickets_df.loc[tickets_df["order-confirmation-not-received,-return-questions-"] == 1, "is_about_order_status"] = 1
tickets_df.loc[tickets_df["language"] == 1, "is_about_order_status"] = 1
print(tickets_df.loc[tickets_df["order-confirmation-not-received,-return-questions-"] == 1]["is_about_order_status"], 
      tickets_df.loc[tickets_df["language"] == 1]["is_about_order_status"])

593    1
Name: is_about_order_status, dtype: int64 331    1
Name: is_about_order_status, dtype: int64


In [39]:
# Get uncheked tickets
unchecked_tickets_df = tickets_df
less_frequent_tags = tags_count.loc[tags_count < 30].index.values
for tag in less_frequent_tags:
    # Retain only tickets that have not been checked
    unchecked_tickets_df = unchecked_tickets_df.loc[unchecked_tickets_df[tag] == 0]
print(unchecked_tickets_df.shape)

(1202, 38)


For all the tags that have tag-count > 30, I will sample 10% and fill the is_about_order_status variable accordingly.
If for some samples the target variable is not consistent, I will look more into the tag and decide how to continue.

In [20]:
# Get a list with all tags that have more than 30 messages
frequent_tags = tags_count[tags_count >= 30].index.values
frequent_tags

array(['open-ticket', 'promocode-not-working', 'exchange',
       'order-confirmation-not-received', 'discounts-questions',
       'return-label', 'where-is-my-order', 'field-1456', 'how-to-return',
       'warranty-claim-status', 'price-adjustment', 'ticket',
       'category-1', 'faulty-product', 'other'], dtype=object)

In [33]:
# Some of the tags will have more than 50 items, even if we just sample 10% of the population
# In order to not waste time, I will look at a maximum of 40 items per tag and decide what to do with them.
# Therefore, I will set the number of rows to display to a max of 40.
pd.set_option('display.max_rows', 30)
pd.set_option('display.min_rows', None)

In [34]:
for tag in frequent_tags:
    tag_sample = unchecked_tickets_df.loc[unchecked_tickets_df[tag] == 1].sample(frac=0.1)
    print(tag)
    print(tag_sample["Description"])
    print("-"*20)

open-ticket
1296                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        

Based on the samples I have randomly drawn, I can conclude the following:

1. Messages tagged with open-ticket tag contain both ticket types, so no conclusion can be drawn from them.
2. Messages tagged with promocode-not-working tag can be labeled with 0.
3. Messages tagged with exchange tag can be labeled with 0.
3. Messages tagged with order-confirmation-not-received tag can be labeled with 0.
4. Messages tagged with discounts-questions tag can be labeled with 0.
5. Messages tagged with return-label tag can be labeled with 0.
6. Messages tagged with where-is-my-order tag can be labeled with 1.
7. Messages tagged with field-1456 tag contain both ticket types, so no conclusion can be drawn from them.
8. Messages tagged with how-to-return tag can be labeled with 0.
9. Messages tagged with warranty-claim-status tag can be labeled with 0.
10. Messages tagged with price-adjustment tag can be labeled with 0.
11. Messages tagged with ticket tag contain both ticket types, so no conclusion can be drawn from them.
12. Messages tagged with category-1 tag contain both ticket types, so no conclusion can be drawn from them.
13. Messages tagged with faulty-product tag can be labeled with 0.
14. Messages tagged with other tag contain both ticket types, so no conclusion can be drawn from them.

In [35]:
# Label tickets having 'where-is-my-order' tag correctly
tickets_df.loc[tickets_df['where-is-my-order'] == 1, "is_about_order_status"] = 1
tickets_df.loc[tickets_df['where-is-my-order'] == 1]["is_about_order_status"].value_counts()

1    150
Name: is_about_order_status, dtype: int64

Now I want to see how many unchecked tickets we have remaining.

In [40]:
# Get uncheked tickets
checked_tags = list(set(frequent_tags) - {'open-ticket', 'field-1456', 'ticket', 'category-1', 'other'})
for tag in checked_tags:
    # Retain only tickets that have not been checked
    unchecked_tickets_df = unchecked_tickets_df.loc[unchecked_tickets_df[tag] == 0]
print(unchecked_tickets_df.shape)

(127, 38)


With only 127 tickets remaining, we can check them by hand and classify them. Then we can rest assured that our initial dataset is correctly labeled.
I will check for them in 3 batches of 43.

In [42]:
# Again set pandas options so that we can see all rows.
pd.set_option('display.max_rows', 45)

In [44]:
# First batch
print(unchecked_tickets_df.iloc[0:43]["Description"])

466                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                          Hi. What is the cost of the delivery to __PLACE__?\n \n\n Thanks!
467                                                                                                                                      

Ids: (466-470, 472-476, 481-483, 487, 488, 490, 491, 496, 498-500, 502-507)
should be labeled with 0

Ids: (471, 477-480, 484-486, 489, 492-495, 497, 501)
should be labeled with 1

In [46]:
# Second batch
print(unchecked_tickets_df.iloc[43:86]["Description"])

509                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     

Ids: (509-551)
should be labeled with 0

Ids: ()
should be labeled with 1

In [48]:
# Third batch
print(unchecked_tickets_df.iloc[86:]["Description"])

552                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     

Ids: (552-592)
should be labeled with 0

Ids: ()
should be labeled with 1

In [49]:
# Label the rows according to the findings
(471, 477-480, 484-486, 489, 492-495, 497, 501)
tickets_df.loc[471, "is_about_order_status"] = 1
tickets_df.loc[477:480, "is_about_order_status"] = 1
tickets_df.loc[484:486, "is_about_order_status"] = 1
tickets_df.loc[489, "is_about_order_status"] = 1
tickets_df.loc[492:495, "is_about_order_status"] = 1
tickets_df.loc[497, "is_about_order_status"] = 1
tickets_df.loc[501, "is_about_order_status"] = 1

In [50]:
tickets_df.loc[tickets_df["is_about_order_status"] == 1].shape

(184, 38)

In [51]:
# We have 184 rows that are of class "where-is-my-order".
# We can now keep only the relevant columns: the message and the target
final_df = tickets_df[["Description", "is_about_order_status"]]
final_df.head()

Unnamed: 0,Description,is_about_order_status
0,"- __EMAIL__ Hi , I have just ordered a pair of __PRODUCT_NAME__ and put the relevant code for free __COMPANY__ bag as new customer . I have received trainers but no bag . Will this be sent separately? \n Seems to be an inefficient system, or was this an oversight?\n Kind regards\n __NAME__",0
1,I am missing a pair of shoes from my order. Could someone please call me asap. I can't wait any longer. We leave this Sunday. Thanks. __NAME__,0
2,I didn'tget a my order - __EMAIL__,1
3,"Hello, I ordered two __PRODUCTS_NAMES and one __PRODUCTS_NAMES_, but only received the two __PRODUCTS_NAMES__ Could you please let me know why the __PRODUCTS_NAMES__ were not sent and when am i likely to received them. My order number is: __ORDER_NUMBER__\n \n\n Regards,\n __NAME__",0
4,My shipment never was delivered. The tracking says it was delivered at 10:23 on Friday but was never received. We have a front desk and should have been received.\n \n\n Please let me know what I need to do to proceed. I have a race in a week,1


In [52]:
# Save the dataframe into a file. 
# We will use it in another notebook to preprocess the actual message now that we have the target label.
final_df.to_csv("../data/labeled_tickets.csv")