In [4]:
import pandas as pd
import numpy as np

In [5]:
# Load the ticket dataset from the project-relative data folder.
df3 = pd.read_csv(filepath_or_buffer="data/dataset3.csv")

# Report (rows, columns), then preview the first five records.
print(df3.shape)
df3.head(n=5)


(6429, 11)


Unnamed: 0,ticket_id,customer_id,created_at,channel,priority,issue_category,ticket_text,sentiment,first_response_hours,resolution_hours,resolved
0,200001,C100000,2024-05-12T23:17,Chat,Medium,product_performance,Dashboard loads very slowly during peak hours.,0.44,0.11,14.88,1
1,200002,C100001,2024-06-28T18:27,Email,Low,product_performance,Dashboard loads very slowly during peak hours.,-0.5,8.12,9.37,1
2,200003,C100002,2024-11-25T16:38,Email,Medium,product_usability,The new dashboard is confusing for my staff.,-0.5,2.97,31.65,1
3,200004,C100003,2024-11-02T14:37,Phone,High,billing_admin,I need to change our billing email and can't f...,-0.11,0.1,12.49,1
4,200005,C100004,2024-10-17T22:27,Phone,Medium,product_performance,Inventory sync takes too long to complete.,0.2,5.19,35.67,1


In [8]:
# Missing-value audit: number of nulls per column, largest count first.
missing_counts = (
    df3.isnull()
    .sum(axis=0)
    .sort_values(ascending=False)
)

# Only the 15 columns with the most nulls are printed.
print(missing_counts.iloc[:15])

ticket_id               0
customer_id             0
created_at              0
channel                 0
priority                0
issue_category          0
ticket_text             0
sentiment               0
first_response_hours    0
resolution_hours        0
resolved                0
dtype: int64


In [9]:
# Rows that repeat an earlier row in every column.
full_row_dupes = df3.duplicated()
print("Duplicate rows:", int(full_row_dupes.sum()))

# Rows whose ticket_id has already appeared earlier in the frame.
ticket_id_dupes = df3.duplicated(subset=["ticket_id"])
print("Duplicate ticket_id:", int(ticket_id_dupes.sum()))

# Optional: suspicious duplicates, i.e. the same customer submitting the same
# text at the same timestamp under a different ticket_id.
suspicious_key = ["customer_id", "created_at", "ticket_text"]
suspicious_dupes = df3.duplicated(subset=suspicious_key)
print("Dupes by (customer_id, created_at, ticket_text):",
      int(suspicious_dupes.sum()))


Duplicate rows: 0
Duplicate ticket_id: 0
Dupes by (customer_id, created_at, ticket_text): 0


In [11]:
# Parse created_at into a datetime column. errors="coerce" turns values that
# cannot be parsed into NaT instead of raising, so they can be counted below.
df3["created_dt"] = pd.to_datetime(df3["created_at"], errors="coerce")

# Count only values that were present but failed to parse. A missing created_at
# also ends up as NaT, and counting it here would report absent data as
# malformed; the categorical and numeric validity checks in this notebook
# exclude missing values in the same way (`notna() & ...`).
created_at_unparseable = df3["created_at"].notna() & df3["created_dt"].isna()
print("Invalid created_at timestamps:", int(created_at_unparseable.sum()))


Invalid created_at timestamps: 0


In [15]:
# Allowed vocabularies for the three categorical columns.
valid_channels = {"Chat", "Email", "Phone", "In-App"}
valid_priorities = {"Low", "Medium", "High", "Urgent"}
valid_issue_categories = {
    "product_performance",
    "product_usability",
    "billing_admin",
    "sales_expectation",
}

# A row is flagged when its value is present but not in the allowed set.
# Written via De Morgan: "present AND NOT allowed" == "NOT (missing OR allowed)",
# so missing values are never flagged here.
df3["channel_invalid"] = ~(
    df3["channel"].isna() | df3["channel"].isin(valid_channels)
)
df3["priority_invalid"] = ~(
    df3["priority"].isna() | df3["priority"].isin(valid_priorities)
)
df3["issue_category_invalid"] = ~(
    df3["issue_category"].isna() | df3["issue_category"].isin(valid_issue_categories)
)

# Summarise how many rows failed each vocabulary check.
channel_bad = int(df3["channel_invalid"].sum())
priority_bad = int(df3["priority_invalid"].sum())
issue_category_bad = int(df3["issue_category_invalid"].sum())

print("Invalid channel:", channel_bad)
print("Invalid priority:", priority_bad)
print("Invalid issue_category:", issue_category_bad)


Invalid channel: 0
Invalid priority: 0
Invalid issue_category: 0


In [17]:
# Force the three metric columns to a numeric dtype; entries that cannot be
# parsed become NaN rather than raising.
for metric in ("sentiment", "first_response_hours", "resolution_hours"):
    df3[metric] = pd.to_numeric(df3[metric], errors="coerce")

# Durations must not be negative (NaN rows are never flagged).
df3["first_response_negative"] = (
    df3["first_response_hours"].lt(0) & df3["first_response_hours"].notna()
)
df3["resolution_negative"] = (
    df3["resolution_hours"].lt(0) & df3["resolution_hours"].notna()
)

# Sentiment is expected to lie in the closed interval [-1, 1]; flag any
# non-missing value strictly below -1 or strictly above 1.
df3["sentiment_out_of_range"] = df3["sentiment"].notna() & (
    df3["sentiment"].lt(-1) | df3["sentiment"].gt(1)
)

print("first_response_hours negative:", int(df3["first_response_negative"].sum()))
print("resolution_hours negative:", int(df3["resolution_negative"].sum()))
print("sentiment out of [-1,1]:", int(df3["sentiment_out_of_range"].sum()))

# Show up to ten of the out-of-range sentiment rows for inspection.
df3.loc[
    df3["sentiment_out_of_range"],
    ["ticket_id", "sentiment", "ticket_text"],
].iloc[:10]


first_response_hours negative: 0
resolution_hours negative: 0
sentiment out of [-1,1]: 7


Unnamed: 0,ticket_id,sentiment,ticket_text
1501,201502,-1.03,Inventory sync takes too long to complete.
1727,201728,-1.06,Inventory sync takes too long to complete.
3191,203192,1.03,We expected multi-warehouse support out of the...
3622,203623,1.02,We were charged for extra users we don't have.
3640,203641,-1.23,Dashboard loads very slowly during peak hours.
4483,204484,-1.01,Pricing discussed in the demo does not match o...
6160,206161,-1.01,Sales promised a custom report that doesn't ex...


In [18]:
# resolved should be binary 0/1. Coerce to numeric first so that any
# non-numeric entry becomes NaN instead of raising.
df3["resolved"] = pd.to_numeric(df3["resolved"], errors="coerce")
df3["resolved_invalid"] = df3["resolved"].notna() & ~df3["resolved"].isin([0, 1])
print("resolved invalid:", int(df3["resolved_invalid"].sum()))

# first response should not exceed resolution time (when both exist)
df3["resp_gt_resolution"] = (
    df3["first_response_hours"].notna()
    & df3["resolution_hours"].notna()
    & (df3["first_response_hours"] > df3["resolution_hours"])
)
print("first_response_hours > resolution_hours:", int(df3["resp_gt_resolution"].sum()))

# Inspect the offending rows. This must be the LAST expression of the cell:
# Jupyter only renders the final expression, so a bare `df3.loc[...]` placed
# between the two checks is evaluated and silently discarded. Both flags are
# included so the reader can see which check each row failed.
df3.loc[
    df3["resolved_invalid"] | df3["resp_gt_resolution"],
    [
        "ticket_id",
        "resolved",
        "first_response_hours",
        "resolution_hours",
        "resolved_invalid",
        "resp_gt_resolution",
    ],
].head(10)


resolved invalid: 0
first_response_hours > resolution_hours: 0


In [19]:
# Keep resolution_hours only on rows where resolved == 1; every other row
# (including rows where resolved is NaN) gets NaN.
is_resolved = df3["resolved"] == 1
df3["resolution_hours_closed"] = df3["resolution_hours"].where(is_resolved)

# Convenience date fields derived from the parsed timestamp:
#   created_date  -> calendar date of the ticket
#   created_month -> monthly period rendered as a string
created_parts = df3["created_dt"].dt
df3["created_date"] = created_parts.date
df3["created_month"] = created_parts.to_period("M").astype(str)


In [23]:
# Boolean helper columns produced by the validation cells.
drop_cols = [
    "channel_invalid",
    "priority_invalid",
    "issue_category_invalid",
    "first_response_negative",
    "resolution_negative",
    "sentiment_out_of_range",
    "resolved_invalid",
    "resp_gt_resolution",
]

# errors="ignore" skips any listed column that is not present, so only the
# flags that actually exist are removed and the cell can be re-run.
df3 = df3.drop(columns=drop_cols, errors="ignore").copy()

# Display the rows where resolved is 0.
df3.loc[df3["resolved"].eq(0)]


Unnamed: 0,ticket_id,customer_id,created_at,channel,priority,issue_category,ticket_text,sentiment,first_response_hours,resolution_hours,resolved,created_dt,resolution_hours_closed,created_date,created_month
13,200014,C100006,2024-08-21T01:51,Phone,Medium,product_performance,Inventory sync takes too long to complete.,0.06,7.18,27.32,0,2024-08-21 01:51:00,,2024-08-21,2024-08
45,200046,C100021,2024-12-28T03:56,Chat,Medium,product_usability,Search results are not clearly sorted.,0.24,18.44,20.94,0,2024-12-28 03:56:00,,2024-12-28,2024-12
71,200072,C100042,2024-04-08T13:04,Phone,Medium,product_usability,Bulk upload flow is not intuitive.,0.17,5.58,11.66,0,2024-04-08 13:04:00,,2024-04-08,2024-04
80,200081,C100047,2024-07-09T12:15,Chat,Medium,billing_admin,VAT appears to be calculated incorrectly.,-0.11,2.44,14.32,0,2024-07-09 12:15:00,,2024-07-09,2024-07
84,200085,C100047,2024-05-27T23:55,In-App,Medium,billing_admin,I need to change our billing email and can't f...,-0.53,2.54,9.71,0,2024-05-27 23:55:00,,2024-05-27,2024-05
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6341,206342,C102968,2024-03-15T07:15,In-App,Low,product_performance,The reporting page keeps timing out.,-0.48,1.33,2.37,0,2024-03-15 07:15:00,,2024-03-15,2024-03
6374,206375,C102982,2024-05-02T04:36,In-App,Low,sales_expectation,Pricing discussed in the demo does not match o...,-0.01,1.45,9.96,0,2024-05-02 04:36:00,,2024-05-02,2024-05
6386,206387,C102986,2024-05-18T00:08,In-App,High,product_performance,Dashboard loads very slowly during peak hours.,-0.14,17.62,23.01,0,2024-05-18 00:08:00,,2024-05-18,2024-05
6390,206391,C102988,2024-02-14T09:52,Phone,Medium,billing_admin,I need to change our billing email and can't f...,0.38,0.71,13.43,0,2024-02-14 09:52:00,,2024-02-14,2024-02
