# Cleaning Public Support data

## Importing necessary libraries

In [47]:
pip install pandas matplotlib seaborn wordcloud


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.2.2[0m[39;49m -> [0m[32;49m22.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [48]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime,date, timedelta

## Loading the data

In [49]:
# Load the support-channel export; the relative path is resolved against the notebook's working directory.
slack = pd.read_csv('../sources/support-channels.csv')


## Discover

In [50]:
print('Shape of slack dataframe before cleaning:', slack.shape)

Shape of slack dataframe before cleaning: (481, 14)


In [51]:
slack.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 481 entries, 0 to 480
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Channel_ID        481 non-null    object
 1   Channel_Slug      481 non-null    object
 2   Timestamp         481 non-null    object
 3   Timestamp_Thread  368 non-null    object
 4   User_ID           481 non-null    object
 5   Full_Name         470 non-null    object
 6   Email             481 non-null    object
 7   Permalink         481 non-null    object
 8   Text              481 non-null    object
 9   Text_raw          481 non-null    object
 10  Slack_username    481 non-null    object
 11  Team_ID           481 non-null    object
 12  Team_Name         481 non-null    object
 13  Is_Bot            481 non-null    bool  
dtypes: bool(1), object(13)
memory usage: 49.4+ KB


**Creating 2 new columns**

In [52]:
# 1 when the row has no thread timestamp, 0 when it has one.
slack['Is_a_question'] = slack['Timestamp_Thread'].isna().astype(int)

In [53]:
# User_ID values that should be flagged as support agents.
# NOTE(review): the User_ID values shown in this notebook's outputs look like 'U01KGAER1TM',
# so these literals may never match and Is_agent may be 0 everywhere — confirm the intended IDs.
support_agents = ['1','5301']

slack['Is_agent'] = np.where(slack['User_ID'].isin(support_agents), 1, 0)

**Encoding necessary columns**

In [54]:
# Encode the bot flag as 1/0 instead of True/False.
slack['Is_Bot'] = slack['Is_Bot'].eq(True).astype(int)

**Converting timestamp columns**

In [55]:
# Parse the string timestamp columns into new datetime columns; the original string columns are kept.
# NOTE(review): no explicit `format=` is passed, so parsing relies on pandas' inference — confirm the
# export is consistently month-first before trusting these values.
slack['Datetime'] = pd.to_datetime(slack['Timestamp'])
slack['Datetime_Thread'] = pd.to_datetime(slack['Timestamp_Thread'])

**Creating 2 dataframes: questions and answers**

In [56]:
# Split on the question flag: rows flagged 1 go to Q_df, rows flagged 0 go to A_df.
question_flag = slack['Is_a_question']
Q_df = slack.loc[question_flag.eq(1)]
A_df = slack.loc[question_flag.eq(0)]

**Working on Answers dataframe**

In [57]:
# Group the answer rows by author and parsed timestamp, keeping only the Text column.
answers = A_df.groupby(['User_ID','Datetime'])[['Text']]

In [58]:
# Summing a string column concatenates its values, so rows sharing one (User_ID, Datetime) key
# collapse into a single row; reset_index turns the group keys back into ordinary columns.
df_answers = answers.sum().reset_index()

In [59]:
df_answers.head()

Unnamed: 0,User_ID,Datetime,Text
0,U01KGAER1TM,2022-11-04 17:02:51,No se quiere usar un tercero para las fotos
1,U01KGAER1TM,2022-11-04 17:04:57,digamos que son imagenes de usuarios en donde ...
2,U01KGAER1TM,2022-11-04 17:05:02,comentarios y likes
3,U01KGAER1TM,2022-11-04 17:06:30,y la base de datos no solo contiene las imagen...
4,U01KGAER1TM,2022-11-04 17:26:59,https://isn365.com/


In [60]:
# For each row, the gap to the same user's chronologically previous row (NaT for a user's first row).
# The result is assigned back by index label, so the temporary sort does not reorder df_answers.
df_answers['Diff_in_Seconds'] = (df_answers.sort_values('Datetime').groupby('User_ID').Datetime.diff())

In [61]:
# Rows without a previous message from the same user get a gap of zero instead of a missing value.
df_answers['Diff_in_Seconds'] = df_answers['Diff_in_Seconds'].fillna(pd.Timedelta(seconds=0))

In [62]:
# Express the timedelta gap as a float number of seconds.
df_answers['Diff_in_Seconds'] = df_answers['Diff_in_Seconds'].dt.total_seconds()

In [63]:
# Magnitude of the gap in seconds, regardless of sign.
df_answers['diff_abs'] = df_answers['Diff_in_Seconds'].abs()

In [64]:
# Despite its name, same_author is 1 where User_ID differs from the previous row and 0 where it is unchanged.
# shift().bfill() makes the first row compare against its own value, so the first row is always 0.
df_answers['same_author'] = df_answers['User_ID'].ne(df_answers['User_ID'].shift().bfill()).astype(int)

In [65]:
def create_AnswerId(df):
    """Assign a running ``messageId`` to every row of ``df``.

    The id starts at 1 and grows by one on every row whose ``diff_abs``
    exceeds 300 and on every row whose ``same_author`` is 1. Consecutive rows
    that trigger neither condition share the same id.

    The expression is evaluated once over the whole frame. Previously it was
    re-evaluated, unchanged, inside a loop over ``df.groupby(['User_ID'])``;
    that loop did redundant work, iterated a groupby keyed by a one-element
    list (which pandas warns about), and left the column missing when the
    frame was empty.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``diff_abs`` and ``same_author`` columns. Rows are
        presumably ordered so that each author's messages are contiguous and
        chronological — TODO confirm for any other caller.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, with ``messageId`` added in place.
    """
    gap_breaks = df['diff_abs'].gt(300).cumsum()
    author_breaks = df['same_author'].cumsum()
    df['messageId'] = gap_breaks + 1 + author_breaks
    return df
create_AnswerId(df_answers)

  for group in df.groupby(['User_ID']):


Unnamed: 0,User_ID,Datetime,Text,Diff_in_Seconds,diff_abs,same_author,messageId
0,U01KGAER1TM,2022-11-04 17:02:51,No se quiere usar un tercero para las fotos,0.0,0.0,0,1
1,U01KGAER1TM,2022-11-04 17:04:57,digamos que son imagenes de usuarios en donde ...,126.0,126.0,0,1
2,U01KGAER1TM,2022-11-04 17:05:02,comentarios y likes,5.0,5.0,0,1
3,U01KGAER1TM,2022-11-04 17:06:30,y la base de datos no solo contiene las imagen...,88.0,88.0,0,1
4,U01KGAER1TM,2022-11-04 17:26:59,https://isn365.com/,1229.0,1229.0,0,2
...,...,...,...,...,...,...,...
363,UU409472Q,2022-11-11 00:32:33,No te preocupes no es algo que afecte,632178.0,632178.0,0,190
364,UU409472Q,2022-11-11 01:48:09,https://www.notion.so/4geeksacademy/How-to-del...,4536.0,4536.0,0,191
365,UU409472Q,2022-11-11 14:33:32,"Meri , sigue estos pasos https://www.notion.so...",45923.0,45923.0,0,192
366,UU409472Q,2022-11-11 15:27:50,:anguished: intenta ir al dashboard a ver,3258.0,3258.0,0,193


In [66]:
df_answers.shape

(368, 7)

In [67]:
A_df.shape

(368, 18)

**Questions dataframe**

In [68]:
# Group the question rows by author and parsed timestamp, keeping only the Text column.
questions = Q_df.groupby(['User_ID','Datetime'])[['Text']]

In [69]:
# Summing a string column concatenates its values, so rows sharing one (User_ID, Datetime) key
# collapse into a single row; reset_index turns the group keys back into ordinary columns.
df_questions = questions.sum().reset_index()

In [70]:
df_questions.head()

Unnamed: 0,User_ID,Datetime,Text
0,U01KGAER1TM,2022-11-04 16:14:32,"Como estan, tengo un problema, ocupo subir una..."
1,U01SJ480RBR,2022-11-07 16:59:39,Hello
2,U01SM5J4MMG,2022-11-07 21:15:26,"Buenas tardes, intenta poniendo solo git push"
3,U02N1P8CV6W,2022-10-31 22:55:36,"Hola chicos, he estado intentando llevar a pro..."
4,U02NE11UHNC,2022-11-03 22:28:10,have someone imported custom fonts (no google ...


In [71]:
df_questions.shape

(112, 3)

In [72]:
# For each row, the gap to the same user's chronologically previous row (NaT for a user's first row).
# The result is assigned back by index label, so the temporary sort does not reorder df_questions.
df_questions['Diff_in_Seconds'] = (df_questions.sort_values('Datetime').groupby('User_ID').Datetime.diff())

In [73]:
# Rows without a previous message from the same user get a gap of zero instead of a missing value.
df_questions['Diff_in_Seconds'] = df_questions['Diff_in_Seconds'].fillna(pd.Timedelta(seconds=0))

In [74]:
# Express the timedelta gap as a float number of seconds.
df_questions['Diff_in_Seconds'] = df_questions['Diff_in_Seconds'].dt.total_seconds()

In [75]:
# Magnitude of the gap in seconds, regardless of sign.
df_questions['diff_abs'] = df_questions['Diff_in_Seconds'].abs()

In [76]:
# Despite its name, same_author is 1 where User_ID differs from the previous row and 0 where it is unchanged.
# shift().bfill() makes the first row compare against its own value, so the first row is always 0.
df_questions['same_author'] = df_questions['User_ID'].ne(df_questions['User_ID'].shift().bfill()).astype(int)

In [77]:
def create_QuestionId(df):
    """Assign a running ``messageId`` to every row of ``df``.

    The id starts at 1 and grows by one on every row whose ``diff_abs``
    exceeds 300 and on every row whose ``same_author`` is 1. Consecutive rows
    that trigger neither condition share the same id.

    The expression is evaluated once over the whole frame. Previously it was
    re-evaluated, unchanged, inside a loop over ``df.groupby(['User_ID'])``;
    that loop did redundant work, iterated a groupby keyed by a one-element
    list (which pandas warns about), and left the column missing when the
    frame was empty.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``diff_abs`` and ``same_author`` columns. Rows are
        presumably ordered so that each author's messages are contiguous and
        chronological — TODO confirm for any other caller.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, with ``messageId`` added in place.
    """
    over_threshold = df['diff_abs'].gt(300)
    author_changed = df['same_author']
    df['messageId'] = over_threshold.cumsum() + 1 + author_changed.cumsum()
    return df

create_QuestionId(df_questions)

  for group in df.groupby(['User_ID']):


Unnamed: 0,User_ID,Datetime,Text,Diff_in_Seconds,diff_abs,same_author,messageId
0,U01KGAER1TM,2022-11-04 16:14:32,"Como estan, tengo un problema, ocupo subir una...",0.0,0.0,0,1
1,U01SJ480RBR,2022-11-07 16:59:39,Hello,0.0,0.0,1,2
2,U01SM5J4MMG,2022-11-07 21:15:26,"Buenas tardes, intenta poniendo solo git push",0.0,0.0,1,3
3,U02N1P8CV6W,2022-10-31 22:55:36,"Hola chicos, he estado intentando llevar a pro...",0.0,0.0,1,4
4,U02NE11UHNC,2022-11-03 22:28:10,have someone imported custom fonts (no google ...,0.0,0.0,1,5
...,...,...,...,...,...,...,...
107,U04A6KV066M,2022-11-08 20:08:45,"Hi all,",0.0,0.0,1,89
108,U04A6KV066M,2022-11-08 20:13:24,Hi all. I am trying to split my view to where ...,279.0,279.0,0,89
109,U04A6KV066M,2022-11-08 21:58:27,Should we be doing anything with “commit” and ...,6303.0,6303.0,0,90
110,U6MR8LG4Q,2022-11-05 03:11:30,Hey @betomasia12 No need to make the pictures ...,0.0,0.0,1,91


In [78]:
df_questions.shape

(112, 7)

In [79]:
df_questions.head(5)

Unnamed: 0,User_ID,Datetime,Text,Diff_in_Seconds,diff_abs,same_author,messageId
0,U01KGAER1TM,2022-11-04 16:14:32,"Como estan, tengo un problema, ocupo subir una...",0.0,0.0,0,1
1,U01SJ480RBR,2022-11-07 16:59:39,Hello,0.0,0.0,1,2
2,U01SM5J4MMG,2022-11-07 21:15:26,"Buenas tardes, intenta poniendo solo git push",0.0,0.0,1,3
3,U02N1P8CV6W,2022-10-31 22:55:36,"Hola chicos, he estado intentando llevar a pro...",0.0,0.0,1,4
4,U02NE11UHNC,2022-11-03 22:28:10,have someone imported custom fonts (no google ...,0.0,0.0,1,5


**Merge each dataframe back with its original columns**

In [80]:
# Re-attach the remaining answer columns by matching on author, timestamp and text,
# then drop the helper columns that were only needed to derive messageId.
df_answers = (
    df_answers
    .merge(A_df, how='left', on=['User_ID', 'Datetime', 'Text'])
    .drop(columns=['Diff_in_Seconds','diff_abs','same_author','Text_raw'])
)

In [81]:
# Elapsed time between the answer's own timestamp and its parsed thread timestamp.
df_answers['response_time'] = df_answers['Datetime'] - df_answers['Datetime_Thread']

In [82]:
df_answers.shape

(368, 19)

In [83]:
# Re-attach the remaining question columns by matching on author, timestamp and text,
# then drop the helper columns that were only needed to derive messageId.
df_questions = (
    df_questions
    .merge(Q_df, how='left', on=['User_ID', 'Datetime', 'Text'])
    .drop(columns=['Diff_in_Seconds','diff_abs','same_author','Text_raw'])
)

In [84]:
df_questions.shape

(112, 18)

**Merge text and timestamps in rows that have the same messageId**

In [85]:
# Within every messageId group, give each row the space-joined text of the whole group.
df_answers['Text'] = df_answers.groupby('messageId')['Text'].transform(' '.join)
df_questions['Text'] = df_questions.groupby('messageId')['Text'].transform(' '.join)

In [86]:
# Remove, in place, every column that is missing for all question rows.
df_questions.dropna(axis=1, how='all', inplace=True)
# Give each row the comma-separated list of all Timestamp values in its messageId group;
# this joined string is what later serves as the question's identifier.
df_questions['Timestamp'] = df_questions.groupby(['messageId'])['Timestamp'].transform(lambda x : ','.join(map(str, x)))

In [87]:
# Rename the timestamp columns so they read as identifiers in both dataframes.
answer_column_names = {"Timestamp": "Timestamp_id", "Timestamp_Thread": "Timestamp_Question"}
question_column_names = {"Timestamp": "Timestamp_id"}
df_answers.rename(columns=answer_column_names, inplace=True)
df_questions.rename(columns=question_column_names, inplace=True)

In [88]:
# Every row of a messageId group carries the same merged Text at this point, so keep one row per group.
# Answers are de-duplicated on messageId rather than on Text alone: keying on Text would also discard
# unrelated answers that merely share identical wording (for example two different short "thanks" replies).
df_answers = df_answers.drop_duplicates(subset=["messageId"],keep='first')
df_questions = df_questions.drop_duplicates(subset=["Text","Timestamp_id"],keep='first')

In [89]:
# Collect the Timestamp_id value of every question row into a plain Python list.
question_ids_list = df_questions['Timestamp_id'].tolist()

In [90]:
def id_autocompletion(search, candidates=None):
    """Return the question id whose timestamp list contains ``search``.

    Each candidate id is a comma-separated string of timestamps. ``search``
    matches a candidate only when it equals one of those timestamps exactly.
    A plain substring test is not sufficient here, because for example
    ``'1/2/2022 3:42:10'`` is a substring of ``'11/2/2022 3:42:10'`` and would
    attach an answer to the wrong question.

    Parameters
    ----------
    search : str
        A single timestamp string. Non-string values (such as NaN for a
        missing thread timestamp) never match and yield ``None`` rather than
        raising ``TypeError``.
    candidates : iterable of str, optional
        Ids to search. When omitted, the notebook-level ``question_ids_list``
        is used.

    Returns
    -------
    str or None
        The first matching candidate, or ``None`` when nothing matches.
    """
    if not isinstance(search, str):
        return None
    if candidates is None:
        candidates = question_ids_list
    for candidate in candidates:
        # Compare against whole timestamps, not arbitrary substrings.
        if search in str(candidate).split(','):
            return candidate
    return None

In [91]:
# Replace each answer's thread timestamp with the matching question id; rows with no match become None.
df_answers['Timestamp_Question'] =  df_answers['Timestamp_Question'].apply(id_autocompletion)

In [92]:
#verifying it worked
df_answers[df_answers['Timestamp_Question'].str.len()>20][['User_ID','Timestamp_id','Timestamp_Question']]

Unnamed: 0,User_ID,Timestamp_id,Timestamp_Question
37,U02TNKBLUGY,11/9/2022 15:42:58,"11/9/2022 15:38:15,11/9/2022 15:41:12"
41,U02VD5L7J1W,10/29/2022 23:57:41,"10/29/2022 23:52:40,10/29/2022 23:53:16,10/29/..."
65,U035T2JTPC2,11/2/2022 13:51:38,"11/2/2022 3:42:10,11/2/2022 3:45:20"
80,U035T2JTPC2,11/3/2022 23:33:32,"11/3/2022 22:28:10,11/3/2022 22:29:25"
81,U035T2JTPC2,11/4/2022 3:25:28,"11/3/2022 22:28:10,11/3/2022 22:29:25"
100,U035YA2SV4N,11/8/2022 19:58:51,"11/7/2022 16:59:27,11/7/2022 17:01:38"
101,U037GJJHKNZ,11/11/2022 2:50:09,"11/10/2022 22:34:28,11/10/2022 22:34:56"
166,U03T95TLQMC,11/8/2022 21:06:03,"11/8/2022 21:05:25,11/8/2022 21:05:32,11/8/202..."
194,U041K795UFN,11/8/2022 20:36:10,"11/8/2022 20:08:45,11/8/2022 20:13:24"
195,U041K795UFN,11/9/2022 15:18:09,"11/8/2022 20:08:45,11/8/2022 20:13:24"


In [None]:
# Save the cleaned questions and answers as CSV files.
# No `index=` argument is passed, so the DataFrame index is written as an unnamed first column.
# The code does not create the ../output directory; it is expected to exist already.

df_questions.to_csv('../output/questions.csv')

df_answers.to_csv('../output/answers.csv')