# Cleaning Public Support data

## Importing necessary libraries

In [1]:
pip install pandas matplotlib seaborn wordcloud

Collecting matplotlib
  Downloading matplotlib-3.6.2-cp38-cp38-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (9.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.4/9.4 MB[0m [31m79.8 MB/s[0m eta [36m0:00:00[0m:00:01[0m0:01[0m
[?25hCollecting seaborn
  Downloading seaborn-0.12.2-py3-none-any.whl (293 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m293.3/293.3 kB[0m [31m85.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting wordcloud
  Downloading wordcloud-1.8.2.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (458 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m459.0/459.0 kB[0m [31m100.3 MB/s[0m eta [36m0:00:00[0m
Collecting contourpy>=1.0.1
  Downloading contourpy-1.0.6-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (295 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m296.0/296.0 kB[0m [31m58.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting cycler>=0.10
  Downloading cycler-0

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime,date, timedelta

## Loading the data

In [4]:
slack = pd.read_csv('../sources/support_channels.csv')


## Discover

In [5]:
print('Shape of slack dataframe before cleaning:', slack.shape)

Shape of slack dataframe before cleaning: (500, 14)


In [6]:
slack.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Channel_ID        500 non-null    object
 1   Channel_Slug      500 non-null    object
 2   Timestamp         500 non-null    object
 3   Timestamp_Thread  337 non-null    object
 4   User_ID           500 non-null    object
 5   Full_Name         498 non-null    object
 6   Email             500 non-null    object
 7   Permalink         500 non-null    object
 8   Text              500 non-null    object
 9   Text_raw          491 non-null    object
 10  Slack_username    500 non-null    object
 11  Team_ID           500 non-null    object
 12  Team_Name         500 non-null    object
 13  Is_Bot            500 non-null    bool  
dtypes: bool(1), object(13)
memory usage: 51.4+ KB


**Creating 2 new columns**

In [7]:
slack['Is_a_question'] = np.where(slack['Timestamp_Thread'].isnull(), 1, 0)

In [8]:
support_agent_ids = ['1','5301']
support_agent_name = ['Alejandro Sanchez','Tomas Gonzalez']

slack['Is_agent'] = np.where(slack['User_ID'].isin(support_agent_ids) | slack['Full_Name'].isin(support_agent_name), 1, 0)

**Encoding necessary columns**

In [9]:
slack['Is_Bot'] = np.where(slack['Is_Bot'] == True, 1, 0)

**Converting timestamp columns**

In [10]:
slack['Datetime'] = pd.to_datetime(slack['Timestamp'])
slack['Datetime_Thread'] = pd.to_datetime(slack['Timestamp_Thread'])

**Creating 2 dataframes: questions and answers**

In [11]:
Q_df = slack[slack['Is_a_question'] == 1]
A_df = slack[slack['Is_a_question'] == 0]

**Working on Answers dataframe**

In [12]:
answers = A_df.groupby(['User_ID','Datetime'])[['Text']]

In [13]:
df_answers = pd.DataFrame(answers.sum().reset_index())

In [14]:
df_answers.head()

Unnamed: 0,User_ID,Datetime,Text
0,U015XDM7KRA,2022-11-25 14:32:52,no te sirve redux o el context api ? por que f...
1,U01GE2A7HV4,2022-12-09 22:13:54,The statement indicates that you must give val...
2,U01GE2A7HV4,2022-12-09 22:19:48,Hint: In line 18 you have the element that you...
3,U01PE4CNXN1,2022-11-30 11:33:40,"Yo te recomiendo netifly, es muy fácil de usar"
4,U01PE4CNXN1,2022-11-30 11:39:41,"Creo que sí, pero ahora mismo me haces dudar..."


In [15]:
df_answers['Diff_in_Seconds'] = (df_answers.sort_values('Datetime').groupby('User_ID').Datetime.diff())

In [16]:
df_answers['Diff_in_Seconds'] = df_answers['Diff_in_Seconds'].fillna(pd.Timedelta(seconds=0))

In [17]:
df_answers['Diff_in_Seconds'] = df_answers['Diff_in_Seconds']/np.timedelta64(1,'s')

In [18]:
df_answers['diff_abs'] = df_answers.Diff_in_Seconds.abs()

In [19]:
df_answers['same_author'] = df_answers['User_ID'].ne(df_answers['User_ID'].shift().bfill()).astype(int)

In [20]:
def create_AnswerId(df):
    for group in df.groupby(['User_ID']):
        df['messageId'] = df.diff_abs.gt(300).cumsum() + 1 + df.same_author.cumsum()
    return df
create_AnswerId(df_answers)

  for group in df.groupby(['User_ID']):


Unnamed: 0,User_ID,Datetime,Text,Diff_in_Seconds,diff_abs,same_author,messageId
0,U015XDM7KRA,2022-11-25 14:32:52,no te sirve redux o el context api ? por que f...,0.0,0.0,0,1
1,U01GE2A7HV4,2022-12-09 22:13:54,The statement indicates that you must give val...,0.0,0.0,1,2
2,U01GE2A7HV4,2022-12-09 22:19:48,Hint: In line 18 you have the element that you...,354.0,354.0,0,3
3,U01PE4CNXN1,2022-11-30 11:33:40,"Yo te recomiendo netifly, es muy fácil de usar",0.0,0.0,1,4
4,U01PE4CNXN1,2022-11-30 11:39:41,"Creo que sí, pero ahora mismo me haces dudar...",361.0,361.0,0,5
...,...,...,...,...,...,...,...
332,UPRFXC3QE,2022-12-06 21:10:39,There should be a free version. Check the othe...,1133.0,1133.0,0,234
333,UPRFXC3QE,2022-12-07 16:28:05,I dont know fly that well. You would need to r...,69446.0,69446.0,0,235
334,UPRFXC3QE,2022-12-09 13:47:04,Hi. So the error is showing you do not have t...,163139.0,163139.0,0,236
335,UU409472Q,2022-11-02 11:33:54,"hay dos opciones\ne.key o e.keycode , en ambas...",0.0,0.0,1,237


In [21]:
df_answers.shape

(337, 7)

In [22]:
A_df.shape

(337, 18)

**Questions dataframe**

In [26]:
questions = Q_df.groupby(['User_ID','Datetime'])[['Text']]

In [27]:
df_questions = pd.DataFrame(questions.sum().reset_index())

In [28]:
df_questions.head()

Unnamed: 0,User_ID,Datetime,Text
0,U01J1EA8JN7,2022-12-07 14:42:41,Gente alguien sabe como se le puede agregar a ...
1,U01SJ480RBR,2022-11-07 16:59:39,Hello
2,U01SM5J4MMG,2022-11-07 21:15:26,"Buenas tardes, intenta poniendo solo git push"
3,U026XFJHQPK,2022-12-14 13:34:18,"Hello, good afternoon for the people in europe..."
4,U02G5B470B1,2022-11-15 19:07:30,"Hello, where do I found the webpacks options a..."


In [29]:
df_questions.shape

(162, 3)

In [30]:
df_questions['Diff_in_Seconds'] = (df_questions.sort_values('Datetime').groupby('User_ID').Datetime.diff())

In [31]:
df_questions['Diff_in_Seconds'] = df_questions['Diff_in_Seconds'].fillna(pd.Timedelta(seconds=0))

In [32]:
df_questions['Diff_in_Seconds']=df_questions['Diff_in_Seconds']/np.timedelta64(1,'s')

In [33]:
df_questions['diff_abs'] = df_questions.Diff_in_Seconds.abs()

In [34]:
df_questions['same_author'] = df_questions['User_ID'].ne(df_questions['User_ID'].shift().bfill()).astype(int)

In [35]:
def create_QuestionId(df):
    for group in df.groupby(['User_ID']):
        df['messageId'] = df['diff_abs'].gt(300).cumsum() + 1 + df.same_author.cumsum()
    return df

create_QuestionId(df_questions)

  for group in df.groupby(['User_ID']):


Unnamed: 0,User_ID,Datetime,Text,Diff_in_Seconds,diff_abs,same_author,messageId
0,U01J1EA8JN7,2022-12-07 14:42:41,Gente alguien sabe como se le puede agregar a ...,0.0,0.0,0,1
1,U01SJ480RBR,2022-11-07 16:59:39,Hello,0.0,0.0,1,2
2,U01SM5J4MMG,2022-11-07 21:15:26,"Buenas tardes, intenta poniendo solo git push",0.0,0.0,1,3
3,U026XFJHQPK,2022-12-14 13:34:18,"Hello, good afternoon for the people in europe...",0.0,0.0,1,4
4,U02G5B470B1,2022-11-15 19:07:30,"Hello, where do I found the webpacks options a...",0.0,0.0,1,5
...,...,...,...,...,...,...,...
157,U04GCSDB561,2023-01-02 17:54:14,Hola buenas tardes a todos. Estoy en el primer...,263262.0,263262.0,0,135
158,U04GCSDB561,2023-01-03 15:35:26,"Hola, buenos dias. Estoy tratando de que mis i...",78072.0,78072.0,0,136
159,U04GEJB6GBX,2023-01-02 07:47:22,Hola! Estoy teniendo problemas para hacer un c...,0.0,0.0,1,137
160,U6MR8LG4Q,2022-11-05 03:11:30,Hey @betomasia12 No need to make the pictures ...,0.0,0.0,1,138


In [36]:
df_questions.shape

(162, 7)

In [33]:
df_questions.head(5)

Unnamed: 0,User_ID,Datetime,Text,Diff_in_Seconds,diff_abs,same_author,messageId
0,U01KGAER1TM,2022-11-04 16:14:32,"Como estan, tengo un problema, ocupo subir una...",0.0,0.0,0,1
1,U01SJ480RBR,2022-11-07 16:59:39,Hello,0.0,0.0,1,2
2,U01SM5J4MMG,2022-11-07 21:15:26,"Buenas tardes, intenta poniendo solo git push",0.0,0.0,1,3
3,U02N1P8CV6W,2022-10-31 22:55:36,"Hola chicos, he estado intentando llevar a pro...",0.0,0.0,1,4
4,U02NE11UHNC,2022-11-03 22:28:10,have someone imported custom fonts (no google ...,0.0,0.0,1,5


**Merge each dataframe to its previous columns**

In [23]:
df_answers = df_answers.merge(A_df, how = 'left', left_on = ['User_ID', 'Datetime', 'Text'],
    right_on = ['User_ID', 'Datetime', 'Text']).drop(['Diff_in_Seconds','diff_abs','same_author','Text_raw'], axis=1)

In [24]:
df_answers['Response_time'] = df_answers['Datetime'] - df_answers['Datetime_Thread']

In [25]:
df_answers.shape

(337, 19)

In [37]:
df_questions = df_questions.merge(Q_df, how = 'left', left_on = ['User_ID', 'Datetime', 'Text'],
    right_on = ['User_ID', 'Datetime', 'Text']).drop(['Diff_in_Seconds','diff_abs','same_author','Text_raw'], axis=1)

In [38]:
df_questions.shape

(162, 18)

**Merge text and timestamps in rows that have the same messageId**

In [39]:
df_answers['Text'] = df_answers.groupby(['messageId'])['Text'].transform(lambda x : ' '.join(x))
df_questions['Text'] = df_questions.groupby(['messageId'])['Text'].transform(lambda x : ' '.join(x))

In [40]:
df_questions.dropna(axis=1, how='all', inplace=True)
df_questions['Timestamp'] = df_questions.groupby(['messageId'])['Timestamp'].transform(lambda x : ','.join(map(str, x)))

In [41]:
#rename to ids in both dataframes
df_questions.rename(columns={"Timestamp": "Question_ID", "Text":"Question_Text"}, inplace=True)
df_answers.rename(columns={"Timestamp": "Answer_ID", "Timestamp_Thread": "Key_to_Question_ID",
                    "User_ID":"Answer_User_ID","Full_Name":"Answer_Full_Name","Email":"Answer_email","Text":"Answer_Text","Is_agent":"Answer_from_Agent",
                    "Datetime":"Answer_Datetime", "Datetime_Thread":"Answer_Dt_Thread"},inplace=True)

In [42]:
#Drop duplicates
df_questions = df_questions.drop_duplicates(subset=["Question_Text","Question_ID"],keep='first')
df_answers = df_answers.drop_duplicates(subset=["Answer_Text"],keep='first')

In [43]:
df_answers.shape

(235, 19)

In [44]:
df_questions.shape

(139, 16)

### Combining both dataframes

In [45]:
#create a list from df3 question_id column
question_ids_list = df_questions['Question_ID'].tolist()

In [46]:
def id_autocompletion(search):
    for id in question_ids_list:
        if search in id:
            return id
    
    return None

In [47]:
df_answers['Key_to_Question_ID'] =  df_answers['Key_to_Question_ID'].apply(id_autocompletion)

In [48]:
df_answers['Key_to_Question_ID'] =  np.where(df_answers['Key_to_Question_ID'].isnull(), str(df_answers['Answer_Dt_Thread']), df_answers['Key_to_Question_ID'])

In [49]:
df_answers.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 235 entries, 0 to 335
Data columns (total 19 columns):
 #   Column              Non-Null Count  Dtype          
---  ------              --------------  -----          
 0   Answer_User_ID      235 non-null    object         
 1   Answer_Datetime     235 non-null    datetime64[ns] 
 2   Answer_Text         235 non-null    object         
 3   messageId           235 non-null    int64          
 4   Channel_ID          235 non-null    object         
 5   Channel_Slug        235 non-null    object         
 6   Answer_ID           235 non-null    object         
 7   Key_to_Question_ID  235 non-null    object         
 8   Answer_Full_Name    233 non-null    object         
 9   Answer_email        235 non-null    object         
 10  Permalink           235 non-null    object         
 11  Slack_username      235 non-null    object         
 12  Team_ID             235 non-null    object         
 13  Team_Name           235 non-null   

In [50]:
#verifying it worked
df_answers[['Answer_User_ID','Answer_ID','Key_to_Question_ID', 'Answer_Datetime', 'Answer_Dt_Thread']]

Unnamed: 0,Answer_User_ID,Answer_ID,Key_to_Question_ID,Answer_Datetime,Answer_Dt_Thread
0,U015XDM7KRA,11/25/2022 14:32:52,0 2022-11-25 11:26:19\n1 2022-12-09 21...,2022-11-25 14:32:52,2022-11-25 11:26:19
1,U01GE2A7HV4,12/9/2022 22:13:54,0 2022-11-25 11:26:19\n1 2022-12-09 21...,2022-12-09 22:13:54,2022-12-09 21:31:44
2,U01GE2A7HV4,12/9/2022 22:19:48,0 2022-11-25 11:26:19\n1 2022-12-09 21...,2022-12-09 22:19:48,2022-12-09 21:31:44
3,U01PE4CNXN1,11/30/2022 11:33:40,11/30/2022 10:03:47,2022-11-30 11:33:40,2022-11-30 10:03:47
4,U01PE4CNXN1,11/30/2022 11:39:41,11/30/2022 10:03:47,2022-11-30 11:39:41,2022-11-30 10:03:47
...,...,...,...,...,...
331,UPRFXC3QE,12/6/2022 20:51:46,12/4/2022 0:46:13,2022-12-06 20:51:46,2022-12-04 00:46:13
332,UPRFXC3QE,12/6/2022 21:10:39,12/4/2022 0:46:13,2022-12-06 21:10:39,2022-12-04 00:46:13
333,UPRFXC3QE,12/7/2022 16:28:05,12/4/2022 0:46:13,2022-12-07 16:28:05,2022-12-04 00:46:13
334,UPRFXC3QE,12/9/2022 13:47:04,12/9/2022 2:34:55,2022-12-09 13:47:04,2022-12-09 02:34:55


In [49]:
#Saving to csv

df_questions.to_csv('../output/questions.csv')

df_answers.to_csv('../output/answers.csv')

In [51]:
final_df = pd.merge(df_questions, df_answers[['Answer_User_ID','Answer_Full_Name','Answer_email','Answer_from_Agent','Answer_Text','Answer_ID','Key_to_Question_ID',
            'Answer_Datetime','Answer_Dt_Thread','Response_time']], how = 'left', left_on = ['Question_ID'], right_on = ['Key_to_Question_ID'])

In [52]:
final_df.shape

(213, 26)

In [53]:
final_df

Unnamed: 0,User_ID,Datetime,Question_Text,messageId,Channel_ID,Channel_Slug,Question_ID,Full_Name,Email,Permalink,...,Answer_User_ID,Answer_Full_Name,Answer_email,Answer_from_Agent,Answer_Text,Answer_ID,Key_to_Question_ID,Answer_Datetime,Answer_Dt_Thread,Response_time
0,U01J1EA8JN7,2022-12-07 14:42:41,Gente alguien sabe como se le puede agregar a ...,1,CAZ9W99U4,public-support-full-stack,12/7/2022 14:42:41,LucasVY,Lucas.varasy@gmail.com,https://4geeksacademy.slack.com/archives/CAZ9W...,...,,,,,,,,NaT,NaT,NaT
1,U01SJ480RBR,2022-11-07 16:59:39,Hello,2,CAZ9W99U4,public-support-full-stack,11/7/2022 16:59:39,Maikol Moreira,Niukeitor@hotmail.com,https://4geeksacademy.slack.com/archives/CAZ9W...,...,,,,,,,,NaT,NaT,NaT
2,U01SM5J4MMG,2022-11-07 21:15:26,"Buenas tardes, intenta poniendo solo git push",3,CAZ9W99U4,public-support-full-stack,11/7/2022 21:15:26,Alvaro Javier Chagas Capurro,alvarojavierchagas@hotmail.com,https://4geeksacademy.slack.com/archives/CAZ9W...,...,U01SM5J4MMG,Alvaro Javier Chagas Capurro,alvarojavierchagas@hotmail.com,0.0,BUen dia pudiste solucionar?? Podrias pasarme ...,11/8/2022 14:09:39,11/7/2022 21:15:26,2022-11-08 14:09:39,2022-11-07 21:15:26,0 days 16:54:13
3,U026XFJHQPK,2022-12-14 13:34:18,"Hello, good afternoon for the people in europe...",4,CAZ9W99U4,public-support-full-stack,12/14/2022 13:34:18,Nicola Martinez Clemente,nicoenrique_1994@hotmail.com,https://4geeksacademy.slack.com/archives/CAZ9W...,...,,,,,,,,NaT,NaT,NaT
4,U02G5B470B1,2022-11-15 19:07:30,"Hello, where do I found the webpacks options a...",5,CAZ9W99U4,public-support-full-stack,11/15/2022 19:07:30,Facundo Gul dos Santos,facundogds@gmail.com,https://4geeksacademy.slack.com/archives/CAZ9W...,...,U02G5B470B1,Facundo Gul dos Santos,facundogds@gmail.com,0.0,"Thanks, mate, it's was so simple but I've forg...",11/15/2022 19:28:18,11/15/2022 19:07:30,2022-11-15 19:28:18,2022-11-15 19:07:30,0 days 00:20:48
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
208,U04GCSDB561,2023-01-03 15:35:26,"Hola, buenos dias. Estoy tratando de que mis i...",136,CAZ9W99U4,public-support-full-stack,1/3/2023 15:35:26,Miguel Ramirez,miguelramirez2210@gmail.com,https://4geeksacademy.slack.com/archives/CAZ9W...,...,U04DZEKGP28,Hans Aparicio,hansaparicio@gmail.com,0.0,a tu orden avisame si necesitas mas ayuda,1/3/2023 16:04:10,1/3/2023 15:35:26,2023-01-03 16:04:10,2023-01-03 15:35:26,0 days 00:28:44
209,U04GCSDB561,2023-01-03 15:35:26,"Hola, buenos dias. Estoy tratando de que mis i...",136,CAZ9W99U4,public-support-full-stack,1/3/2023 15:35:26,Miguel Ramirez,miguelramirez2210@gmail.com,https://4geeksacademy.slack.com/archives/CAZ9W...,...,U04GCSDB561,Miguel Ramirez,miguelramirez2210@gmail.com,0.0,Gracias Hans!,1/3/2023 16:03:06,1/3/2023 15:35:26,2023-01-03 16:03:06,2023-01-03 15:35:26,0 days 00:27:40
210,U04GEJB6GBX,2023-01-02 07:47:22,Hola! Estoy teniendo problemas para hacer un c...,137,CAZ9W99U4,public-support-full-stack,1/2/2023 7:47:22,Eduardo Andrade,edu2andrade@gmail.com,https://4geeksacademy.slack.com/archives/CAZ9W...,...,U04GEJB6GBX,Eduardo Andrade,edu2andrade@gmail.com,0.0,Ya esta arreglado :man-facepalming: jaja,1/2/2023 10:47:50,1/2/2023 7:47:22,2023-01-02 10:47:50,2023-01-02 07:47:22,0 days 03:00:28
211,U6MR8LG4Q,2022-11-05 03:11:30,Hey @betomasia12 No need to make the pictures ...,138,CAZ9W99U4,public-support-full-stack,11/5/2022 3:11:30,Marcelo Ricigliano,mricigliano@4geeksacademy.com,https://4geeksacademy.slack.com/archives/CAZ9W...,...,,,,,,,,NaT,NaT,NaT


In [51]:
final_df.to_csv('../output/final_df.csv')