#### **Public Support channel data cleaning**

**Install and import libraries**

In [1]:
%%capture
pip install numpy pandas

In [2]:
import numpy as np
import pandas as pd

**Load the data**

In [3]:
df_slack = pd.read_csv('../sources/support_channels.csv')

**Basic information about the data**

In [4]:
print('Shape of the dataframe before cleaning:', df_slack.shape)

Shape of the dataframe before cleaning: (500, 14)


In [5]:
df_slack.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Channel_ID        500 non-null    object
 1   Channel_Slug      500 non-null    object
 2   Timestamp         500 non-null    object
 3   Timestamp_Thread  337 non-null    object
 4   User_ID           500 non-null    object
 5   Full_Name         498 non-null    object
 6   Email             500 non-null    object
 7   Permalink         500 non-null    object
 8   Text              500 non-null    object
 9   Text_raw          491 non-null    object
 10  Slack_username    500 non-null    object
 11  Team_ID           500 non-null    object
 12  Team_Name         500 non-null    object
 13  Is_Bot            500 non-null    bool  
dtypes: bool(1), object(13)
memory usage: 51.4+ KB


In [6]:
df_slack.sample(5)

Unnamed: 0,Channel_ID,Channel_Slug,Timestamp,Timestamp_Thread,User_ID,Full_Name,Email,Permalink,Text,Text_raw,Slack_username,Team_ID,Team_Name,Is_Bot
291,CAZ9W99U4,public-support-full-stack,11/26/2022 2:52:16,11/26/2022 2:42:32,U04B00WBB7B,Francisca Morales Moore,fraanmoore@gmail.com,https://4geeksacademy.slack.com/archives/CAZ9W...,@dmoret17 cómo se haría eso?,<@U03RFM3C7DL> cómo se haría eso?,fraanmoore,T0BFXMWMV,4Geeks Academy,False
64,CAZ9W99U4,public-support-full-stack,11/11/2022 16:12:26,,U03A7FCVDMW,Richard Jardine,rhjardine@gmail.com,https://4geeksacademy.slack.com/archives/CAZ9W...,"Hello. Please, I require guidance regarding th...","Hello. Please, I require guidance regarding th...",rhjardine,T0BFXMWMV,4Geeks Academy,False
189,CAZ9W99U4,public-support-full-stack,11/19/2022 18:35:44,,U042JTZUFAN,Carlos Velazquez,95.carlos.velazquez@gmail.com,https://4geeksacademy.slack.com/archives/CAZ9W...,Buenas tardes!!\n\nHe estado practicando los e...,Buenas tardes!!\n\nHe estado practicando los e...,95.carlos.velazquez,T0BFXMWMV,4Geeks Academy,False
92,CAZ9W99U4,public-support-full-stack,12/3/2022 2:44:06,12/2/2022 20:13:07,U03F8RRA9TR,Bairon Navia,bairon97@icloud.com,https://4geeksacademy.slack.com/archives/CAZ9W...,tengo que pedir una mentoria para esto ?,tengo que pedir una mentoria para esto ?,bairon97,T0BFXMWMV,4Geeks Academy,False
67,CAZ9W99U4,public-support-full-stack,11/3/2022 14:50:07,11/3/2022 5:48:56,U03ACPHUXL7,Tyson Roussel,tysonr0319@gmail.com,https://4geeksacademy.slack.com/archives/CAZ9W...,"Hey, did you double check that you have both c...","Hey, did you double check that you have both c...",tysonr0319,T0BFXMWMV,4Geeks Academy,False


In [7]:
df_slack['Channel_ID'].value_counts()

CAZ9W99U4    500
Name: Channel_ID, dtype: int64

In [8]:
df_slack['Channel_Slug'].value_counts()

public-support-full-stack    500
Name: Channel_Slug, dtype: int64

In [9]:
df_slack['Team_ID'].value_counts()

T0BFXMWMV    500
Name: Team_ID, dtype: int64

In [10]:
df_slack['Team_Name'].value_counts()

4Geeks Academy    500
Name: Team_Name, dtype: int64

In [11]:
# Estas cuatro columnas no aportan información, por lo tanto considero apropiado eliminarlas.

df_slack.drop(columns=['Channel_ID', 'Channel_Slug', 'Team_ID', 'Team_Name'], inplace=True)

**Create, encode and transform columns**

In [12]:
df_slack['Is_a_question'] = np.where(df_slack['Timestamp_Thread'].isnull(), 1, 0)

In [13]:
# support_agent_ids = ['1','5301'] 
# No coincide el ID, por lo menos en el caso de Alejandro es UL08NNSV8
# support_agent_names = ['Alejandro Sanchez','Tomas Gonzalez']
# Es posible que a futuro se sumen otras personas con igual nombre. Considero más seguro usar el mail como referencia

support_agent_emails = ['aalejo@gmail.com', 'tgonzalez@4geeksacademy.com']

df_slack['Is_agent'] = np.where(df_slack['Email'].isin(support_agent_emails), 1, 0)

In [14]:
df_slack['Is_Bot'] = np.where(df_slack['Is_Bot'] == True, 1, 0)

In [15]:
df_slack['Datetime'] = pd.to_datetime(df_slack['Timestamp'])
df_slack['Datetime_Thread'] = pd.to_datetime(df_slack['Timestamp_Thread'])

**Create a dataframe for questions and another one for answers**

In [16]:
Q_df = df_slack[df_slack['Is_a_question'] == 1]
A_df = df_slack[df_slack['Is_a_question'] == 0]

**Perform transformations on the Questions dataframe**

In [17]:
questions = Q_df.groupby(['User_ID','Datetime'])[['Text']]

df_questions = pd.DataFrame(questions.sum().reset_index())

In [18]:
df_questions.head()

Unnamed: 0,User_ID,Datetime,Text
0,U01J1EA8JN7,2022-12-07 14:42:41,Gente alguien sabe como se le puede agregar a ...
1,U01SJ480RBR,2022-11-07 16:59:39,Hello
2,U01SM5J4MMG,2022-11-07 21:15:26,"Buenas tardes, intenta poniendo solo git push"
3,U026XFJHQPK,2022-12-14 13:34:18,"Hello, good afternoon for the people in europe..."
4,U02G5B470B1,2022-11-15 19:07:30,"Hello, where do I found the webpacks options a..."


In [19]:
df_questions.shape

(162, 3)

In [20]:
df_questions['Diff_in_seconds'] = (df_questions.sort_values('Datetime').groupby('User_ID').Datetime.diff())

In [21]:
df_questions['Diff_in_seconds'] = df_questions['Diff_in_seconds'].fillna(pd.Timedelta(seconds=0))

In [22]:
df_questions['Diff_in_seconds']=df_questions['Diff_in_seconds']/np.timedelta64(1,'s')

In [23]:
df_questions['Diff_abs'] = df_questions['Diff_in_seconds'].abs()

In [24]:
df_questions['Not_previous_author'] = df_questions['User_ID'].ne(df_questions['User_ID'].shift().bfill()).astype(int)

In [25]:
def create_QuestionId(df):
    for group in df.groupby(['User_ID']):
        df['Q_message_ID'] = df['Diff_abs'].gt(300).cumsum() + 1 + df['Not_previous_author'].cumsum()
    return df

create_QuestionId(df_questions)

  for group in df.groupby(['User_ID']):


Unnamed: 0,User_ID,Datetime,Text,Diff_in_seconds,Diff_abs,Not_previous_author,Q_message_ID
0,U01J1EA8JN7,2022-12-07 14:42:41,Gente alguien sabe como se le puede agregar a ...,0.0,0.0,0,1
1,U01SJ480RBR,2022-11-07 16:59:39,Hello,0.0,0.0,1,2
2,U01SM5J4MMG,2022-11-07 21:15:26,"Buenas tardes, intenta poniendo solo git push",0.0,0.0,1,3
3,U026XFJHQPK,2022-12-14 13:34:18,"Hello, good afternoon for the people in europe...",0.0,0.0,1,4
4,U02G5B470B1,2022-11-15 19:07:30,"Hello, where do I found the webpacks options a...",0.0,0.0,1,5
...,...,...,...,...,...,...,...
157,U04GCSDB561,2023-01-02 17:54:14,Hola buenas tardes a todos. Estoy en el primer...,263262.0,263262.0,0,135
158,U04GCSDB561,2023-01-03 15:35:26,"Hola, buenos dias. Estoy tratando de que mis i...",78072.0,78072.0,0,136
159,U04GEJB6GBX,2023-01-02 07:47:22,Hola! Estoy teniendo problemas para hacer un c...,0.0,0.0,1,137
160,U6MR8LG4Q,2022-11-05 03:11:30,Hey @betomasia12 No need to make the pictures ...,0.0,0.0,1,138


In [26]:
df_questions.shape

(162, 7)

In [27]:
# Merge the dataframe to its previous columns
df_questions = df_questions.merge(Q_df, how = 'left', left_on = ['User_ID', 'Datetime', 'Text'],
    right_on = ['User_ID', 'Datetime', 'Text']).drop(['Diff_in_seconds','Diff_abs','Not_previous_author','Text_raw'], axis=1)

In [28]:
df_questions.shape

(162, 14)

In [29]:
# Example before merging text
df_questions[df_questions['Q_message_ID']==131]

Unnamed: 0,User_ID,Datetime,Text,Q_message_ID,Timestamp,Timestamp_Thread,Full_Name,Email,Permalink,Slack_username,Is_Bot,Is_a_question,Is_agent,Datetime_Thread
150,U04F75G4970,2022-12-27 20:28:37,"Hola buenas, alguien me podría ayudar? Al agre...",131,12/27/2022 20:28:37,,Ignacio,ignacio.damanes@gmail.com,https://4geeksacademy.slack.com/archives/CAZ9W...,ignacio.damanes,0.0,1.0,0.0,NaT
151,U04F75G4970,2022-12-27 20:28:44,image.png,131,12/27/2022 20:28:44,,Ignacio,ignacio.damanes@gmail.com,https://4geeksacademy.slack.com/archives/CAZ9W...,ignacio.damanes,0.0,1.0,0.0,NaT
152,U04F75G4970,2022-12-27 20:29:15,image.png,131,12/27/2022 20:29:15,,Ignacio,ignacio.damanes@gmail.com,https://4geeksacademy.slack.com/archives/CAZ9W...,ignacio.damanes,0.0,1.0,0.0,NaT
153,U04F75G4970,2022-12-27 20:29:19,Gracias,131,12/27/2022 20:29:19,,Ignacio,ignacio.damanes@gmail.com,https://4geeksacademy.slack.com/archives/CAZ9W...,ignacio.damanes,0.0,1.0,0.0,NaT


In [30]:
# Merge text in rows that have the same Q_message_ID
df_questions['Text'] = df_questions.groupby(['Q_message_ID'])['Text'].transform(lambda x : ' '.join(x))

In [31]:
# Example after merging text
df_questions[df_questions['Q_message_ID']==131]

Unnamed: 0,User_ID,Datetime,Text,Q_message_ID,Timestamp,Timestamp_Thread,Full_Name,Email,Permalink,Slack_username,Is_Bot,Is_a_question,Is_agent,Datetime_Thread
150,U04F75G4970,2022-12-27 20:28:37,"Hola buenas, alguien me podría ayudar? Al agre...",131,12/27/2022 20:28:37,,Ignacio,ignacio.damanes@gmail.com,https://4geeksacademy.slack.com/archives/CAZ9W...,ignacio.damanes,0.0,1.0,0.0,NaT
151,U04F75G4970,2022-12-27 20:28:44,"Hola buenas, alguien me podría ayudar? Al agre...",131,12/27/2022 20:28:44,,Ignacio,ignacio.damanes@gmail.com,https://4geeksacademy.slack.com/archives/CAZ9W...,ignacio.damanes,0.0,1.0,0.0,NaT
152,U04F75G4970,2022-12-27 20:29:15,"Hola buenas, alguien me podría ayudar? Al agre...",131,12/27/2022 20:29:15,,Ignacio,ignacio.damanes@gmail.com,https://4geeksacademy.slack.com/archives/CAZ9W...,ignacio.damanes,0.0,1.0,0.0,NaT
153,U04F75G4970,2022-12-27 20:29:19,"Hola buenas, alguien me podría ayudar? Al agre...",131,12/27/2022 20:29:19,,Ignacio,ignacio.damanes@gmail.com,https://4geeksacademy.slack.com/archives/CAZ9W...,ignacio.damanes,0.0,1.0,0.0,NaT


In [32]:
# Delete empty columns
df_questions.dropna(axis=1, how='all', inplace=True)

In [33]:
df_questions.shape

(162, 12)

In [34]:
# Merge timestamp in rows that have the same Q_message_ID
df_questions['Timestamp'] = df_questions.groupby(['Q_message_ID'])['Timestamp'].transform(lambda x : ','.join(map(str, x)))

In [35]:
# Example after merging timestamp
df_questions[df_questions['Q_message_ID']==131]

Unnamed: 0,User_ID,Datetime,Text,Q_message_ID,Timestamp,Full_Name,Email,Permalink,Slack_username,Is_Bot,Is_a_question,Is_agent
150,U04F75G4970,2022-12-27 20:28:37,"Hola buenas, alguien me podría ayudar? Al agre...",131,"12/27/2022 20:28:37,12/27/2022 20:28:44,12/27/...",Ignacio,ignacio.damanes@gmail.com,https://4geeksacademy.slack.com/archives/CAZ9W...,ignacio.damanes,0.0,1.0,0.0
151,U04F75G4970,2022-12-27 20:28:44,"Hola buenas, alguien me podría ayudar? Al agre...",131,"12/27/2022 20:28:37,12/27/2022 20:28:44,12/27/...",Ignacio,ignacio.damanes@gmail.com,https://4geeksacademy.slack.com/archives/CAZ9W...,ignacio.damanes,0.0,1.0,0.0
152,U04F75G4970,2022-12-27 20:29:15,"Hola buenas, alguien me podría ayudar? Al agre...",131,"12/27/2022 20:28:37,12/27/2022 20:28:44,12/27/...",Ignacio,ignacio.damanes@gmail.com,https://4geeksacademy.slack.com/archives/CAZ9W...,ignacio.damanes,0.0,1.0,0.0
153,U04F75G4970,2022-12-27 20:29:19,"Hola buenas, alguien me podría ayudar? Al agre...",131,"12/27/2022 20:28:37,12/27/2022 20:28:44,12/27/...",Ignacio,ignacio.damanes@gmail.com,https://4geeksacademy.slack.com/archives/CAZ9W...,ignacio.damanes,0.0,1.0,0.0


In [36]:
# Rename columns to make it explicit that they correspond to questions
df_questions.rename(columns={'User_ID':'Q_User_ID', 'Datetime':'Q_Datetime', 'Text':'Q_Text', 
                            'Timestamp':'Q_Timestamp', 'Full_Name':'Q_Full_Name', 'Email':'Q_Email',
                            'Permalink':'Q_Permalink', 'Slack_username':'Q_Slack_username', 
                            'Is_agent':'Q_from_Agent'},inplace=True)

In [37]:
# Drop duplicates
df_questions.drop_duplicates(subset=["Q_Timestamp", "Q_Text"], keep='first', inplace=True)

In [38]:
df_questions.shape

(139, 12)

**Perform transformations on the Answers dataframe**

In [39]:
answers = A_df.groupby(['User_ID','Datetime'])[['Text']]

df_answers = pd.DataFrame(answers.sum().reset_index())

In [40]:
df_answers.head()

Unnamed: 0,User_ID,Datetime,Text
0,U015XDM7KRA,2022-11-25 14:32:52,no te sirve redux o el context api ? por que f...
1,U01GE2A7HV4,2022-12-09 22:13:54,The statement indicates that you must give val...
2,U01GE2A7HV4,2022-12-09 22:19:48,Hint: In line 18 you have the element that you...
3,U01PE4CNXN1,2022-11-30 11:33:40,"Yo te recomiendo netifly, es muy fácil de usar"
4,U01PE4CNXN1,2022-11-30 11:39:41,"Creo que sí, pero ahora mismo me haces dudar..."


In [41]:
df_answers.shape

(337, 3)

In [42]:
df_answers['Diff_in_seconds'] = (df_answers.sort_values('Datetime').groupby('User_ID').Datetime.diff())

In [43]:
df_answers['Diff_in_seconds'] = df_answers['Diff_in_seconds'].fillna(pd.Timedelta(seconds=0))

In [44]:
df_answers['Diff_in_seconds']=df_answers['Diff_in_seconds']/np.timedelta64(1,'s')

In [45]:
df_answers['Diff_abs'] = df_answers['Diff_in_seconds'].abs()

In [46]:
df_answers['Not_previous_author'] = df_answers['User_ID'].ne(df_answers['User_ID'].shift().bfill()).astype(int)

In [47]:
def create_AnswerId(df):
    for group in df.groupby(['User_ID']):
        df['A_message_ID'] = df['Diff_abs'].gt(300).cumsum() + 1 + df['Not_previous_author'].cumsum()
    return df

create_AnswerId(df_answers)

  for group in df.groupby(['User_ID']):


Unnamed: 0,User_ID,Datetime,Text,Diff_in_seconds,Diff_abs,Not_previous_author,A_message_ID
0,U015XDM7KRA,2022-11-25 14:32:52,no te sirve redux o el context api ? por que f...,0.0,0.0,0,1
1,U01GE2A7HV4,2022-12-09 22:13:54,The statement indicates that you must give val...,0.0,0.0,1,2
2,U01GE2A7HV4,2022-12-09 22:19:48,Hint: In line 18 you have the element that you...,354.0,354.0,0,3
3,U01PE4CNXN1,2022-11-30 11:33:40,"Yo te recomiendo netifly, es muy fácil de usar",0.0,0.0,1,4
4,U01PE4CNXN1,2022-11-30 11:39:41,"Creo que sí, pero ahora mismo me haces dudar...",361.0,361.0,0,5
...,...,...,...,...,...,...,...
332,UPRFXC3QE,2022-12-06 21:10:39,There should be a free version. Check the othe...,1133.0,1133.0,0,234
333,UPRFXC3QE,2022-12-07 16:28:05,I dont know fly that well. You would need to r...,69446.0,69446.0,0,235
334,UPRFXC3QE,2022-12-09 13:47:04,Hi. So the error is showing you do not have t...,163139.0,163139.0,0,236
335,UU409472Q,2022-11-02 11:33:54,"hay dos opciones\ne.key o e.keycode , en ambas...",0.0,0.0,1,237


In [48]:
df_answers.shape

(337, 7)

In [49]:
# Merge the dataframe to its previous columns
df_answers = df_answers.merge(A_df, how = 'left', left_on = ['User_ID', 'Datetime', 'Text'],
    right_on = ['User_ID', 'Datetime', 'Text']).drop(['Diff_in_seconds','Diff_abs','Not_previous_author','Text_raw'], axis=1)

In [50]:
df_answers['Response_time'] = df_answers['Datetime'] - df_answers['Datetime_Thread']

In [51]:
df_answers.shape

(337, 15)

In [52]:
# Example before merging text
df_answers[df_answers['A_message_ID']==7]

Unnamed: 0,User_ID,Datetime,Text,A_message_ID,Timestamp,Timestamp_Thread,Full_Name,Email,Permalink,Slack_username,Is_Bot,Is_a_question,Is_agent,Datetime_Thread,Response_time
7,U01RCAJB6ES,2022-12-04 11:26:25,Pero no lo pongas entre llaves,7,12/4/2022 11:26:25,12/4/2022 10:46:37,David Berdiell sanchez,david_berdiell@hotmail.com,https://4geeksacademy.slack.com/archives/CAZ9W...,david_berdiell,0,0,0,2022-12-04 10:46:37,0 days 00:39:48
8,U01RCAJB6ES,2022-12-04 11:29:22,element.lucky_numbers.forEach(number =&gt; { \...,7,12/4/2022 11:29:22,12/4/2022 10:46:37,David Berdiell sanchez,david_berdiell@hotmail.com,https://4geeksacademy.slack.com/archives/CAZ9W...,david_berdiell,0,0,0,2022-12-04 10:46:37,0 days 00:42:45
9,U01RCAJB6ES,2022-12-04 11:29:28,Algo así sería,7,12/4/2022 11:29:28,12/4/2022 10:46:37,David Berdiell sanchez,david_berdiell@hotmail.com,https://4geeksacademy.slack.com/archives/CAZ9W...,david_berdiell,0,0,0,2022-12-04 10:46:37,0 days 00:42:51
10,U01RCAJB6ES,2022-12-04 11:32:56,"Aún así, no se suele usar for each, usa map pa...",7,12/4/2022 11:32:56,12/4/2022 10:46:37,David Berdiell sanchez,david_berdiell@hotmail.com,https://4geeksacademy.slack.com/archives/CAZ9W...,david_berdiell,0,0,0,2022-12-04 10:46:37,0 days 00:46:19


In [53]:
# Merge text in rows that have the same A_message_ID
df_answers['Text'] = df_answers.groupby(['A_message_ID'])['Text'].transform(lambda x : ' '.join(x))

In [54]:
# Example after merging text
df_answers[df_answers['A_message_ID']==7]

Unnamed: 0,User_ID,Datetime,Text,A_message_ID,Timestamp,Timestamp_Thread,Full_Name,Email,Permalink,Slack_username,Is_Bot,Is_a_question,Is_agent,Datetime_Thread,Response_time
7,U01RCAJB6ES,2022-12-04 11:26:25,Pero no lo pongas entre llaves element.lucky_n...,7,12/4/2022 11:26:25,12/4/2022 10:46:37,David Berdiell sanchez,david_berdiell@hotmail.com,https://4geeksacademy.slack.com/archives/CAZ9W...,david_berdiell,0,0,0,2022-12-04 10:46:37,0 days 00:39:48
8,U01RCAJB6ES,2022-12-04 11:29:22,Pero no lo pongas entre llaves element.lucky_n...,7,12/4/2022 11:29:22,12/4/2022 10:46:37,David Berdiell sanchez,david_berdiell@hotmail.com,https://4geeksacademy.slack.com/archives/CAZ9W...,david_berdiell,0,0,0,2022-12-04 10:46:37,0 days 00:42:45
9,U01RCAJB6ES,2022-12-04 11:29:28,Pero no lo pongas entre llaves element.lucky_n...,7,12/4/2022 11:29:28,12/4/2022 10:46:37,David Berdiell sanchez,david_berdiell@hotmail.com,https://4geeksacademy.slack.com/archives/CAZ9W...,david_berdiell,0,0,0,2022-12-04 10:46:37,0 days 00:42:51
10,U01RCAJB6ES,2022-12-04 11:32:56,Pero no lo pongas entre llaves element.lucky_n...,7,12/4/2022 11:32:56,12/4/2022 10:46:37,David Berdiell sanchez,david_berdiell@hotmail.com,https://4geeksacademy.slack.com/archives/CAZ9W...,david_berdiell,0,0,0,2022-12-04 10:46:37,0 days 00:46:19


In [55]:
# Merge timestamp in rows that have the same A_message_ID
df_answers['Timestamp'] = df_answers.groupby(['A_message_ID'])['Timestamp'].transform(lambda x : ','.join(map(str, x)))

In [56]:
# Example after merging timestamp
df_answers[df_answers['A_message_ID']==7]

Unnamed: 0,User_ID,Datetime,Text,A_message_ID,Timestamp,Timestamp_Thread,Full_Name,Email,Permalink,Slack_username,Is_Bot,Is_a_question,Is_agent,Datetime_Thread,Response_time
7,U01RCAJB6ES,2022-12-04 11:26:25,Pero no lo pongas entre llaves element.lucky_n...,7,"12/4/2022 11:26:25,12/4/2022 11:29:22,12/4/202...",12/4/2022 10:46:37,David Berdiell sanchez,david_berdiell@hotmail.com,https://4geeksacademy.slack.com/archives/CAZ9W...,david_berdiell,0,0,0,2022-12-04 10:46:37,0 days 00:39:48
8,U01RCAJB6ES,2022-12-04 11:29:22,Pero no lo pongas entre llaves element.lucky_n...,7,"12/4/2022 11:26:25,12/4/2022 11:29:22,12/4/202...",12/4/2022 10:46:37,David Berdiell sanchez,david_berdiell@hotmail.com,https://4geeksacademy.slack.com/archives/CAZ9W...,david_berdiell,0,0,0,2022-12-04 10:46:37,0 days 00:42:45
9,U01RCAJB6ES,2022-12-04 11:29:28,Pero no lo pongas entre llaves element.lucky_n...,7,"12/4/2022 11:26:25,12/4/2022 11:29:22,12/4/202...",12/4/2022 10:46:37,David Berdiell sanchez,david_berdiell@hotmail.com,https://4geeksacademy.slack.com/archives/CAZ9W...,david_berdiell,0,0,0,2022-12-04 10:46:37,0 days 00:42:51
10,U01RCAJB6ES,2022-12-04 11:32:56,Pero no lo pongas entre llaves element.lucky_n...,7,"12/4/2022 11:26:25,12/4/2022 11:29:22,12/4/202...",12/4/2022 10:46:37,David Berdiell sanchez,david_berdiell@hotmail.com,https://4geeksacademy.slack.com/archives/CAZ9W...,david_berdiell,0,0,0,2022-12-04 10:46:37,0 days 00:46:19


In [57]:
# Rename columns to make it explicit that they correspond to answers
df_answers.rename(columns={'User_ID':'A_User_ID', 'Datetime':'A_Datetime', 'Text':'A_Text', 'Timestamp':'A_Timestamp', 
                            'Timestamp_Thread':'Key_to_Q_Timestamp', 'Full_Name':'A_Full_Name', 'Email':'A_Email', 
                            'Permalink':'A_Permalink', 'Slack_username':'A_Slack_username', 'Is_agent':'A_from_Agent',
                            'Datetime_Thread':'A_Datetime_Thread'},inplace=True)

In [58]:
df_answers = df_answers.drop_duplicates(subset=['A_Timestamp', 'A_Text'], keep='first')

In [59]:
df_answers.shape

(237, 15)

**Combine both dataframes**

In [60]:
question_ids_list = df_questions['Q_Timestamp'].tolist()

In [61]:
def id_autocompletion(search):
    for id in question_ids_list:
        if search in id:
            return id
    
    return None

In [62]:
df_answers['Key_to_Q_Timestamp'] =  df_answers['Key_to_Q_Timestamp'].apply(id_autocompletion)

In [63]:
df_answers['Key_to_Q_Timestamp'] =  np.where(df_answers['Key_to_Q_Timestamp'].isnull(), str(df_answers['A_Datetime_Thread']), df_answers['Key_to_Q_Timestamp'])

In [64]:
final_df = pd.merge(df_questions, df_answers, left_on = ['Q_Timestamp'], right_on = ['Key_to_Q_Timestamp'], how = 'left')

In [65]:
final_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 214 entries, 0 to 213
Data columns (total 27 columns):
 #   Column              Non-Null Count  Dtype          
---  ------              --------------  -----          
 0   Q_User_ID           214 non-null    object         
 1   Q_Datetime          214 non-null    datetime64[ns] 
 2   Q_Text              214 non-null    object         
 3   Q_message_ID        214 non-null    int64          
 4   Q_Timestamp         214 non-null    object         
 5   Q_Full_Name         213 non-null    object         
 6   Q_Email             213 non-null    object         
 7   Q_Permalink         213 non-null    object         
 8   Q_Slack_username    213 non-null    object         
 9   Is_Bot_x            213 non-null    float64        
 10  Is_a_question_x     213 non-null    float64        
 11  Q_from_Agent        213 non-null    float64        
 12  A_User_ID           150 non-null    object         
 13  A_Datetime          150 non-null   

In [66]:
final_df.head()

Unnamed: 0,Q_User_ID,Q_Datetime,Q_Text,Q_message_ID,Q_Timestamp,Q_Full_Name,Q_Email,Q_Permalink,Q_Slack_username,Is_Bot_x,...,Key_to_Q_Timestamp,A_Full_Name,A_Email,A_Permalink,A_Slack_username,Is_Bot_y,Is_a_question_y,A_from_Agent,A_Datetime_Thread,Response_time
0,U01J1EA8JN7,2022-12-07 14:42:41,Gente alguien sabe como se le puede agregar a ...,1,12/7/2022 14:42:41,LucasVY,Lucas.varasy@gmail.com,https://4geeksacademy.slack.com/archives/CAZ9W...,lucas.varasy,0.0,...,,,,,,,,,NaT,NaT
1,U01SJ480RBR,2022-11-07 16:59:39,Hello,2,11/7/2022 16:59:39,Maikol Moreira,Niukeitor@hotmail.com,https://4geeksacademy.slack.com/archives/CAZ9W...,niukeitor,0.0,...,,,,,,,,,NaT,NaT
2,U01SM5J4MMG,2022-11-07 21:15:26,"Buenas tardes, intenta poniendo solo git push",3,11/7/2022 21:15:26,Alvaro Javier Chagas Capurro,alvarojavierchagas@hotmail.com,https://4geeksacademy.slack.com/archives/CAZ9W...,alvarojavierchagas,0.0,...,11/7/2022 21:15:26,Alvaro Javier Chagas Capurro,alvarojavierchagas@hotmail.com,https://4geeksacademy.slack.com/archives/CAZ9W...,alvarojavierchagas,0.0,0.0,0.0,2022-11-07 21:15:26,0 days 16:54:13
3,U026XFJHQPK,2022-12-14 13:34:18,"Hello, good afternoon for the people in europe...",4,12/14/2022 13:34:18,Nicola Martinez Clemente,nicoenrique_1994@hotmail.com,https://4geeksacademy.slack.com/archives/CAZ9W...,nicoenrique_1994,0.0,...,,,,,,,,,NaT,NaT
4,U02G5B470B1,2022-11-15 19:07:30,"Hello, where do I found the webpacks options a...",5,11/15/2022 19:07:30,Facundo Gul dos Santos,facundogds@gmail.com,https://4geeksacademy.slack.com/archives/CAZ9W...,facundogds,0.0,...,11/15/2022 19:07:30,Facundo Gul dos Santos,facundogds@gmail.com,https://4geeksacademy.slack.com/archives/CAZ9W...,facundogds,0.0,0.0,0.0,2022-11-15 19:07:30,0 days 00:20:48


In [67]:
final_df.to_csv('../outputs/final_df.csv')