# Cleaning Public Support data

## Importing necessary libraries

In [1]:
pip install pandas matplotlib seaborn wordcloud

Collecting pandas
  Downloading pandas-1.5.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.2/12.2 MB[0m [31m95.6 MB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0m
[?25hCollecting matplotlib
  Downloading matplotlib-3.6.2-cp38-cp38-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (9.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.4/9.4 MB[0m [31m124.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting seaborn
  Downloading seaborn-0.12.1-py3-none-any.whl (288 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m288.2/288.2 kB[0m [31m36.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting wordcloud
  Downloading wordcloud-1.8.2.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (458 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m459.0/459.0 kB[0m [31m40.6 MB/s[0m eta [36m0:00:00[0m
Collecting numpy>=1.20.3
  Downloading n

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime,date, timedelta

## Loading the data

In [3]:
slack = pd.read_csv('../sources/support-channels.csv')


## Discover

In [4]:
print('Shape of slack dataframe before cleaning:', slack.shape)

Shape of slack dataframe before cleaning: (481, 14)


In [5]:
slack.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 481 entries, 0 to 480
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Channel_ID        481 non-null    object
 1   Channel_Slug      481 non-null    object
 2   Timestamp         481 non-null    object
 3   Timestamp_Thread  368 non-null    object
 4   User_ID           481 non-null    object
 5   Full_Name         470 non-null    object
 6   Email             481 non-null    object
 7   Permalink         481 non-null    object
 8   Text              481 non-null    object
 9   Text_raw          481 non-null    object
 10  Slack_username    481 non-null    object
 11  Team_ID           481 non-null    object
 12  Team_Name         481 non-null    object
 13  Is_Bot            481 non-null    bool  
dtypes: bool(1), object(13)
memory usage: 49.4+ KB


**Creating 2 new columns**

In [6]:
slack['Is_a_question'] = np.where(slack['Timestamp_Thread'].isnull(), 1, 0)

In [7]:
support_agents = ['1','5301']

slack['Is_agent'] = np.where(slack['User_ID'].isin(support_agents), 1, 0)

**Encoding necessary columns**

In [8]:
slack['Is_Bot'] = np.where(slack['Is_Bot'] == True, 1, 0)

**Converting timestamp columns**

In [9]:
slack['Datetime'] = pd.to_datetime(slack['Timestamp'])
slack['Datetime_Thread'] = pd.to_datetime(slack['Timestamp_Thread'])

**Creating 2 dataframes: questions and answers**

In [10]:
questions_df = slack[slack['Is_a_question'] == 1]
answers_df = slack[slack['Is_a_question'] == 0]

**Working on Answers dataframe**

In [11]:
answers = answers_df.groupby(['User_ID','Datetime'])[['Text']]

In [12]:
df2 = pd.DataFrame(answers.sum().reset_index())

In [13]:
df2.head()

Unnamed: 0,User_ID,Datetime,Text
0,U01KGAER1TM,2022-11-04 17:02:51,No se quiere usar un tercero para las fotos
1,U01KGAER1TM,2022-11-04 17:04:57,digamos que son imagenes de usuarios en donde ...
2,U01KGAER1TM,2022-11-04 17:05:02,comentarios y likes
3,U01KGAER1TM,2022-11-04 17:06:30,y la base de datos no solo contiene las imagen...
4,U01KGAER1TM,2022-11-04 17:26:59,https://isn365.com/


In [14]:
df2['Diff_in_Seconds'] = (df2.sort_values('Datetime').groupby('User_ID').Datetime.diff())

In [15]:
df2['Diff_in_Seconds'] = df2['Diff_in_Seconds'].fillna(pd.Timedelta(seconds=0))

In [16]:
df2['Diff_in_Seconds'] = df2['Diff_in_Seconds']/np.timedelta64(1,'s')

In [17]:
df2['diff_abs'] = df2.Diff_in_Seconds.abs()

In [18]:
df2['same_author'] = df2['User_ID'].ne(df2['User_ID'].shift().bfill()).astype(int)

In [19]:
def create_AnswerId(df):
    for group in df.groupby(['User_ID']):
        df['messageId'] = df.diff_abs.gt(300).cumsum() + 1 + df.same_author.cumsum()
    return df
create_AnswerId(df2)

  for group in df.groupby(['User_ID']):


Unnamed: 0,User_ID,Datetime,Text,Diff_in_Seconds,diff_abs,same_author,messageId
0,U01KGAER1TM,2022-11-04 17:02:51,No se quiere usar un tercero para las fotos,0.0,0.0,0,1
1,U01KGAER1TM,2022-11-04 17:04:57,digamos que son imagenes de usuarios en donde ...,126.0,126.0,0,1
2,U01KGAER1TM,2022-11-04 17:05:02,comentarios y likes,5.0,5.0,0,1
3,U01KGAER1TM,2022-11-04 17:06:30,y la base de datos no solo contiene las imagen...,88.0,88.0,0,1
4,U01KGAER1TM,2022-11-04 17:26:59,https://isn365.com/,1229.0,1229.0,0,2
...,...,...,...,...,...,...,...
363,UU409472Q,2022-11-11 00:32:33,No te preocupes no es algo que afecte,632178.0,632178.0,0,190
364,UU409472Q,2022-11-11 01:48:09,https://www.notion.so/4geeksacademy/How-to-del...,4536.0,4536.0,0,191
365,UU409472Q,2022-11-11 14:33:32,"Meri , sigue estos pasos https://www.notion.so...",45923.0,45923.0,0,192
366,UU409472Q,2022-11-11 15:27:50,:anguished: intenta ir al dashboard a ver,3258.0,3258.0,0,193


In [20]:
df2.shape

(368, 7)

In [21]:
answers_df.shape

(368, 18)

**Questions dataframe**

In [22]:
questions = questions_df.groupby(['User_ID','Datetime'])[['Text']]

In [23]:
df3 = pd.DataFrame(questions.sum().reset_index())

In [24]:
df3.head()

Unnamed: 0,User_ID,Datetime,Text
0,U01KGAER1TM,2022-11-04 16:14:32,"Como estan, tengo un problema, ocupo subir una..."
1,U01SJ480RBR,2022-11-07 16:59:39,Hello
2,U01SM5J4MMG,2022-11-07 21:15:26,"Buenas tardes, intenta poniendo solo git push"
3,U02N1P8CV6W,2022-10-31 22:55:36,"Hola chicos, he estado intentando llevar a pro..."
4,U02NE11UHNC,2022-11-03 22:28:10,have someone imported custom fonts (no google ...


In [25]:
df3.shape

(112, 3)

In [26]:
df3['Diff_in_Seconds'] = (df3.sort_values('Datetime').groupby('User_ID').Datetime.diff())

In [27]:
df3['Diff_in_Seconds'] = df3['Diff_in_Seconds'].fillna(pd.Timedelta(seconds=0))

In [28]:
df3['Diff_in_Seconds']=df3['Diff_in_Seconds']/np.timedelta64(1,'s')

In [29]:
df3['diff_abs'] = df3.Diff_in_Seconds.abs()

In [30]:
df3['same_author'] = df3['User_ID'].ne(df3['User_ID'].shift().bfill()).astype(int)

In [31]:
def create_QuestionId(df):
    for group in df.groupby(['User_ID']):
        df['messageId'] = df['diff_abs'].gt(300).cumsum() + 1 + df.same_author.cumsum()
    return df

create_QuestionId(df3)

  for group in df.groupby(['User_ID']):


Unnamed: 0,User_ID,Datetime,Text,Diff_in_Seconds,diff_abs,same_author,messageId
0,U01KGAER1TM,2022-11-04 16:14:32,"Como estan, tengo un problema, ocupo subir una...",0.0,0.0,0,1
1,U01SJ480RBR,2022-11-07 16:59:39,Hello,0.0,0.0,1,2
2,U01SM5J4MMG,2022-11-07 21:15:26,"Buenas tardes, intenta poniendo solo git push",0.0,0.0,1,3
3,U02N1P8CV6W,2022-10-31 22:55:36,"Hola chicos, he estado intentando llevar a pro...",0.0,0.0,1,4
4,U02NE11UHNC,2022-11-03 22:28:10,have someone imported custom fonts (no google ...,0.0,0.0,1,5
...,...,...,...,...,...,...,...
107,U04A6KV066M,2022-11-08 20:08:45,"Hi all,",0.0,0.0,1,89
108,U04A6KV066M,2022-11-08 20:13:24,Hi all. I am trying to split my view to where ...,279.0,279.0,0,89
109,U04A6KV066M,2022-11-08 21:58:27,Should we be doing anything with “commit” and ...,6303.0,6303.0,0,90
110,U6MR8LG4Q,2022-11-05 03:11:30,Hey @betomasia12 No need to make the pictures ...,0.0,0.0,1,91


In [32]:
df3.shape

(112, 7)

In [33]:
df3.head(5)

Unnamed: 0,User_ID,Datetime,Text,Diff_in_Seconds,diff_abs,same_author,messageId
0,U01KGAER1TM,2022-11-04 16:14:32,"Como estan, tengo un problema, ocupo subir una...",0.0,0.0,0,1
1,U01SJ480RBR,2022-11-07 16:59:39,Hello,0.0,0.0,1,2
2,U01SM5J4MMG,2022-11-07 21:15:26,"Buenas tardes, intenta poniendo solo git push",0.0,0.0,1,3
3,U02N1P8CV6W,2022-10-31 22:55:36,"Hola chicos, he estado intentando llevar a pro...",0.0,0.0,1,4
4,U02NE11UHNC,2022-11-03 22:28:10,have someone imported custom fonts (no google ...,0.0,0.0,1,5


**Next steps**

In [34]:
#next steps: 
# 
# 1.verify in questions_df that they match exactly with df3 (as shape has chenged by 1 row)---->randomly verify some questions
# 2.merge df2 oand df3 to some of the columns of its previous df. (https://stackoverflow.com/questions/17978133/python-pandas-merge-only-certain-columns)
# 3. concat text values according to this:
# https://www.statology.org/pandas-combine-rows-with-same-column-value/
# https://stackoverflow.com/questions/27298178/concatenate-strings-from-several-rows-using-pandas-groupby
# 4. download both datasets and take to Looker to verify they are ok to build graphs

**Merge each dataframe to its previous columns**

In [35]:
df2 = df2.merge(answers_df, how = 'left', left_on = ['User_ID', 'Datetime', 'Text'],
    right_on = ['User_ID', 'Datetime', 'Text']).drop(['Diff_in_Seconds','diff_abs','same_author','Text_raw'], axis=1)

In [36]:
df2['response_time'] = df2['Datetime'] - df2['Datetime_Thread']

In [37]:
df2.shape

(368, 19)

In [38]:
df3 = df3.merge(questions_df, how = 'left', left_on = ['User_ID', 'Datetime', 'Text'],
    right_on = ['User_ID', 'Datetime', 'Text']).drop(['Diff_in_Seconds','diff_abs','same_author','Text_raw'], axis=1)

In [39]:
df3.shape

(112, 18)

**Merge text and timestamps in rows that have the same messageId**

In [40]:
df2['Text'] = df2.groupby(['messageId'])['Text'].transform(lambda x : ' '.join(x))
df3['Text'] = df3.groupby(['messageId'])['Text'].transform(lambda x : ' '.join(x))

In [41]:
df2['Timestamp'] = df2.groupby(['messageId'])['Timestamp'].transform(lambda x : ','.join(x))
df2['Timestamp_Thread'] = df2.groupby(['messageId'])['Timestamp_Thread'].transform(lambda x : ','.join(x))
df3.dropna(axis=1, how='all', inplace=True)
df3['Timestamp'] = df3.groupby(['messageId'])['Timestamp'].transform(lambda x : ','.join(map(str, x)))

In [42]:
#rename to ids in both dataframes
df2.rename(columns={"Timestamp": "answer_id", "Timestamp_Thread": "question_id"}, inplace=True)
df3.rename(columns={"Timestamp": "question_id"}, inplace=True)

In [43]:
df2 = df2.drop_duplicates(subset=["Text", "answer_id", "question_id"],keep='first')
df3 = df3.drop_duplicates(subset=["Text","question_id"],keep='first')

In [44]:
#create a list from df3 question_id column
question_ids = df3['question_id'].tolist()

In [None]:
correct_question_id = []

for row in df2.itertuples(index=False):
    if re.match(str(row.question_id), question_ids):
    #if str(row.question_id) in question_ids:  
        id_search = str(row.question_id)
        id_match = list(filter(lambda x: id_search in x, question_ids))
        correct_question_id.append(id_match)

In [45]:
#create a dictionary from ts?thread in answers
#if doesnt work, create a list and use matching code

#https://stackoverflow.com/questions/49902412/replace-values-in-dataframe-column-if-second-column-matches-a-given-list-pandas