# Cleaning Public Support data

## Importing necessary libraries

In [1]:
pip install pandas matplotlib seaborn wordcloud


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.2.2[0m[39;49m -> [0m[32;49m22.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime,date, timedelta

## Loading the data

In [3]:
slack = pd.read_csv('../sources/support-channels.csv')


## Discover

In [4]:
print('Shape of slack dataframe before cleaning:', slack.shape)

Shape of slack dataframe before cleaning: (481, 14)


In [5]:
slack.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 481 entries, 0 to 480
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Channel_ID        481 non-null    object
 1   Channel_Slug      481 non-null    object
 2   Timestamp         481 non-null    object
 3   Timestamp_Thread  368 non-null    object
 4   User_ID           481 non-null    object
 5   Full_Name         470 non-null    object
 6   Email             481 non-null    object
 7   Permalink         481 non-null    object
 8   Text              481 non-null    object
 9   Text_raw          481 non-null    object
 10  Slack_username    481 non-null    object
 11  Team_ID           481 non-null    object
 12  Team_Name         481 non-null    object
 13  Is_Bot            481 non-null    bool  
dtypes: bool(1), object(13)
memory usage: 49.4+ KB


**Creating 2 new columns**

In [6]:
slack['Is_a_question'] = np.where(slack['Timestamp_Thread'].isnull(), 1, 0)

In [7]:
support_agents = ['1','5301']

slack['Is_agent'] = np.where(slack['User_ID'].isin(support_agents), 1, 0)

**Encoding necessary columns**

In [8]:
slack['Is_Bot'] = np.where(slack['Is_Bot'] == True, 1, 0)

**Converting timestamp columns**

In [9]:
slack['Datetime'] = pd.to_datetime(slack['Timestamp'])
slack['Datetime_Thread'] = pd.to_datetime(slack['Timestamp_Thread'])

**Creating 2 dataframes: questions and answers**

In [10]:
questions_df = slack[slack['Is_a_question'] == 1]
answers_df = slack[slack['Is_a_question'] == 0]

**Working on Answers dataframe**

In [11]:
answers = answers_df.groupby(['User_ID','Datetime'])[['Text']]

In [12]:
df2 = pd.DataFrame(answers.sum().reset_index())

In [13]:
df2.head()

Unnamed: 0,User_ID,Datetime,Text
0,U01KGAER1TM,2022-11-04 17:02:51,No se quiere usar un tercero para las fotos
1,U01KGAER1TM,2022-11-04 17:04:57,digamos que son imagenes de usuarios en donde ...
2,U01KGAER1TM,2022-11-04 17:05:02,comentarios y likes
3,U01KGAER1TM,2022-11-04 17:06:30,y la base de datos no solo contiene las imagen...
4,U01KGAER1TM,2022-11-04 17:26:59,https://isn365.com/


In [14]:
df2['Diff_in_Seconds'] = (df2.sort_values('Datetime').groupby('User_ID').Datetime.diff())

In [15]:
df2['Diff_in_Seconds'] = df2['Diff_in_Seconds'].fillna(pd.Timedelta(seconds=0))

In [16]:
df2['Diff_in_Seconds'] = df2['Diff_in_Seconds']/np.timedelta64(1,'s')

In [17]:
df2 = df2.assign(diff_abs = df2.Diff_in_Seconds.abs())

In [18]:
df2['same_author'] = df2['User_ID'].ne(df2['User_ID'].shift().bfill()).astype(int)

In [19]:
def create_AnswerId():
    for group in df2.groupby(['User_ID']):
        df2['messageId'] = df2.diff_abs.diff().gt(300).cumsum() + 1 + df2.same_author.cumsum()

create_AnswerId()

  for group in df2.groupby(['User_ID']):


In [20]:
df2.shape

(368, 7)

In [21]:
df2.head(2)

Unnamed: 0,User_ID,Datetime,Text,Diff_in_Seconds,diff_abs,same_author,messageId
0,U01KGAER1TM,2022-11-04 17:02:51,No se quiere usar un tercero para las fotos,0.0,0.0,0,1
1,U01KGAER1TM,2022-11-04 17:04:57,digamos que son imagenes de usuarios en donde ...,126.0,126.0,0,1


**Questions dataframe**

In [22]:
questions = questions_df.groupby(['User_ID','Datetime'])[['Text']]

In [23]:
df3 = pd.DataFrame(questions.sum().reset_index())

In [24]:
df3['Diff_in_Seconds'] = (df3.sort_values('Datetime').groupby('User_ID').Datetime.diff())

In [25]:
df3['Diff_in_Seconds'] = df3['Diff_in_Seconds'].fillna(pd.Timedelta(seconds=0))

In [26]:
df3['Diff_in_Seconds']=df3['Diff_in_Seconds']/np.timedelta64(1,'s')

In [27]:
df3 = df3.assign(diff_abs = df3.Diff_in_Seconds.abs())

In [28]:
df3['same_author'] = df3['User_ID'].ne(df3['User_ID'].shift().bfill()).astype(int)

In [29]:
def create_QuestionId():
    for group in df3.groupby(['User_ID']):
        df3['messageId'] = df3.diff_abs.diff().gt(300).cumsum() + 1 + df3.same_author.cumsum()

create_QuestionId()

  for group in df3.groupby(['User_ID']):


In [30]:
df3.shape

(112, 7)

In [31]:
df3.head(30)

Unnamed: 0,User_ID,Datetime,Text,Diff_in_Seconds,diff_abs,same_author,messageId
0,U01KGAER1TM,2022-11-04 16:14:32,"Como estan, tengo un problema, ocupo subir una...",0.0,0.0,0,1
1,U01SJ480RBR,2022-11-07 16:59:39,Hello,0.0,0.0,1,2
2,U01SM5J4MMG,2022-11-07 21:15:26,"Buenas tardes, intenta poniendo solo git push",0.0,0.0,1,3
3,U02N1P8CV6W,2022-10-31 22:55:36,"Hola chicos, he estado intentando llevar a pro...",0.0,0.0,1,4
4,U02NE11UHNC,2022-11-03 22:28:10,have someone imported custom fonts (no google ...,0.0,0.0,1,5
5,U02NE11UHNC,2022-11-03 22:29:25,i know this path is not correct but i have tri...,75.0,75.0,0,5
6,U02PQKM7VQF,2022-10-29 01:14:41,"hey everyone, I am trying to pass the input in...",0.0,0.0,1,6
7,U02PQKM7VQF,2022-10-30 03:31:44,Hey everyone! I am trying to style the validat...,94623.0,94623.0,0,7
8,U02PQKM7VQF,2022-10-30 22:42:02,"hey everyone, trying to deploy to heroku, push...",69018.0,69018.0,0,7
9,U02PQKM7VQF,2022-10-30 22:46:20,"hey everyone, trying to deploy to heroku, push...",258.0,258.0,0,7


In [32]:
# Timestamps satisfying given condition
'''
for i in range(len(df3)):
    for x in df3['User_ID]:
        if (df3['diff_in_seconds'][i] > 0 AND df3['diff_in_seconds'][i] < 300) :
            df3['Text'][i-1] + ' ' + df3['Text'][i]) 
'''

"\nfor i in range(len(df3)):\n    for x in df3['User_ID]:\n        if (df3['diff_in_seconds'][i] > 0 AND df3['diff_in_seconds'][i] < 300) :\n            df3['Text'][i-1] + ' ' + df3['Text'][i]) \n"

In [33]:
'''
def create_Message_ID():
    for group in df3.groupby(['User_ID']):

        threshold = pd.Timedelta(seconds=300)

        df3['Message_ID'] = [df3.loc[(df3['Datetime'] - t).abs() < threshold, ''].sum()
                            for t in df3['Datetime']]
'''

"\ndef create_Message_ID():\n    for group in df3.groupby(['User_ID']):\n\n        threshold = pd.Timedelta(seconds=300)\n\n        df3['Message_ID'] = [df3.loc[(df3['Datetime'] - t).abs() < threshold, ''].sum()\n                            for t in df3['Datetime']]\n"

## Data Cleaning

**Cleaning joined dataframe**