# Cleaning Public Support data

## Importing necessary libraries

In [1]:
pip install pandas matplotlib seaborn wordcloud

Collecting pandas
  Using cached pandas-1.5.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.2 MB)
Collecting matplotlib
  Using cached matplotlib-3.6.2-cp38-cp38-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (9.4 MB)
Collecting seaborn
  Using cached seaborn-0.12.1-py3-none-any.whl (288 kB)
Collecting wordcloud
  Using cached wordcloud-1.8.2.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (458 kB)
Collecting pytz>=2020.1
  Using cached pytz-2022.6-py2.py3-none-any.whl (498 kB)
Collecting numpy>=1.20.3
  Using cached numpy-1.23.4-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.1 MB)
Collecting fonttools>=4.22.0
  Using cached fonttools-4.38.0-py3-none-any.whl (965 kB)
Collecting kiwisolver>=1.0.1
  Using cached kiwisolver-1.4.4-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.whl (1.2 MB)
Collecting contourpy>=1.0.1
  Using cached contourpy-1.0.6-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (295 kB)
Collecting pillow>=6.2.0
  Using cac

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime,date, timedelta

## Loading the data

In [3]:
# Read the exported support-channel messages; the path is relative to the notebook's working directory.
slack = pd.read_csv(filepath_or_buffer='../sources/support-channels.csv')


## Discover

In [4]:
# Record the (rows, columns) of the raw frame so it can be compared after cleaning.
print(f'Shape of slack dataframe before cleaning: {slack.shape}')

Shape of slack dataframe before cleaning: (481, 14)


In [5]:
# Summarise column names, non-null counts and dtypes to spot columns with missing values.
slack.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 481 entries, 0 to 480
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Channel_ID        481 non-null    object
 1   Channel_Slug      481 non-null    object
 2   Timestamp         481 non-null    object
 3   Timestamp_Thread  368 non-null    object
 4   User_ID           481 non-null    object
 5   Full_Name         470 non-null    object
 6   Email             481 non-null    object
 7   Permalink         481 non-null    object
 8   Text              481 non-null    object
 9   Text_raw          481 non-null    object
 10  Slack_username    481 non-null    object
 11  Team_ID           481 non-null    object
 12  Team_Name         481 non-null    object
 13  Is_Bot            481 non-null    bool  
dtypes: bool(1), object(13)
memory usage: 49.4+ KB


In [6]:
# Preview the first three raw rows to see what each column holds.
# NOTE(review): the rendered output contains real names and e-mail addresses; clear or redact it before sharing.
slack.head(3)

Unnamed: 0,Channel_ID,Channel_Slug,Timestamp,Timestamp_Thread,User_ID,Full_Name,Email,Permalink,Text,Text_raw,Slack_username,Team_ID,Team_Name,Is_Bot
0,CAZ9W99U4,public-support-full-stack,11/11/2022 15:23:37,11/11/2022 14:44:11,U0426RW6CR5,Elías Fernández,atukdibe@gmail.com,https://4geeksacademy.slack.com/archives/CAZ9W...,fijate si te lo toma solo con background:,fijate si te lo toma solo con background:,atukdibe,T0BFXMWMV,4Geeks Academy,False
1,CAZ9W99U4,public-support-full-stack,11/9/2022 7:52:50,,U03SWDR1KTM,Jack Caldwell-Nichols,jack.wcn@gmail.com,https://4geeksacademy.slack.com/archives/CAZ9W...,Hola a todos! Socorro! Quiero eliminar los fav...,Hola a todos! Socorro! Quiero eliminar los fav...,jack.wcn,T0BFXMWMV,4Geeks Academy,False
2,CAZ9W99U4,public-support-full-stack,11/11/2022 16:12:26,,U03A7FCVDMW,Richard Jardine,rhjardine@gmail.com,https://4geeksacademy.slack.com/archives/CAZ9W...,"Hello. Please, I require guidance regarding th...","Hello. Please, I require guidance regarding th...",rhjardine,T0BFXMWMV,4Geeks Academy,False


**Creating 2 new columns**

In [4]:
# 1 when the message has no thread timestamp, 0 when it has one.
# NOTE(review): presumably a missing thread timestamp marks a top-level post (a question) — confirm.
slack['Is_a_question'] = slack['Timestamp_Thread'].isna().astype(int)

In [5]:
# User ids that are treated as support agents.
# NOTE(review): the User_ID values shown in the preview look like 'U0426RW6CR5', so these ids
# may never match and Is_agent may be 0 for every row — confirm the intended ids.
support_agents = ['1', '5301']

# 1 when the message author is in the agent list, 0 otherwise.
slack['Is_agent'] = slack['User_ID'].isin(support_agents).astype(int)

**Converting timestamp columns**

In [6]:
# The export writes timestamps as month/day/year with a 24-hour clock (e.g. '11/9/2022 7:52:50').
# Passing the format explicitly keeps ambiguous dates such as 11/4/2022 from being read day-first
# and makes any value in a different layout fail loudly instead of being guessed.
# Missing thread timestamps become NaT.
slack['Datetime'] = pd.to_datetime(slack['Timestamp'], format='%m/%d/%Y %H:%M:%S')
slack['Datetime_Thread'] = pd.to_datetime(slack['Timestamp_Thread'], format='%m/%d/%Y %H:%M:%S')

**Measuring the time between consecutive messages (questions and answers)**

In [7]:
# Put each user's messages together, in chronological order within the user.
# A two-key sort gives the same row order as grouping by user and sorting every group,
# without a Python-level call per group and without relying on groupby.apply passing the
# grouping column through to the applied function.
slack = slack.sort_values(['User_ID', 'Datetime'])

In [8]:
# Move User_ID out of the columns and use it as the row index.
# Not idempotent: once User_ID is no longer a column, running this cell again raises a KeyError.
slack = slack.set_index('User_ID')

In [9]:
# Inspect the first 20 rows to check that messages are grouped per user and ordered by Datetime.
# NOTE(review): the rendered output contains real e-mail addresses; clear or redact it before sharing.
slack.head(20)

Unnamed: 0_level_0,Channel_ID,Channel_Slug,Timestamp,Timestamp_Thread,Full_Name,Email,Permalink,Text,Text_raw,Slack_username,Team_ID,Team_Name,Is_Bot,Is_a_question,Is_agent,Datetime,Datetime_Thread
User_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
U01KGAER1TM,CAZ9W99U4,public-support-full-stack,11/4/2022 16:14:32,,,rmoramo1@gmail.com,https://4geeksacademy.slack.com/archives/CAZ9W...,"Como estan, tengo un problema, ocupo subir una...","Como estan, tengo un problema, ocupo subir una...",rmoramo1,T0BFXMWMV,4Geeks Academy,False,1,0,2022-11-04 16:14:32,NaT
U01KGAER1TM,CAZ9W99U4,public-support-full-stack,11/4/2022 17:02:51,11/4/2022 16:14:32,,rmoramo1@gmail.com,https://4geeksacademy.slack.com/archives/CAZ9W...,No se quiere usar un tercero para las fotos,No se quiere usar un tercero para las fotos,rmoramo1,T0BFXMWMV,4Geeks Academy,False,0,0,2022-11-04 17:02:51,2022-11-04 16:14:32
U01KGAER1TM,CAZ9W99U4,public-support-full-stack,11/4/2022 17:04:57,11/4/2022 16:14:32,,rmoramo1@gmail.com,https://4geeksacademy.slack.com/archives/CAZ9W...,digamos que son imagenes de usuarios en donde ...,digamos que son imagenes de usuarios en donde ...,rmoramo1,T0BFXMWMV,4Geeks Academy,False,0,0,2022-11-04 17:04:57,2022-11-04 16:14:32
U01KGAER1TM,CAZ9W99U4,public-support-full-stack,11/4/2022 17:05:02,11/4/2022 16:14:32,,rmoramo1@gmail.com,https://4geeksacademy.slack.com/archives/CAZ9W...,comentarios y likes,comentarios y likes,rmoramo1,T0BFXMWMV,4Geeks Academy,False,0,0,2022-11-04 17:05:02,2022-11-04 16:14:32
U01KGAER1TM,CAZ9W99U4,public-support-full-stack,11/4/2022 17:06:30,11/4/2022 16:14:32,,rmoramo1@gmail.com,https://4geeksacademy.slack.com/archives/CAZ9W...,y la base de datos no solo contiene las imagen...,y la base de datos no solo contiene las imagen...,rmoramo1,T0BFXMWMV,4Geeks Academy,False,0,0,2022-11-04 17:06:30,2022-11-04 16:14:32
U01KGAER1TM,CAZ9W99U4,public-support-full-stack,11/4/2022 17:26:59,11/4/2022 16:14:32,,rmoramo1@gmail.com,https://4geeksacademy.slack.com/archives/CAZ9W...,https://isn365.com/,<https://isn365.com/>,rmoramo1,T0BFXMWMV,4Geeks Academy,False,0,0,2022-11-04 17:26:59,2022-11-04 16:14:32
U01KGAER1TM,CAZ9W99U4,public-support-full-stack,11/4/2022 17:27:20,11/4/2022 16:14:32,,rmoramo1@gmail.com,https://4geeksacademy.slack.com/archives/CAZ9W...,"este es el sitio, se quiere hacer como un wall...","este es el sitio, se quiere hacer como un wall...",rmoramo1,T0BFXMWMV,4Geeks Academy,False,0,0,2022-11-04 17:27:20,2022-11-04 16:14:32
U01KGAER1TM,CAZ9W99U4,public-support-full-stack,11/4/2022 17:34:48,11/4/2022 16:14:32,,rmoramo1@gmail.com,https://4geeksacademy.slack.com/archives/CAZ9W...,Es que lo que se quiere es tener las imagenes ...,Es que lo que se quiere es tener las imagenes ...,rmoramo1,T0BFXMWMV,4Geeks Academy,False,0,0,2022-11-04 17:34:48,2022-11-04 16:14:32
U01KGAER1TM,CAZ9W99U4,public-support-full-stack,11/4/2022 17:38:07,11/4/2022 16:14:32,,rmoramo1@gmail.com,https://4geeksacademy.slack.com/archives/CAZ9W...,Voy a explorar la posibilidad de guardar las i...,Voy a explorar la posibilidad de guardar las i...,rmoramo1,T0BFXMWMV,4Geeks Academy,False,0,0,2022-11-04 17:38:07,2022-11-04 16:14:32
U01KGAER1TM,CAZ9W99U4,public-support-full-stack,11/4/2022 17:43:00,11/4/2022 16:14:32,,rmoramo1@gmail.com,https://4geeksacademy.slack.com/archives/CAZ9W...,"Por otro lado que es mas barato, el host o la BD","Por otro lado que es mas barato, el host o la BD",rmoramo1,T0BFXMWMV,4Geeks Academy,False,0,0,2022-11-04 17:43:00,2022-11-04 16:14:32


In [13]:
# Whole minutes elapsed since the same user's previous message.
# The difference is taken inside each User_ID group so that a user's first message is never
# compared with the last message of the preceding user; those first rows are left as NaN.
# total_seconds() // 60 yields float whole minutes and avoids casting to 'timedelta64[m]',
# which newer pandas versions reject.
slack['Minutes Since Previous'] = (
    slack.groupby('User_ID')['Datetime'].diff().dt.total_seconds() // 60
)

In [15]:
# Rows with no previous message to compare against have no gap; record them as 0 minutes.
slack = slack.fillna({'Minutes Since Previous': 0})

**Number of interactions per student**

**Number of questions per student**

## Data Cleaning

**Encoding boolean column**

**Self Join**

**Cleaning joined dataframe**

In [None]:
-- SQL Server (T-SQL) reference query, not Python: this cell cannot run in a Python kernel.
-- Returns one row per User_ID / Full_Name / Timestamp with every matching `value`
-- joined into a single comma-separated string.
-- NOTE(review): `table` and `value` look like placeholders — replace with the real names.
SELECT User_ID
    ,Full_Name
    ,Timestamp
    -- FOR XML PATH('') concatenates ',' + value for the matching rows;
    -- STUFF(..., 1, 1, '') then removes the leading comma.
    ,STUFF((SELECT ',' + value FROM table t2 WHERE t2.User_ID = t1.User_ID and t2.Full_Name = t1.Full_Name AND t2.Timestamp = t1.Timestamp FOR XML PATH('')),1,1,'') AS value
FROM table t1
-- Group by the same columns that are selected and correlated on.
GROUP BY User_ID,Full_Name,Timestamp