# Cleaning Public Support data

## Importing necessary libraries

In [1]:
pip install pandas matplotlib seaborn wordcloud


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.2.2[0m[39;49m -> [0m[32;49m22.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime,date, timedelta

## Loading the data

In [3]:
# Read the support-channels export; the relative path is resolved against the
# notebook's working directory.
slack = pd.read_csv(filepath_or_buffer='../sources/support-channels.csv')


## Discover

In [4]:
# Baseline (rows, columns) of the raw frame, printed before any cleaning step.
print(f'Shape of slack dataframe before cleaning: {slack.shape}')

Shape of slack dataframe before cleaning: (481, 14)


In [5]:
# Column dtypes and non-null counts, to see which columns contain missing values.
slack.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 481 entries, 0 to 480
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Channel_ID        481 non-null    object
 1   Channel_Slug      481 non-null    object
 2   Timestamp         481 non-null    object
 3   Timestamp_Thread  368 non-null    object
 4   User_ID           481 non-null    object
 5   Full_Name         470 non-null    object
 6   Email             481 non-null    object
 7   Permalink         481 non-null    object
 8   Text              481 non-null    object
 9   Text_raw          481 non-null    object
 10  Slack_username    481 non-null    object
 11  Team_ID           481 non-null    object
 12  Team_Name         481 non-null    object
 13  Is_Bot            481 non-null    bool  
dtypes: bool(1), object(13)
memory usage: 49.4+ KB


**Creating 2 new columns**

In [6]:
# 1 for rows that have no Timestamp_Thread value, 0 for every other row; the
# notebook treats the former as questions and the latter as answers.
slack['Is_a_question'] = slack['Timestamp_Thread'].isna().astype(int)

In [7]:
# User_ID values that identify support agents.
# NOTE(review): the User_ID values displayed further down look like
# 'U01KGAER1TM'; confirm that '1' and '5301' really occur in this column,
# otherwise Is_agent is 0 on every row.
support_agents = ['1', '5301']

# 1 when the row's author is one of the listed agents, else 0.
slack['Is_agent'] = slack['User_ID'].isin(support_agents).astype(int)

**Converting timestamp columns**

In [8]:
# Parse both timestamp columns into datetime columns; the original text columns
# are kept alongside the parsed ones.
slack['Datetime'] = slack['Timestamp'].pipe(pd.to_datetime)
slack['Datetime_Thread'] = slack['Timestamp_Thread'].pipe(pd.to_datetime)

**Creating 2 dataframes: questions and answers**

In [9]:
# Split the messages on the Is_a_question flag: flagged rows are the questions,
# unflagged rows are the answers.
questions_df = slack.loc[slack['Is_a_question'].eq(1)]
answers_df = slack.loc[slack['Is_a_question'].eq(0)]

In [10]:
# Group the answer rows by channel, author and timestamp, keeping only Text
# (double brackets so the selection stays a DataFrame-style groupby).
answers = answers_df.groupby(
    by=['Channel_ID', 'User_ID', 'Datetime']
)[['Text']]

In [11]:
# One row per (Channel_ID, User_ID, Datetime) key. sum() adds the Text values
# inside each group, which for strings means concatenation with no separator.
# reset_index() turns the group keys back into ordinary columns.
df3 = answers.sum().reset_index()

In [12]:
# Preview the first rows of the grouped answers frame.
df3.head()

Unnamed: 0,Channel_ID,User_ID,Datetime,Text
0,CAZ9W99U4,U01KGAER1TM,2022-11-04 17:02:51,No se quiere usar un tercero para las fotos
1,CAZ9W99U4,U01KGAER1TM,2022-11-04 17:04:57,digamos que son imagenes de usuarios en donde ...
2,CAZ9W99U4,U01KGAER1TM,2022-11-04 17:05:02,comentarios y likes
3,CAZ9W99U4,U01KGAER1TM,2022-11-04 17:06:30,y la base de datos no solo contiene las imagen...
4,CAZ9W99U4,U01KGAER1TM,2022-11-04 17:26:59,https://isn365.com/


In [13]:
# Time elapsed since the same user's previous row. Rows are ordered by Datetime
# for the computation only; the result is aligned back to df3 by index, so the
# first row of each user gets NaT.
df3['difference'] = (
    df3.sort_values(by='Datetime')
       .groupby('User_ID')['Datetime']
       .diff()
)

In [14]:
# Rows with no computed difference (NaT) get a zero-length gap instead.
df3['difference'] = df3['difference'].fillna(value=pd.to_timedelta(0, unit='s'))

In [15]:
# Express each gap as a plain number of seconds by dividing by a one-second
# timedelta.
df3['difference'] = df3['difference'] / pd.Timedelta(seconds=1)

In [16]:
# The column now holds seconds, so give it a name that states the unit.
df3 = df3.rename(columns={'difference': 'diff_in_seconds'})
   

In [17]:
# Preview the first 15 rows to check the new diff_in_seconds column.
df3.head(15)

Unnamed: 0,Channel_ID,User_ID,Datetime,Text,diff_in_seconds
0,CAZ9W99U4,U01KGAER1TM,2022-11-04 17:02:51,No se quiere usar un tercero para las fotos,0.0
1,CAZ9W99U4,U01KGAER1TM,2022-11-04 17:04:57,digamos que son imagenes de usuarios en donde ...,126.0
2,CAZ9W99U4,U01KGAER1TM,2022-11-04 17:05:02,comentarios y likes,5.0
3,CAZ9W99U4,U01KGAER1TM,2022-11-04 17:06:30,y la base de datos no solo contiene las imagen...,88.0
4,CAZ9W99U4,U01KGAER1TM,2022-11-04 17:26:59,https://isn365.com/,1229.0
5,CAZ9W99U4,U01KGAER1TM,2022-11-04 17:27:20,"este es el sitio, se quiere hacer como un wall...",21.0
6,CAZ9W99U4,U01KGAER1TM,2022-11-04 17:34:48,Es que lo que se quiere es tener las imagenes ...,448.0
7,CAZ9W99U4,U01KGAER1TM,2022-11-04 17:38:07,Voy a explorar la posibilidad de guardar las i...,199.0
8,CAZ9W99U4,U01KGAER1TM,2022-11-04 17:43:00,"Por otro lado que es mas barato, el host o la BD",293.0
9,CAZ9W99U4,U01KGAER1TM,2022-11-04 17:43:39,creo que por ese lado es una buena opcion porq...,39.0


In [18]:
# Add the magnitude of each gap as its own column, leaving diff_in_seconds as is.
df3 = df3.assign(diff_abs=lambda frame: frame['diff_in_seconds'].abs())

In [27]:
def create_messageid(frame=None, gap_seconds=300):
    """Number consecutive rows into messages and store the ids in ``messageId``.

    A new message starts on every row whose ``diff_abs`` value exceeds
    ``gap_seconds`` and on every row whose ``User_ID`` differs from the row
    directly above it. Ids follow the frame's current row order and start at 1.
    A row that meets both conditions advances the id by 2, so ids always
    increase but are not guaranteed to be consecutive.

    Parameters
    ----------
    frame : pandas.DataFrame, optional
        Frame holding ``User_ID`` and ``diff_abs`` columns. It is modified in
        place. When omitted, the notebook-level ``df3`` is used, which keeps
        the original zero-argument call working.
    gap_seconds : float, default 300
        Gap (in the unit of ``diff_abs``) above which a row opens a new message.

    Returns
    -------
    None
        The result is written to ``frame['messageId']``.
    """
    target = df3 if frame is None else frame

    # Compare each gap itself with the threshold. Differencing diff_abs a second
    # time compares the *change* between two gaps, so two long gaps in a row
    # (e.g. 400 s then 500 s) would not open a second message.
    opens_after_long_gap = target['diff_abs'].gt(gap_seconds)

    # Derive the author-change flag here instead of reading a `same_author`
    # column, so the function does not depend on a cell located further down.
    author_changed = target['User_ID'].ne(target['User_ID'].shift().bfill())

    # A single whole-frame assignment is enough; looping over groups only
    # repeated the same work.
    target['messageId'] = (
        opens_after_long_gap.cumsum() + 1 + author_changed.astype(int).cumsum()
    )

# Called for its side effect: it writes the messageId column onto df3.
create_messageid()



  for group in df3.groupby(['User_ID']):


In [None]:
def create_Message_ID():
    """Draft helper that writes a ``Message_ID`` column onto the global ``df3``.

    For each ``Datetime`` value in ``df3`` the comprehension selects the rows
    whose ``Datetime`` lies strictly within 300 seconds of it and sums the
    column labelled ``''`` over that selection; the list of sums becomes
    ``df3['Message_ID']``.

    NOTE(review): this function is not called in the cells shown, and no
    visible cell creates a column named ``''``, so the ``.loc`` lookup
    presumably raises ``KeyError`` -- confirm which column was intended.
    """
    # NOTE(review): `group` is never used; every iteration repeats the same
    # whole-frame assignment.
    for group in df3.groupby(['User_ID']):

        # Half-width of the time window around each row's Datetime.
        threshold = pd.Timedelta(seconds=300)

        df3['Message_ID'] = [df3.loc[(df3['Datetime'] - t).abs() < threshold, ''].sum()
                            for t in df3['Datetime']]

In [23]:
# Despite its name, this flag is 1 on rows whose User_ID differs from the row
# directly above and 0 otherwise. The first row is compared with itself (the
# shifted gap is back-filled), so it is always 0.
df3['same_author'] = (df3['User_ID'] != df3['User_ID'].shift().bfill()).astype(int)

In [28]:
# Preview the first 30 rows to check the messageId and same_author columns.
df3.head(30)

Unnamed: 0,Channel_ID,User_ID,Datetime,Text,diff_in_seconds,diff_abs,messageId,same_author
0,CAZ9W99U4,U01KGAER1TM,2022-11-04 17:02:51,No se quiere usar un tercero para las fotos,0.0,0.0,1,0
1,CAZ9W99U4,U01KGAER1TM,2022-11-04 17:04:57,digamos que son imagenes de usuarios en donde ...,126.0,126.0,1,0
2,CAZ9W99U4,U01KGAER1TM,2022-11-04 17:05:02,comentarios y likes,5.0,5.0,1,0
3,CAZ9W99U4,U01KGAER1TM,2022-11-04 17:06:30,y la base de datos no solo contiene las imagen...,88.0,88.0,1,0
4,CAZ9W99U4,U01KGAER1TM,2022-11-04 17:26:59,https://isn365.com/,1229.0,1229.0,2,0
5,CAZ9W99U4,U01KGAER1TM,2022-11-04 17:27:20,"este es el sitio, se quiere hacer como un wall...",21.0,21.0,2,0
6,CAZ9W99U4,U01KGAER1TM,2022-11-04 17:34:48,Es que lo que se quiere es tener las imagenes ...,448.0,448.0,3,0
7,CAZ9W99U4,U01KGAER1TM,2022-11-04 17:38:07,Voy a explorar la posibilidad de guardar las i...,199.0,199.0,3,0
8,CAZ9W99U4,U01KGAER1TM,2022-11-04 17:43:00,"Por otro lado que es mas barato, el host o la BD",293.0,293.0,3,0
9,CAZ9W99U4,U01KGAER1TM,2022-11-04 17:43:39,creo que por ese lado es una buena opcion porq...,39.0,39.0,3,0


In [21]:
# Abandoned draft kept inside a string literal: none of it executes, the cell
# only echoes the text back as its output.
# The sketch walks df3 by position and, where diff_in_seconds is strictly
# between 0 and 300, builds "previous Text + ' ' + current Text" without
# storing the result anywhere.
# NOTE(review): the draft is not valid Python as written (unterminated quote in
# df3['User_ID], uppercase AND, unbalanced parenthesis) -- fix it or delete it.
'''
for i in range(len(df3)):
    for x in df3['User_ID]:
        if (df3['diff_in_seconds'][i] > 0 AND df3['diff_in_seconds'][i] < 300) :
            df3['Text'][i-1] + ' ' + df3['Text'][i]) 
'''

"\nfor i in range(len(df3)):\n    for x in df3['User_ID]:\n        if (df3['diff_in_seconds'][i] > 0 AND df3['diff_in_seconds'][i] < 300) :\n            df3['Text'][i-1] + ' ' + df3['Text'][i]) \n"

**Number of interactions per student**

**Number of questions per student**

## Data Cleaning

**Encoding boolean column**

**Cleaning joined dataframe**