In [59]:
%%capture
pip install numpy pandas

In [60]:
import numpy as np
import pandas as pd

#### **Add user_id to slack dataframe**

In [61]:
# Load the Slack support-channel messages export.
df_slack = pd.read_csv('../sources/support_channels.csv')

In [62]:
# Inspect the columns available in the Slack messages dataframe.
df_slack.columns

Index(['Channel_ID', 'Channel_Slug', 'Timestamp', 'Timestamp_Thread',
       'User_ID', 'Full_Name', 'Email', 'Permalink', 'Text', 'Text_raw',
       'Slack_username', 'Team_ID', 'Team_Name', 'Is_Bot'],
      dtype='object')

In [63]:
# Load the auth_user export (its `id` and `email` columns are used for the join below).
auth_user = pd.read_csv('../sources/auth_user.csv')

In [64]:
# Inspect the columns available in auth_user.
auth_user.columns

Index(['id', 'first_name', 'last_name', 'email', 'date_joined'], dtype='object')

In [65]:
# Discard the name and join-date columns; they are not needed to match users by e-mail
auth_user = auth_user.drop(
    labels=['first_name', 'last_name', 'date_joined'],
    axis='columns',
)

In [66]:
# Attach the auth_user id to each Slack message by matching on e-mail.
# pandas matches null join keys against each other, so auth_user rows without
# an e-mail are excluded first; otherwise every Slack message lacking an Email
# would be paired with (and duplicated for) each of those accounts.
# NOTE(review): the match is exact (case- and whitespace-sensitive) - confirm
# that both sources store e-mails in the same normalized form.
slack_with_user_id = df_slack.merge(
    auth_user.dropna(subset=['email']),
    left_on='Email',
    right_on='email',
    how='left',
)

In [67]:
# Check that the merge appended the auth_user columns to the Slack columns.
slack_with_user_id.columns

Index(['Channel_ID', 'Channel_Slug', 'Timestamp', 'Timestamp_Thread',
       'User_ID', 'Full_Name', 'Email', 'Permalink', 'Text', 'Text_raw',
       'Slack_username', 'Team_ID', 'Team_Name', 'Is_Bot', 'id', 'email'],
      dtype='object')

#### **Add slugs to cohort_users**

In [68]:
# Load the cohort_users export (it links a user_id to a cohort_id).
cohort_users = pd.read_csv('../sources/cohort_users.csv')

In [69]:
# Inspect the columns available in cohort_users.
cohort_users.columns

Index(['id', 'role', 'finantial_status', 'educational_status', 'created_at',
       'updated_at', 'cohort_id', 'user_id', 'watching'],
      dtype='object')

In [70]:
# Keep only the rows whose role is 'student' (compared case-insensitively)
cohort_users = cohort_users.loc[cohort_users['role'].str.lower().eq('student')]

In [71]:
# Discard every column that is not needed to link a user to a cohort
cohort_users = cohort_users.drop(
    labels=[
        'id', 'role', 'finantial_status', 'educational_status',
        'created_at', 'updated_at', 'watching',
    ],
    axis='columns',
)

In [72]:
# Load the cohorts export (the source of the cohort slug, stage and never_ends flag).
cohorts = pd.read_csv('../sources/cohorts.csv')

In [73]:
# Inspect the columns available in cohorts.
cohorts.columns

Index(['id', 'slug', 'name', 'kickoff_date', 'ending_date', 'current_day',
       'stage', 'language', 'created_at', 'updated_at', 'academy_id',
       'timezone', 'private', 'never_ends', 'schedule_id',
       'syllabus_version_id', 'online_meeting_url', 'remote_available',
       'current_module', 'history_log', 'is_hidden_on_prework'],
      dtype='object')

In [74]:
# Discard the cohort attributes that are neither join keys nor filter criteria
cohorts = cohorts.drop(
    labels=[
        'name', 'kickoff_date', 'ending_date', 'current_day', 'language',
        'created_at', 'updated_at', 'academy_id', 'timezone', 'private',
        'schedule_id', 'syllabus_version_id', 'online_meeting_url',
        'remote_available', 'current_module', 'history_log',
        'is_hidden_on_prework',
    ],
    axis='columns',
)

In [75]:
# Keep only the cohorts whose stage is ACTIVE or STARTED.
# TODO: confirm whether STARTED cohorts should be included here.
cohorts = cohorts.loc[cohorts['stage'].isin(['ACTIVE', 'STARTED'])]

In [76]:
# Discard the cohorts flagged as never ending
cohorts = cohorts.loc[cohorts['never_ends'].eq(False)]

In [77]:
# Left join: every student row is kept; cohort columns stay empty when the cohort was filtered out
cohort_users_with_slug = pd.merge(
    cohort_users,
    cohorts,
    how='left',
    left_on='cohort_id',
    right_on='id',
)

In [78]:
# Preview the joined frame; rows without a matching cohort show NaN in the cohort columns.
cohort_users_with_slug

Unnamed: 0,cohort_id,user_id,id,slug,stage,never_ends
0,9,14,,,,
1,31,14,,,,
2,4,14,,,,
3,9,15,,,,
4,9,16,,,,
...,...,...,...,...,...,...
6887,397,5545,397.0,miami-45,STARTED,False
6888,61,6004,,,,
6889,523,3617,,,,
6890,61,6005,,,,


In [79]:
# Concatenate all slugs associated with the same user_id
def _join_slugs(slugs):
    """Join the non-null slugs of one user into a single ', '-separated string.

    Missing values are skipped and repeated slugs are kept only once (in order
    of first appearance), so duplicated enrollment rows do not repeat a slug.
    Returns an empty string when the user has no slug at all.
    """
    return ', '.join(slugs.dropna().astype(str).unique())


# The source frame is left untouched (no in-place NaN -> None rewrite of 'slug').
grouped_df = (
    cohort_users_with_slug
    .groupby('user_id')['slug']
    .agg(_join_slugs)
    .reset_index()
)
# Users without any slug end up with a missing value instead of an empty string
grouped_df['slug'] = grouped_df['slug'].replace({'': np.nan})

In [80]:
# Bring the per-user concatenated slugs back; the suffixes tell the two 'slug' columns apart
cohort_users_with_slug = pd.merge(
    cohort_users_with_slug,
    grouped_df,
    how='inner',
    on='user_id',
    suffixes=('_simple', '_concatenated'),
)

In [81]:
# Discard the per-cohort columns, including the original single-slug column
cohort_users_with_slug = cohort_users_with_slug.drop(
    labels=['cohort_id', 'id', 'stage', 'never_ends', 'slug_simple'],
    axis='columns',
)

In [82]:
# Give the concatenated column the plain 'slug' name again
cohort_users_with_slug = cohort_users_with_slug.rename(
    {'slug_concatenated': 'slug'},
    axis='columns',
)

In [83]:
# Collapse identical rows so that each user_id is left with one row holding all of its slugs
cohort_users_with_slug = cohort_users_with_slug.drop_duplicates(subset=None, keep='first')

#### **Merge both dataframes so each message has the cohort slugs associated with the student**

In [84]:
# Left join: every Slack message is kept; 'slug' stays empty when the author has no cohort row
slack_with_slug = pd.merge(
    slack_with_user_id,
    cohort_users_with_slug,
    how='left',
    left_on='id',
    right_on='user_id',
)

In [85]:
# Discard the helper columns that were only needed as join keys
slack_with_slug = slack_with_slug.drop(
    labels=['id', 'email', 'user_id'],
    axis='columns',
)

In [86]:
# Check the final column set: the Slack columns plus 'slug'.
slack_with_slug.columns

Index(['Channel_ID', 'Channel_Slug', 'Timestamp', 'Timestamp_Thread',
       'User_ID', 'Full_Name', 'Email', 'Permalink', 'Text', 'Text_raw',
       'Slack_username', 'Team_ID', 'Team_Name', 'Is_Bot', 'slug'],
      dtype='object')

In [87]:
# Show only the messages for which a cohort slug was found
slack_with_slug.loc[slack_with_slug['slug'].notna()]

Unnamed: 0,Channel_ID,Channel_Slug,Timestamp,Timestamp_Thread,User_ID,Full_Name,Email,Permalink,Text,Text_raw,Slack_username,Team_ID,Team_Name,Is_Bot,slug
14,CAZ9W99U4,public-support-full-stack,11/7/2022 21:15:26,,U01SM5J4MMG,Alvaro Javier Chagas Capurro,alvarojavierchagas@hotmail.com,https://4geeksacademy.slack.com/archives/CAZ9W...,"Buenas tardes, intenta poniendo solo git push","Buenas tardes, intenta poniendo solo git push",alvarojavierchagas,T0BFXMWMV,4Geeks Academy,False,"pre-work-latam, latam-pt-12"
15,CAZ9W99U4,public-support-full-stack,11/8/2022 14:09:39,11/7/2022 21:15:26,U01SM5J4MMG,Alvaro Javier Chagas Capurro,alvarojavierchagas@hotmail.com,https://4geeksacademy.slack.com/archives/CAZ9W...,BUen dia pudiste solucionar??,BUen dia pudiste solucionar??,alvarojavierchagas,T0BFXMWMV,4Geeks Academy,False,"pre-work-latam, latam-pt-12"
16,CAZ9W99U4,public-support-full-stack,11/8/2022 14:12:03,11/7/2022 21:15:26,U01SM5J4MMG,Alvaro Javier Chagas Capurro,alvarojavierchagas@hotmail.com,https://4geeksacademy.slack.com/archives/CAZ9W...,Podrias pasarme la captura de nuevo que no me ...,Podrias pasarme la captura de nuevo que no me ...,alvarojavierchagas,T0BFXMWMV,4Geeks Academy,False,"pre-work-latam, latam-pt-12"
20,CAZ9W99U4,public-support-full-stack,11/15/2022 19:07:30,,U02G5B470B1,Facundo Gul dos Santos,facundogds@gmail.com,https://4geeksacademy.slack.com/archives/CAZ9W...,"Hello, where do I found the webpacks options a...","Hello, where do I found the webpacks options a...",facundogds,T0BFXMWMV,4Geeks Academy,False,pre-work-latam
21,CAZ9W99U4,public-support-full-stack,11/15/2022 19:28:18,11/15/2022 19:07:30,U02G5B470B1,Facundo Gul dos Santos,facundogds@gmail.com,https://4geeksacademy.slack.com/archives/CAZ9W...,"Thanks, mate, it's was so simple but I've forg...","Thanks, mate, it's was so simple but I've forg...",facundogds,T0BFXMWMV,4Geeks Academy,False,pre-work-latam
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
408,CAZ9W99U4,public-support-full-stack,12/29/2022 9:05:46,12/28/2022 13:01:35,UMQH1S494,Marco Gonzalo Gómez Pérez,mgomez@4geeks.co,https://4geeksacademy.slack.com/archives/CAZ9W...,@aalejo Could you help us here?,<@UL08NNSV8> Could you help us here?,marcogonzalo,T0BFXMWMV,4Geeks Academy,False,fake-ml-cohort
409,CAZ9W99U4,public-support-full-stack,12/29/2022 17:53:58,12/28/2022 13:01:35,UMQH1S494,Marco Gonzalo Gómez Pérez,mgomez@4geeks.co,https://4geeksacademy.slack.com/archives/CAZ9W...,"@juanrepeto this is a message, it is not the s...","<@U03DP0YAHHV> this is a message, it is not th...",marcogonzalo,T0BFXMWMV,4Geeks Academy,False,fake-ml-cohort
410,CAZ9W99U4,public-support-full-stack,12/29/2022 19:24:35,12/28/2022 13:01:35,UMQH1S494,Marco Gonzalo Gómez Pérez,mgomez@4geeks.co,https://4geeksacademy.slack.com/archives/CAZ9W...,It seems you are not pointing your entrypoint ...,It seems you are not pointing your entrypoint ...,marcogonzalo,T0BFXMWMV,4Geeks Academy,False,fake-ml-cohort
411,CAZ9W99U4,public-support-full-stack,12/29/2022 19:43:25,12/29/2022 17:52:20,UMQH1S494,Marco Gonzalo Gómez Pérez,mgomez@4geeks.co,https://4geeksacademy.slack.com/archives/CAZ9W...,Lo que necesitas está en el contenido de la pl...,Lo que necesitas está en el contenido de la pl...,marcogonzalo,T0BFXMWMV,4Geeks Academy,False,fake-ml-cohort
