In [1]:
%%capture
pip install numpy pandas

In [2]:
import numpy as np
import pandas as pd

#### **Add user_id to slack dataframe**

In [3]:
# Read the support-channels CSV into the Slack dataframe
slack_csv = '../sources/support_channels.csv'
df_slack = pd.read_csv(slack_csv)

In [4]:
# List the columns available in df_slack ('Email' is the key used for the merge below)
df_slack.columns

Index(['Channel_ID', 'Channel_Slug', 'Timestamp', 'Timestamp_Thread',
       'User_ID', 'Full_Name', 'Email', 'Permalink', 'Text', 'Text_raw',
       'Slack_username', 'Team_ID', 'Team_Name', 'Is_Bot'],
      dtype='object')

In [5]:
# Read the auth_user CSV (user ids and emails, among other columns)
auth_user_csv = '../sources/auth_user.csv'
auth_user = pd.read_csv(auth_user_csv)

In [6]:
# List the columns available in auth_user ('id' and 'email' are the ones used below)
auth_user.columns

Index(['id', 'first_name', 'last_name', 'email', 'date_joined'], dtype='object')

In [7]:
# Keep only the columns needed to map an email address to a user id.
# Selecting the wanted columns (instead of dropping the unwanted ones) makes
# this cell safe to re-run and independent of extra columns in the CSV.
auth_user = auth_user[['id', 'email']]

In [8]:
# Attach the auth_user id to every Slack message by matching on email.
# pandas matches null join keys with each other, so auth_user rows without an
# email are excluded first; otherwise a Slack message with a missing Email
# would be paired with every such account.
# NOTE(review): if auth_user contains repeated emails, this left join still
# duplicates the matching messages - confirm that emails are unique.
slack_with_user_id = df_slack.merge(
    auth_user.dropna(subset=['email']),
    left_on='Email',
    right_on='email',
    how='left',
)

In [9]:
# Check the merge result: the df_slack columns plus 'id' and 'email' from auth_user
slack_with_user_id.columns

Index(['Channel_ID', 'Channel_Slug', 'Timestamp', 'Timestamp_Thread',
       'User_ID', 'Full_Name', 'Email', 'Permalink', 'Text', 'Text_raw',
       'Slack_username', 'Team_ID', 'Team_Name', 'Is_Bot', 'id', 'email'],
      dtype='object')

#### **Add slugs to cohort_users**

In [10]:
# Read the cohort_users CSV (links each user_id to a cohort_id with a role)
cohort_users_csv = '../sources/cohort_users.csv'
cohort_users = pd.read_csv(cohort_users_csv)

In [11]:
# List the columns available in cohort_users ('role', 'cohort_id' and 'user_id' are used below)
cohort_users.columns

Index(['id', 'role', 'finantial_status', 'educational_status', 'created_at',
       'updated_at', 'cohort_id', 'user_id', 'watching'],
      dtype='object')

In [12]:
# Keep only the rows whose role is "student" (compared case-insensitively)
is_student = cohort_users['role'].str.lower().eq('student')
cohort_users = cohort_users.loc[is_student]

In [13]:
# Discard every column that is not needed to link a user to a cohort;
# only 'cohort_id' and 'user_id' remain afterwards.
unused_cohort_user_columns = [
    'id',
    'role',
    'finantial_status',
    'educational_status',
    'created_at',
    'updated_at',
    'watching',
]
cohort_users = cohort_users.drop(unused_cohort_user_columns, axis='columns')

In [14]:
# Read the cohorts CSV (one 'id' and 'slug' per cohort, among other columns)
cohorts_csv = '../sources/cohorts.csv'
cohorts = pd.read_csv(cohorts_csv)

In [15]:
# List the columns available in cohorts ('id', 'slug', 'stage' and 'never_ends' are used below)
cohorts.columns

Index(['id', 'slug', 'name', 'kickoff_date', 'ending_date', 'current_day',
       'stage', 'language', 'created_at', 'updated_at', 'academy_id',
       'timezone', 'private', 'never_ends', 'schedule_id',
       'syllabus_version_id', 'online_meeting_url', 'remote_available',
       'current_module', 'history_log', 'is_hidden_on_prework'],
      dtype='object')

In [16]:
# Keep only the columns used below: the join key ('id'), the label to attach
# ('slug') and the two columns the next cells filter on.
# Selecting these four columns (instead of listing the seventeen to drop)
# keeps the cell re-runnable and independent of extra columns in the CSV.
cohorts = cohorts[['id', 'slug', 'stage', 'never_ends']]

In [17]:
# Keep only cohorts whose stage is ACTIVE or STARTED.
# TODO(review): the original note questioned whether STARTED belongs in this
# list - confirm with the data owner.
wanted_stages = ['ACTIVE', 'STARTED']
cohorts = cohorts.loc[cohorts['stage'].isin(wanted_stages)]

In [18]:
# Exclude cohorts flagged as never ending
has_end = cohorts['never_ends'].eq(False)
cohorts = cohorts.loc[has_end]

In [19]:
# Left join: every cohort_users row is kept; 'id', 'slug', 'stage' and
# 'never_ends' are filled only when the cohort is still present in `cohorts`.
cohort_users_with_slug = pd.merge(
    cohort_users,
    cohorts,
    how='left',
    left_on='cohort_id',
    right_on='id',
)

In [20]:
# Preview the joined frame; rows with NaN in 'id'/'slug'/'stage'/'never_ends'
# had no matching cohort in the left join above.
cohort_users_with_slug

Unnamed: 0,cohort_id,user_id,id,slug,stage,never_ends
0,9,14,,,,
1,31,14,,,,
2,4,14,,,,
3,9,15,,,,
4,9,16,,,,
...,...,...,...,...,...,...
6887,397,5545,397.0,miami-45,STARTED,False
6888,61,6004,,,,
6889,523,3617,,,,
6890,61,6005,,,,


In [21]:
# Build one row per user_id whose 'slug' holds all of that user's cohort slugs
# joined with ', '. Missing slugs are skipped, a slug that appears on several
# rows of the same user is listed only once (in first-seen order), and users
# without any slug end up with NaN. The input frame is left unmodified.
def _join_slugs(slugs):
    """Return the distinct non-null slugs of one user as 'a, b, c'."""
    return ', '.join(slugs.dropna().astype(str).unique())

grouped_df = (
    cohort_users_with_slug
    .groupby('user_id')['slug']
    .agg(_join_slugs)
    .reset_index()
)
# An empty string means the user had no slug at all; store it as NaN
grouped_df['slug'] = grouped_df['slug'].replace({'': np.nan})

In [22]:
# Bring the per-user concatenated slugs back onto every row. Both frames have
# a 'slug' column, so the suffixes tell them apart: 'slug_simple' is the
# per-row slug and 'slug_concatenated' is the per-user list.
cohort_users_with_slug = pd.merge(
    cohort_users_with_slug,
    grouped_df,
    on='user_id',
    suffixes=('_simple', '_concatenated'),
)

In [23]:
# Discard the per-cohort columns so only the user id and the concatenated
# slugs remain
per_cohort_columns = ['cohort_id', 'id', 'stage', 'never_ends', 'slug_simple']
cohort_users_with_slug = cohort_users_with_slug.drop(per_cohort_columns, axis='columns')

In [24]:
# Give the concatenated column its final name, 'slug'
cohort_users_with_slug = cohort_users_with_slug.rename(
    {'slug_concatenated': 'slug'},
    axis='columns',
)

In [25]:
# Every row of a given user_id now carries the same concatenated slug, so
# keeping only the first occurrence of each distinct row leaves a single row
# per user_id.
is_repeated_row = cohort_users_with_slug.duplicated()
cohort_users_with_slug = cohort_users_with_slug.loc[~is_repeated_row]

#### **Merge both dataframes so each message has the cohort slugs associated with the student**

In [26]:
# Left join on the user id: every Slack message is kept and gets the 'slug'
# of its author when the author is present in cohort_users_with_slug.
slack_with_slug = pd.merge(
    slack_with_user_id,
    cohort_users_with_slug,
    how='left',
    left_on='id',
    right_on='user_id',
)

In [27]:
# Discard the helper columns that were only needed for the two joins
join_helper_columns = ['id', 'email', 'user_id']
slack_with_slug = slack_with_slug.drop(join_helper_columns, axis='columns')

In [28]:
# Final check: the df_slack columns plus the new 'slug' column
slack_with_slug.columns

Index(['Channel_ID', 'Channel_Slug', 'Timestamp', 'Timestamp_Thread',
       'User_ID', 'Full_Name', 'Email', 'Permalink', 'Text', 'Text_raw',
       'Slack_username', 'Team_ID', 'Team_Name', 'Is_Bot', 'slug'],
      dtype='object')