# Script to calculate Meetings Per Week

This code accesses my company's data, stored in their space on a Redshift cluster, so that I can analyze it and look for patterns that predict both retention (who sticks around) and virality (who is more likely to get other people to use the program).

In [None]:
## Python packages - you may have to pip install sqlalchemy, sqlalchemy_utils, and psycopg2.
import sys # To input the date you want to start with

from sqlalchemy import create_engine
from sqlalchemy_utils import database_exists, create_database
import psycopg2
import pandas as pd

import re # to access database details in a file
import time # to time how long queries and processes take
from datetime import datetime, timedelta # to manipulate dates

from connect_db import connect_db # function I wrote to connect to the database.

#import matplotlib.pyplot as plt
#% matplotlib inline

In [None]:
# Directory passed to connect_db() when opening the databases; the weekly CSV
# files produced at the end of this script are also written under it.
# NOTE(review): machine-specific absolute path — presumably where the database
# credential files live; confirm before running on another machine.
path='/Users/brianna/Documents/WL_DBdeets/'

### Open the two databases where data are stored.

The 'events' database has a main table called 'events131567', which has a bunch of sub-tables with information on the events completed by the user.

The 'transactions' database has a bunch of tables with data about the user:
pg_stat_statements,
activities,
agenda_templates,
calendar_webhooks,
calendars,
contacts,
integration_authorizations,
integration_providers,
integrations,
meeting_seeds,
migrations,
notifications,
password_reset_tokens,
meetings,
short_urls,
team_invitations,
team_memberships,
teams,
users,
events,
items,
recurring_event_rules,
meeting_invitations


In [None]:
# Access the 'events' database.
# connect_db (imported from the local connect_db module) returns a pair that is
# unpacked here; judging by the variable names it is (connection, cursor) —
# TODO confirm against connect_db itself.
conE, curE = connect_db('evnt', path)

# Access the 'transactions' database.  conT is the object that query_db()
# hands to pandas.read_sql_query as its connection.
conT, curT = connect_db('trns', path)


### Look at the number of meetings (and total events) stored on a user's calendar in a certain week.

In [None]:
def query_db(start_date, end_date):
    """Fetch the calendar events that start in the window [start_date, end_date).

    The window is half-open: an event starting exactly at ``end_date`` is NOT
    returned.  The previous ``BETWEEN`` test was inclusive at both ends, so
    when consecutive windows share a boundary (one week's end is the next
    week's start) an event starting on that boundary was returned twice.

    Args:
        start_date: Inclusive lower bound on the event start time.  It is
            interpolated into the SQL text with ``str.format``, so it must be
            a trusted value (e.g. a ``datetime``), never user input.
        end_date: Exclusive upper bound on the event start time; same
            interpolation caveat as ``start_date``.

    Returns:
        The DataFrame produced by ``pandas.read_sql_query`` with the columns
        user_id, primary_email, title, meeting_start, meeting_end, attendees.

    Note:
        Runs against the module-level connection ``conT``.
    """
    # The WHERE clause filters on a column of ``events``, so calendars without
    # a matching event are dropped even though the join is a LEFT OUTER JOIN.
    sql_select = """
    select c.user_id, u.primary_email, e.title,
           lower(e.time_range) meeting_start, upper(e.time_range) meeting_end,
           e.attendees
    from users u
    join
        calendars c
    on
        u.id = c.user_id
    left outer join
        events e
    on
        e.calendar_id = c.id
    where lower(e.time_range) >=
    """

    sql_window = "'{start_date}' and lower(e.time_range) < '{end_date}'".format(
        start_date=start_date, end_date=end_date)

    sql_query = sql_select + sql_window

    return pd.read_sql_query(sql_query, conT)

In [None]:
def make_df(query_result):
    """Return an independent working copy of the query result.

    Args:
        query_result: DataFrame holding every event in the requested date
            range.

    Returns:
        A new DataFrame with the same rows, columns and index.  It is an
        explicit copy, so later edits to it cannot reach back into
        ``query_result`` (the former ``iloc[:]`` slice gave no such guarantee).
    """
    return query_result.copy()

In [None]:
def calculateNumAttendees(df):
    """Drop events without attendee data and count attendees for the rest.

    Rows whose ``attendees`` value is null are removed first (the original
    author expected these to be fewer than 1% of the meetings).

    Args:
        df: DataFrame with an ``attendees`` column whose non-null values
            support ``len()``.

    Returns:
        A new DataFrame containing only the rows with attendee data, plus a
        ``num_attendees`` column holding ``len(attendees)`` for each row.
        The input frame is left unmodified.
    """
    # Copy the filtered rows so that adding a column below writes to an
    # independent frame instead of a slice (avoids SettingWithCopyWarning).
    with_attendees = df[df.attendees.notnull()].copy()
    with_attendees['num_attendees'] = with_attendees.attendees.apply(len)
    return with_attendees

In [None]:
def makeMeetingsDF(df):
    """Keep only the events that are meetings.

    A meeting is defined as an event with more than one attendee.

    Args:
        df: DataFrame with ``user_id`` and ``num_attendees`` columns.

    Returns:
        A new DataFrame with the rows where ``num_attendees > 1``, keeping the
        original index labels.  It is an independent copy, so callers may add
        or delete columns on it without touching ``df`` and without
        triggering pandas' SettingWithCopyWarning.

    Side effects:
        Prints two numbers: the count of distinct ``user_id`` values among the
        meetings, then the total number of meeting rows.
    """
    df_allmtngs = df[df.num_attendees > 1].copy()

    # Sanity output: distinct users first, then total meeting rows.
    print(len(df_allmtngs['user_id'].unique()))
    print(len(df_allmtngs['user_id']))
    return df_allmtngs

In [None]:
def find_organizer(df):
    """Tell whether the calendar owner is the organizer of one event.

    Despite the parameter name, ``df`` is a single event record (for example
    a row passed in by ``DataFrame.apply(..., axis=1)``), not a whole frame.

    Args:
        df: Mapping-like record with an ``attendees`` entry (a list of dicts
            that may carry ``organizer`` and ``email`` keys) and a
            ``primary_email`` entry for the user who owns the calendar.

    Returns:
        ``True`` if some attendee is flagged ``organizer == True`` and has
        the same ``email`` as ``primary_email``; ``False`` otherwise,
        including when the record is missing fields or ``attendees`` is not a
        list.  The result is always a ``bool``; earlier versions returned
        ``None`` when no organizer matched.
    """
    try:
        attendees = df['attendees']
        user_email = df['primary_email']
    except (KeyError, IndexError, TypeError):
        # Record does not have the fields we need.
        return False

    if not isinstance(attendees, (list, tuple)):
        # Missing attendee data (None/NaN) or an unexpected payload.
        return False

    for attendee in attendees:
        if not isinstance(attendee, dict):
            continue
        # Entries without an 'organizer' key, or with any value that does not
        # compare equal to True, are not organizers.
        if attendee.get('organizer') != True:  # noqa: E712
            continue
        # Keep scanning after a non-matching organizer: another entry may
        # also be flagged as organizer.
        if 'email' in attendee and attendee['email'] == user_email:
            return True
    return False

In [None]:
def createOrganizerColumn(df_allmtngs):
    """Add an ``organizer`` column saying whether the user organized each event.

    ``find_organizer`` is evaluated once per row and its results are stored
    in a new ``organizer`` column.  The frame passed in is modified in place
    and is also returned.
    """
    df_allmtngs['organizer'] = df_allmtngs.apply(find_organizer, axis=1)
    return df_allmtngs

In [None]:
def deleteExtraColumns(df_allmtngs):
    """Remove the identifying columns so the frame can be saved.

    Deletes ``primary_email``, ``title`` and ``attendees`` (in that order)
    from the frame in place, then returns the same frame.  A missing column
    raises ``KeyError``.
    """
    for column_name in ('primary_email', 'title', 'attendees'):
        del df_allmtngs[column_name]
    return df_allmtngs

# For each week, make a dataframe of de-identified information we can save.

In [None]:
# Build the list of week start dates, beginning on 2015-05-24 and stepping
# seven days at a time.
# NOTE(review): the loop stops at "now + 7 days", so the last entry can be a
# week that starts in the future — confirm that this is intended.
weekList = []
date_start = datetime.strptime('2015-05-24', "%Y-%m-%d")
current_date = datetime.now()

one_week = timedelta(days=7)
last_allowed_start = current_date + one_week
while date_start < last_allowed_start:
    weekList.append(date_start)
    date_start = date_start + one_week

# Parallel list with each week start rendered as a 'YYYY-MM-DD' string (the
# date part of str(datetime)).  This is a new list, so weekList keeps its
# datetime objects.
weekStringList = [str(week_start).split(' ')[0] for week_start in weekList]

In [None]:
# Let's look at how many events are on the calendar of each user.  We can't just pull a count of all
# of the events since some of these events aren't actually meetings (ie. blocked off time for other
# things.)  To figure out which events are actually meetings, let's pull all of the events, then go
# through and make a subset of events where there is more than one attendee.

# The list is named weekStringList (lower-case 'w'); iterating over the
# misspelled 'WeekStringList' raised a NameError before any week was processed.
for w in weekStringList:
    # Each window covers the seven days starting at the week's start date.
    start_date = datetime.strptime(w, "%Y-%m-%d")
    end_date = start_date + timedelta(days=7)

    query_result = query_db(start_date, end_date)
    df_allevts = make_df(query_result)
    df_allevts = calculateNumAttendees(df_allevts)

    df_allmtngs = makeMeetingsDF(df_allevts)
    df_allmtngs = createOrganizerColumn(df_allmtngs)
    df_allmtngs = deleteExtraColumns(df_allmtngs)

    # The final dataframe has one row per meeting on a calendar, with the
    # identifying columns (primary_email, title, attendees) removed; what is
    # left is the user_id, the meeting start/end times, the number of
    # attendees and the organizer status.

    # One CSV per week.  str(datetime) includes the time of day, so the file
    # names look like '2015-05-24 00:00:00to2015-05-31 00:00:00Evnts_byEvnt.csv'.
    df_allmtngs.to_csv(path+str(start_date)+'to'+str(end_date)+'Evnts_byEvnt.csv')