# Find the details of user registration

In [None]:
## Python packages - you may have to pip install sqlalchemy, sqlalchemy_utils, and psycopg2.
from sqlalchemy import create_engine
from sqlalchemy_utils import database_exists, create_database
import psycopg2
import pandas as pd
import numpy as np

import re # to access database details in a file, to search for strings
import time # to time how long queries and processes take
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import date, datetime, timedelta # to manipulate dates

from connect_db import connect_db
from makeWeekList import makeWeekList

% matplotlib inline

In [None]:
# Base directory for this notebook: it is passed to connect_db() below and is
# the prefix for the CSV and image files written further down.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
path='/Users/brianna/Documents/WL_DBdeets/'

In [None]:
# Each connect_db() call is unpacked into two names; presumably a
# (connection, cursor) pair — confirm against connect_db.py.

# Access the 'events' database.
conE, curE = connect_db('evnt', path)

# Access the 'transactions' database.
conT, curT = connect_db('trns', path)

### Query the db for signup date and other information about participation

In [None]:
# Make a list of everyone who's signed up (AKA had a sign-up event)

def querySignedUp():
    """Return one row per user from the ``app131567.signed_up`` table.

    The query runs over the module-level ``conE`` connection and the elapsed
    wall-clock time (in seconds) is printed once it returns.

    Returns:
        DataFrame with the columns ``user_id``, ``reg_date`` (event_time
        truncated to the day) and ``u_email``, holding the first row that the
        query returned for each ``user_id``.
    """
    began = time.time()

    sql_query="""
        select 
            user_id, date_trunc('day',event_time)::DATE as reg_date, u_email
        from 
            app131567.signed_up
    """

    raw = pd.read_sql_query(sql_query, conE)
    print(time.time() - began)

    # Identical rows are collapsed first; after that, when a user_id still
    # appears more than once, only its first remaining row is kept.
    without_exact_copies = raw.drop_duplicates(keep='first')
    return without_exact_copies.drop_duplicates(subset='user_id', keep='first')


In [None]:
# Convert the registration date to a datetime object so that it can be binned by week.
def convert_to_datetime(x):
    """Return a ``datetime`` at midnight (00:00:00) on the calendar day of ``x``."""
    midnight = datetime.min.time()
    return datetime.combine(x, midnight)

In [None]:
# Make a column of the week they signed up (to get a metric that's the same as the transactional 
# data that I have collapsed into weekly bins)

def makeRegistrationWeekColumn(df, start_date, end_date):
    """Add a ``reg_week`` column holding the start of each row's signup week.

    Weeks are consecutive 7-day bins beginning at ``start_date``; a bin is
    opened for every week start that is earlier than ``end_date``.  A row
    belongs to a bin when ``week_start <= reg_date < week_start + 7 days``.
    Rows that fall outside every bin keep ``reg_week`` set to None.

    Args:
        df: DataFrame with a ``reg_date`` column whose values can be compared
            with ``start_date``.  It is modified in place.
        start_date: Start of the first weekly bin (inclusive).
        end_date: No bin starts on or after this value.

    Returns:
        The same ``df`` object, with the ``reg_week`` column added.
    """
    one_week = timedelta(days=7)

    # Start from an all-None (object) column so unmatched rows stay empty.
    df['reg_week'] = None

    week_start = start_date
    while week_start < end_date:
        week_end = week_start + one_week
        # The lower bound must be inclusive: a reg_date that carries no
        # time-of-day (midnight) on the first day of a week compares equal to
        # week_start, and a strict '>' would leave that whole day unassigned.
        in_week = (df.reg_date >= week_start) & (df.reg_date < week_end)
        df.loc[in_week, 'reg_week'] = week_start
        week_start = week_end

    return df

In [None]:
# If the address is *.edu, return "school".  Otherwise, return the domain name.
def grabDomain(x):
    """Classify an e-mail address by the host part after the '@'.

    Args:
        x: An e-mail address, or a missing value (None, NaN, any non-string).

    Returns:
        'school' when the host ends in the ``edu`` suffix (case-insensitive,
        any number of sub-domains); otherwise the first label of the host in
        lower case (``'gmail'`` for ``bob@Gmail.com``).  None when ``x`` is
        not a string or has no ``@host.suffix`` part.
    """
    if not isinstance(x, str):
        return None

    # Split on the last '@' and work on whole dot-separated labels; this keeps
    # hyphenated hosts intact and treats '.' as a literal separator.
    _, at_sign, host = x.rpartition('@')
    labels = [label for label in host.strip().lower().split('.') if label]
    if not at_sign or len(labels) < 2:
        return None

    if labels[-1] == 'edu':
        return 'school'
    return labels[0]

def grabDomainSuffix(x):
    """Return the lower-cased final label of an e-mail address's host.

    Args:
        x: An e-mail address, or a missing value (None, NaN, any non-string).

    Returns:
        The text after the last '.' of the host (``'com'`` for
        ``bob@gmail.com``, ``'edu'`` for ``x@cs.stanford.edu``), or None when
        ``x`` is not a string or has no ``@host.suffix`` part.
    """
    if not isinstance(x, str):
        return None

    # Work on dot-separated labels of the part after the last '@' so that
    # sub-domains and hyphenated hosts do not shift which label is returned.
    _, at_sign, host = x.rpartition('@')
    labels = [label for label in host.strip().lower().split('.') if label]
    if not at_sign or len(labels) < 2:
        return None
    return labels[-1]

In [None]:
def deIdentifyEmailColumn(df):
    """Swap the raw ``u_email`` values in ``df`` for non-identifying summaries.

    Columns added, in this order:
      - ``sharedEmail``: 1 where ``u_email`` is present, else 0.
      - ``emailDomain``: 'personal', 'school', 'company', or None.
      - ``NumberSignedUpFromCompany``: how many rows share the user's domain
        (always 1 for 'personal' and 'school').
      - ``NumberSignedUpFromCompanyNoPersonalOrSchool``: the same count, but
        empty for 'personal' and 'school' rows.
      - ``emailDomainSuffix``: result of ``grabDomainSuffix`` on the address.

    ``u_email`` itself is overwritten with None.  ``df`` is modified in place
    and also returned.
    """
    # Flag which users supplied an address at all.
    df['sharedEmail'] = 0
    has_email = df.u_email.notnull()
    df.loc[has_email, 'sharedEmail'] = 1

    # Classify each address.  grabDomain already yields 'school' for .edu
    # hosts; the well-known webmail providers are folded into 'personal', and
    # whatever remains is a company domain (relabelled further down).
    df['emailDomain'] = df.u_email.apply(grabDomain)
    for provider in ('gmail', 'yahoo', 'hotmail', 'mail'):
        df.loc[df.emailDomain == provider, 'emailDomain'] = 'personal'

    # Head-count per domain.  Personal and school addresses say nothing about
    # a shared employer, so their count is pinned to 1.
    counts = df.emailDomain.value_counts()
    counts['personal'] = 1
    counts['school'] = 1

    def findNumberFromCompany(x):
        if x is None:
            return None
        return counts[x]

    # Per-user count of sign-ups from the same domain, plus a variant that is
    # left empty for personal/school rows.
    df['NumberSignedUpFromCompany'] = df.emailDomain.apply(findNumberFromCompany)
    df['NumberSignedUpFromCompanyNoPersonalOrSchool'] = df.NumberSignedUpFromCompany[:]
    for label in ('personal', 'school'):
        df.loc[df.emailDomain == label, 'NumberSignedUpFromCompanyNoPersonalOrSchool'] = None

    # Every remaining (non-null, non-personal, non-school) domain is a company
    # name; replace it with a generic label so it cannot identify anyone.
    is_company = df.emailDomain.notnull() & ~df.emailDomain.isin(['personal', 'school'])
    df.loc[is_company, 'emailDomain'] = 'company'
    df['emailDomainSuffix'] = df.u_email.apply(grabDomainSuffix)

    # Blank the address itself so nothing identifying reaches the saved file.
    df['u_email'] = None
    return df

In [None]:
# Add information on a few more types of events to the dataframe, 
# ie. how often the following things happen:
# - added_meeting
# - added_agenda_item
# - added_note
# - assigned_action_item

def addOtherEvents(eventList):
    """Count, per user and per week, how often each event in ``eventList`` occurred.

    For every event name and every week in the module-level ``weekList``, the
    table ``app131567.<event>`` is queried over the module-level ``conE``
    connection for the user_ids whose event_time lies between the week start
    and the week start plus seven days.  The per-week counts are outer-joined
    into one wide table.

    Args:
        eventList: Names of event tables in the ``app131567`` schema.  The
            names are interpolated into the SQL text, so they must come from
            trusted code, never from user input.

    Returns:
        DataFrame indexed by ``user_id`` with one column per (event, week)
        pair, named ``event + weekStringList[i]`` with '-' replaced by '_'.
        Each value is the number of rows returned for that user in that week;
        it is missing where the user had none.  An empty DataFrame is returned
        when ``eventList`` or ``weekList`` is empty.
    """
    allEvts = None

    for event in eventList:
        print(event)
        for week, weekString in zip(weekList, weekStringList):
            end_date = week + timedelta(days=7)
            # NOTE(review): BETWEEN includes both end points, so an event
            # stamped exactly on a week boundary is returned for both adjacent
            # weeks; the lower() around event_time also looks unintended.
            # Left unchanged here — confirm against the database.
            string1="select user_id from app131567.{event}".format(event=event)
            string2=" where lower(event_time) between  "
            string3 = " '{start_date}' and '{end_date}'".format(start_date = week, end_date = end_date)
            sql_query = string1 + string2 + string3

            query_result = pd.read_sql_query(sql_query,conE)

            # A column of ones named after the event and week; summing it per
            # user_id gives that user's event count for the week.
            columnName = (event + weekString).replace('-','_')
            sumDF = query_result.assign(**{columnName: 1}).groupby('user_id').sum()

            if allEvts is None:
                allEvts = sumDF
            else:
                # Outer join keeps users that appear in any event/week.
                allEvts = allEvts.join(sumDF, how = 'outer')

    if allEvts is None:
        # Nothing was queried (no events or no weeks).
        return pd.DataFrame()
    return allEvts

# Main Function

In [None]:
# Query the events database for everyone who has signed up (one row per user_id)
signups = querySignedUp()

# Convert reg_date from a date to a datetime object (midnight of that day)
signups.reg_date = signups.reg_date.apply(lambda x: convert_to_datetime(x))

# Make a list of weeks from the start of data collection until the end date
# (weekList is also read as a global by addOtherEvents)
start_date = '2015-05-24'; end_date = '2016-01-26'
weekList, weekStringList = makeWeekList(start_date, end_date)

# Make a column in the dataframe that contains the Registration Week (cohort) of each user.
# The first and last entries of weekList are passed as the binning limits.
signups = makeRegistrationWeekColumn(signups, weekList[0], weekList[-1])

# Replace the raw email address with de-identified columns
# (whether an email was shared, its type, same-domain counts, and the suffix).
signups = deIdentifyEmailColumn(signups)

# Make a separate dataframe of other events of interest (grabbed through queries)
# Full candidate list, kept for reference:
# eventList = ['added_meeting','added_agenda_item','added_note','assigned_action_item',
#              'viewed_onboarding__tour_4_page', 'approved_google_permissions',
#              'chrome_extension_installed', 'selected_gcal_connect','selected_office_365_connect']
eventList = ['added_meeting','added_agenda_item','assigned_action_item']
allEvts = addOtherEvents(eventList)

# Combine all of the entries in allEvts with the signups dataframe, matching the
# signups 'user_id' column against the index of allEvts.
# NOTE(review): how='right' keeps the user_ids found in allEvts, so signed-up users
# with none of the queried events are dropped here — confirm that is intended.
signups=signups.join(allEvts, on='user_id',how='right')

In [None]:
# Save the combined registration/event table under `path`, then print how many
# distinct reg_week values it holds (unique() counts a missing value as one entry).
signups.to_csv(path+'03_UserRegistrationDateMoreDetails.csv')
print(len(signups['reg_week'].unique()))


In [None]:
# NOTE: this binds a second name to the same DataFrame object; it is not a copy,
# so any later change made through `signups` is visible here as well.
signupsWithDuplicates = signups

In [None]:
# Plot the number of rows per registration week and save the figure.
# 'dummy' is a column of ones, so a groupby-sum gives the row count per reg_week.
signups['dummy'] = 1
plt.figure(figsize=(20,10))
sums = signups.groupby('reg_week')['dummy'].sum()
# NOTE(review): if `signupsWithDuplicates = signups` is what ran before this cell,
# both names refer to one DataFrame, so sumsDup equals sums and the green line
# is drawn on top of the blue one — confirm intent.
sumsDup = signupsWithDuplicates.groupby('reg_week')['dummy'].sum()

sns.set()
sns.set_style("whitegrid")

# Despite the name, `ax` is a pandas Series here; .plot() draws it as a line.
ax = pd.Series(sums, index=sums.index)
ax.plot(color='blue')

ax = pd.Series(sumsDup, index=sumsDup.index)
ax.plot(color='green')

# Earlier attempts at labelling the x axis (left disabled):
#ax.plot.xlabel("x axis")
#ax.xaxis('Week')

plt.savefig(path+'images/SignedUpByWeek.jpg')


In [None]:
# Histogram of reg_week with one bin per distinct reg_week value.
plt.figure(figsize=(20,10))
signups['reg_week'].hist(bins = len(signups['reg_week'].unique()))
#signups['reg_week'].hist()

# NOTE(review): this text is set as the x-axis label, although the x axis holds
# the registration week and the bar heights hold the counts — confirm wording.
plt.xlabel('Number of New Registrations Per Week \n (\'reg_date\' from app131567.signed_up)', size=20)

# Find whether each person has connected their calendar.

In [None]:
# Scratch check: show the type of the first weekList entry returned by makeWeekList.
type(weekList[0])