In [1]:
## Python packages - you may have to pip install sqlalchemy, sqlalchemy_utils, and psycopg2.
from sqlalchemy import create_engine
from sqlalchemy_utils import database_exists, create_database
import psycopg2
import pandas as pd

import re # to access database details in a file
import time # to time how long queries and processes take
import matplotlib.pyplot as plt
from datetime import date, datetime, timedelta # to manipulate dates

from connect_db import connect_db

% matplotlib inline

In [2]:
# Base directory used throughout the notebook: it is passed to connect_db and is the
# prefix for the CSV files that later cells read and write.
# NOTE(review): this is an absolute, machine-specific path -- change it when running elsewhere.
path='/Users/brianna/Documents/WL_DBdeets/'

In [3]:
# Open the two database connections used by this notebook.  connect_db is called with
# a short database label and the directory in `path`; each result is unpacked into two
# names (presumably a connection and a cursor -- confirm in connect_db).

# Access the 'events' database.
conE, curE = connect_db('evnt', path)

# Access the 'transactions' database.
conT, curT = connect_db('trns', path)

I connected to the evnt database!!
I connected to the trns database!!


## Query the events database to find out which users visited the site on which weeks

In [4]:
# List tables in the Transactional (Production) database 'dd8dik067j3tun'

# Retention analysis: this is a cohort retention analysis (look for SQL examples online).
# Define a cohort as the people who performed a certain action or behavior.
# 90% of the people who signed up viewed an internal page in their first week.
# Look at weekly retention: how many people viewed an internal page in each of the weeks after signing up.
# 30% is good for a growing company; Facebook has 50%; WL wants at least 20%.


In [5]:
# Build the week-start datetimes: one entry every 7 days, beginning 2015-05-24.
# Entries keep being added while the week start is earlier than (now + 7 days).
current_date = datetime.now()
date_start = datetime.strptime('2015-05-24', "%Y-%m-%d")
one_week = timedelta(days=7)
cutoff = current_date + one_week

weekList = []
while date_start < cutoff:
    weekList.append(date_start)
    date_start = date_start + one_week

# Parallel list holding the same weeks as 'YYYY-MM-DD' strings.  A comprehension
# creates a brand-new list, so weekList itself keeps its datetime objects.
weekStringList = [week.strftime('%Y-%m-%d') for week in weekList]

print(weekList)
print(weekStringList)

[datetime.datetime(2015, 5, 24, 0, 0), datetime.datetime(2015, 5, 31, 0, 0), datetime.datetime(2015, 6, 7, 0, 0), datetime.datetime(2015, 6, 14, 0, 0), datetime.datetime(2015, 6, 21, 0, 0), datetime.datetime(2015, 6, 28, 0, 0), datetime.datetime(2015, 7, 5, 0, 0), datetime.datetime(2015, 7, 12, 0, 0), datetime.datetime(2015, 7, 19, 0, 0), datetime.datetime(2015, 7, 26, 0, 0), datetime.datetime(2015, 8, 2, 0, 0), datetime.datetime(2015, 8, 9, 0, 0), datetime.datetime(2015, 8, 16, 0, 0), datetime.datetime(2015, 8, 23, 0, 0), datetime.datetime(2015, 8, 30, 0, 0), datetime.datetime(2015, 9, 6, 0, 0), datetime.datetime(2015, 9, 13, 0, 0), datetime.datetime(2015, 9, 20, 0, 0), datetime.datetime(2015, 9, 27, 0, 0), datetime.datetime(2015, 10, 4, 0, 0), datetime.datetime(2015, 10, 11, 0, 0), datetime.datetime(2015, 10, 18, 0, 0), datetime.datetime(2015, 10, 25, 0, 0), datetime.datetime(2015, 11, 1, 0, 0), datetime.datetime(2015, 11, 8, 0, 0), datetime.datetime(2015, 11, 15, 0, 0), datetime.dat

In [6]:
# Make a dict with lists of unique users that visited the site in each particular week.
# Keys are the week-start dates as 'YYYY-MM-DD' strings; values are lists of the distinct
# user_id values found in app131567.viewed_meeting_details_page for that week.

UniqueUserByWeek = {}

# Each week is the half-open interval [date_start, date_end).  The previous inclusive
# BETWEEN matched an event stamped exactly on a week boundary in two adjacent weeks.
# The dates are supplied as bound parameters instead of being pasted into the SQL text.
# NOTE(review): %(name)s is the psycopg2 placeholder style; this assumes conE is a
# psycopg2 connection (or an engine on top of it) -- confirm in connect_db.
sql_query = """
    select
        distinct user_id
    from
        app131567.viewed_meeting_details_page
    where
        event_time >= %(date_start)s
        and event_time < %(date_end)s
"""

start_time = time.time()

# Go through every week in weekList.  (An earlier note here said the last week was
# skipped, but the loop has always covered all of them.)
for week_start in weekList:
    date_start = week_start.strftime('%Y-%m-%d')
    date_end = (week_start + timedelta(days=7)).strftime('%Y-%m-%d')

    query_result = pd.read_sql_query(
        sql_query,
        conE,
        params={'date_start': date_start, 'date_end': date_end},
    )

    # Add an entry to the dict where the key is the week and the value is the
    # list of users that accessed the site that week
    UniqueUserByWeek[date_start] = query_result['user_id'].tolist()

total_time = time.time() - start_time
print('Total time for queries: '+str(total_time))

Total time for queries: 4.70790576935


In [7]:
# Make a dataframe with one row per user: user_id, reg_week, registrationWeek (the
# registration week as a 'YYYY-MM-DD' string), and one VisitedWeekN column for each of
# the 12 weeks counted from the registration week (N = 0..11).
# The VisitedWeekN columns are created as 0 here so they exist before being filled in.

retention = pd.read_csv(path+'UserRegistrationDate.csv')

# Delete columns we don't need
retention = retention.drop(['reg_date', 'Unnamed: 0'], axis=1)

# Ay ay ay, get rid of that weird '2016-01-26 10:19:51.044020' entry.  Not sure how it got there,
# but it's stamped from the morning this was written.
# .copy() makes the filtered result an independent frame, so adding or overwriting
# columns on it afterwards does not raise SettingWithCopyWarning.
retention = retention[retention.reg_week != '2016-01-26 10:19:51.044020'].copy()

# Make a variable called registrationWeek which has the week the user registered in the
# form of a string (everything before the first space of reg_week).
retention['registrationWeek'] = retention['reg_week'].astype(str).str.split(' ').str[0]

# Make variables that will have 1 or zero depending on if the user visited the site that week.
# 1 = visited, 0 = didn't visit
for week_number in range(12):
    retention['VisitedWeek'+str(week_number)] = 0

print(retention.head(5))

                    user_id                    reg_week registrationWeek  \
0  556ce8dc1407c9df346e54dc  2015-05-31 00:00:00.000000       2015-05-31   
1  556cf3dfb78b730beae27960  2015-05-31 00:00:00.000000       2015-05-31   
2  556cf8fa913df923867886bd  2015-05-31 00:00:00.000000       2015-05-31   
3  556d11ac6e81ef883374705b  2015-05-31 00:00:00.000000       2015-05-31   
4  556d397dc23812601a3bbcdf  2015-05-31 00:00:00.000000       2015-05-31   

   VisitedWeek0  VisitedWeek1  VisitedWeek2  VisitedWeek3  VisitedWeek4  \
0             0             0             0             0             0   
1             0             0             0             0             0   
2             0             0             0             0             0   
3             0             0             0             0             0   
4             0             0             0             0             0   

   VisitedWeek5  VisitedWeek6  VisitedWeek7  VisitedWeek8  VisitedWeek9  \
0             0  

In [8]:
# Fill in VisitedWeek0 .. VisitedWeek11: go 12 weeks out from each user's registration
# week (don't worry about anything after that).

# Query from  2015-05-31, fill it in as Week 0 for people who registered that week.
# Query from  2015-06-07, fill it in as Week 0 for people who registered that week
#                                   and Week 1 for people that registered the week before.
# Etc.

n_weeks_tracked = 12

# Lookups built once, outside the per-row function:
#   week_position    -- week string -> its position in weekStringList
#   visitors_by_week -- week string -> set of user_ids (set membership is O(1),
#                       unlike scanning the lists stored in UniqueUserByWeek)
week_position = dict((wk, i) for i, wk in enumerate(weekStringList))
visitors_by_week = dict((wk, set(users)) for wk, users in UniqueUserByWeek.items())


def check_visited(row, weeks_out=0):
    """Return whether the user in `row` visited `weeks_out` weeks after registering.

    Parameters
    ----------
    row : one row of the retention frame; reads 'registrationWeek' and 'user_id'.
    weeks_out : number of weeks after the registration week to look at (0 = the
        registration week itself).

    Returns
    -------
    1 if the user_id is among that week's visitors, 0 if it is not, and None when
    there is no week to look at: either the target week lies past the end of
    weekStringList, or the registration week is not in weekStringList at all
    (the latter used to raise ValueError).
    """
    weekIndex = week_position.get(row['registrationWeek'])
    if weekIndex is None or weekIndex + weeks_out >= len(weekStringList):
        return None
    w = weekStringList[weekIndex + weeks_out]
    return 1 if row['user_id'] in visitors_by_week[w] else 0


# The week offset is handed to the function through apply's `args`, so the function
# is defined once instead of being redefined on every pass of the loop.
for x in range(n_weeks_tracked):
    print(x)
    retention['VisitedWeek'+str(x)] = retention.apply(check_visited, axis=1, args=(x,))

retention.head(10)

0
1
2
3
4
5
6
7
8
9
10
11


Unnamed: 0,user_id,reg_week,registrationWeek,VisitedWeek0,VisitedWeek1,VisitedWeek2,VisitedWeek3,VisitedWeek4,VisitedWeek5,VisitedWeek6,VisitedWeek7,VisitedWeek8,VisitedWeek9,VisitedWeek10,VisitedWeek11
0,556ce8dc1407c9df346e54dc,2015-05-31 00:00:00.000000,2015-05-31,1,0,0,0,0,1,1,0,1,0,1,1
1,556cf3dfb78b730beae27960,2015-05-31 00:00:00.000000,2015-05-31,1,0,0,0,0,0,0,0,0,0,1,0
2,556cf8fa913df923867886bd,2015-05-31 00:00:00.000000,2015-05-31,1,0,1,0,0,0,0,0,0,0,0,0
3,556d11ac6e81ef883374705b,2015-05-31 00:00:00.000000,2015-05-31,1,0,0,0,0,0,0,0,0,0,0,0
4,556d397dc23812601a3bbcdf,2015-05-31 00:00:00.000000,2015-05-31,1,0,0,0,0,0,0,0,0,0,0,0
5,556d461aa5c38aa5376721ad,2015-05-31 00:00:00.000000,2015-05-31,1,0,0,0,0,0,0,0,0,0,0,0
6,556d4980760bdfb3fec1f557,2015-05-31 00:00:00.000000,2015-05-31,1,1,1,1,1,0,0,1,0,1,1,0
7,556d4b7d3366d9c5c4387d4b,2015-05-31 00:00:00.000000,2015-05-31,1,0,0,0,0,0,0,0,0,0,0,0
8,556d4f3bdec804de5243c71e,2015-05-31 00:00:00.000000,2015-05-31,1,1,1,0,1,1,0,1,0,0,0,0
9,556d589b80e12813f3871cf0,2015-05-31 00:00:00.000000,2015-05-31,1,1,0,0,0,1,0,0,0,0,0,0


In [9]:
# Show the first ten rows as plain text, then save the full table as Retention.csv
# in the directory given by `path`.
print(retention.head(10))

# NOTE(review): to_csv is called with its defaults, which in pandas also writes the row
# index as an unnamed first column -- readers of this file get an 'Unnamed: 0' column.
retention.to_csv(path+'Retention.csv')

                    user_id                    reg_week registrationWeek  \
0  556ce8dc1407c9df346e54dc  2015-05-31 00:00:00.000000       2015-05-31   
1  556cf3dfb78b730beae27960  2015-05-31 00:00:00.000000       2015-05-31   
2  556cf8fa913df923867886bd  2015-05-31 00:00:00.000000       2015-05-31   
3  556d11ac6e81ef883374705b  2015-05-31 00:00:00.000000       2015-05-31   
4  556d397dc23812601a3bbcdf  2015-05-31 00:00:00.000000       2015-05-31   
5  556d461aa5c38aa5376721ad  2015-05-31 00:00:00.000000       2015-05-31   
6  556d4980760bdfb3fec1f557  2015-05-31 00:00:00.000000       2015-05-31   
7  556d4b7d3366d9c5c4387d4b  2015-05-31 00:00:00.000000       2015-05-31   
8  556d4f3bdec804de5243c71e  2015-05-31 00:00:00.000000       2015-05-31   
9  556d589b80e12813f3871cf0  2015-05-31 00:00:00.000000       2015-05-31   

   VisitedWeek0  VisitedWeek1  VisitedWeek2  VisitedWeek3  VisitedWeek4  \
0             1             0             0             0             0   
1            