In [1]:
import pandas as pd
from datetime import date, datetime, timedelta # to manipulate dates

import matplotlib.pyplot as plt
% matplotlib inline

from makeWeekList import makeWeekList

# Base directory for every CSV this notebook reads or writes. File names are
# appended with plain string concatenation, so the trailing slash is required.
# NOTE(review): hard-coded absolute path on one machine -- change it before
# running the notebook anywhere else.
path='/Users/brianna/Documents/WL_DBdeets/'

In [2]:
# Add a column for how many weeks out of the 12 we're looking at did the person use the product
# We can use this later as an alternate metric of 'retention'.
def calculateWeeksVisitedOutOf12(df):
    """Add 'WeeksVisitedOutOf12': the row-wise total of VisitedWeek0..VisitedWeek11.

    The frame is modified in place and also returned. All twelve
    'VisitedWeek<N>' columns must already be present in df.
    """
    df['WeeksVisitedOutOf12'] = 0

    # Accumulate one weekly visit column at a time onto the running total.
    for visitColumn in ('VisitedWeek' + str(weekNum) for weekNum in range(12)):
        df['WeeksVisitedOutOf12'] = df['WeeksVisitedOutOf12'] + df[visitColumn]

    return df

In [3]:
def calculateRetentionStatus(df):
    """Derive retention-status columns from the weekly 'VisitedWeek<N>' columns.

    The frame is modified in place and also returned. Columns written:

    RetentionStatusRetainedMth0
        Sum of VisitedWeek1..VisitedWeek3, with every positive sum set to 1.
    RetentionStatusRetainedMth2
        Sum of VisitedWeek8..VisitedWeek11, with every positive sum set to 1.
    RetentionStatus4Levels
        Starts as a copy of VisitedWeek0, is set to 2 where weeks 1-3 had a
        visit, then to 3 where weeks 8-11 had a visit (the later rule wins).
        Intended reading, per the original notes:
          0 = churned after Week0 (or never made it to Week0)
          1 = churned after the first week
          2 = churned after the first month
          3 = still around at month 3
    RetentionStatus2Levels
        RetentionStatus4Levels with levels 1 and 2 mapped to 0 and level 3
        mapped to 1.
    """

    def flagAnyVisit(columnName, weekNumbers):
        # Total the requested weeks, remember which rows had at least one
        # visit, then set those positive totals to 1.
        df[columnName] = 0
        for weekNum in weekNumbers:
            df[columnName] = df[columnName] + df['VisitedWeek' + str(weekNum)]
        hadVisit = df[columnName] > 0
        df.loc[hadVisit, columnName] = 1
        return hadVisit

    # Baseline level is simply whether the user showed up in Week0.
    df['RetentionStatus4Levels'] = df['VisitedWeek0'][:]

    # Any visit in weeks 1-3 (rest of the first month) lifts the level to 2.
    retainedMth0 = flagAnyVisit('RetentionStatusRetainedMth0', range(1, 4))
    df.loc[retainedMth0, 'RetentionStatus4Levels'] = 2

    # Any visit in weeks 8-11 (third month) lifts the level to 3.
    retainedMth2 = flagAnyVisit('RetentionStatusRetainedMth2', range(8, 12))
    df.loc[retainedMth2, 'RetentionStatus4Levels'] = 3

    # Collapse to two levels. The order matters: 3 -> 1 must come last so the
    # freshly written 1s are not turned back into 0s.
    df['RetentionStatus2Levels'] = df['RetentionStatus4Levels'][:]
    for oldLevel, newLevel in ((1, 0), (2, 0), (3, 1)):
        atOldLevel = df['RetentionStatus2Levels'] == oldLevel
        df.loc[atOldLevel, 'RetentionStatus2Levels'] = newLevel

    return df

In [4]:
def addNewColumns(df, variableList):
    """Add two all-None placeholder columns per name: '<name>' and '<name>Bin'.

    The frame is modified in place and also returned. A column that already
    exists under one of those names is overwritten with None.
    """
    for baseName in variableList:
        for suffix in ('', 'Bin'):
            df[baseName + suffix] = None
    return df

In [5]:
# Read in the dataframe with meetings data and assign the correct information
# to each person's Week0
def calculateWeekXMeetingsInfo(df, weekList, weekStringList, X):
    """Copy each cohort's week-X meeting statistics into '<stat>Week<X>' columns.

    For every index w in range(len(weekList) - X - 1) the cohort is the set of
    rows whose 'registrationWeek' equals weekStringList[w]. Their statistics
    are read from the CSV named

        path + weekStringList[w+X] + 'to' + weekStringList[w+X+1]
             + 'EvntsCollapsedByUser.csv'

    which is left-merged onto df by 'user_id'. The five statistics
    (MeetingsTotal, organizerTotal, PortionOfOrganizer, num_attendees,
    AveNumAttendees) are copied to '<stat>Week<X>' for the cohort rows only,
    and the merged-in statistic columns are then dropped so the next week's
    merge can recreate them. Any other column the CSV carries stays in the
    frame.

    Parameters
    ----------
    df : DataFrame with at least 'user_id' and 'registrationWeek'.
    weekList : sequence; only its length is used.
    weekStringList : week-start strings, indexed in step with weekList.
    X : int, offset in weeks from the cohort's registration week.

    Returns
    -------
    The merged DataFrame. Merging produces new frames, so callers must use
    the return value rather than rely on the input being updated.
    """
    meetingStats = ['MeetingsTotal', 'organizerTotal', 'PortionOfOrganizer',
                    'num_attendees', 'AveNumAttendees']
    weekSuffix = 'Week' + str(X)

    for w in range(len(weekList)-X-1):
        week0 = weekStringList[w]
        date_start = weekStringList[w+X]
        date_end = weekStringList[w+X+1]

        # Load that week's per-user meeting summary and line it up by user.
        meetings = pd.read_csv(path+date_start+'to'+date_end+'EvntsCollapsedByUser.csv')
        df = df.merge(meetings, on='user_id', how='left')

        # Only the cohort that registered in week0 takes values from this file.
        condition = df['registrationWeek'] == week0
        for stat in meetingStats:
            df.loc[condition, stat+weekSuffix] = df.loc[condition, stat]

        # Remove the merged-in columns so the next iteration's merge does not
        # collide with them.
        df = df.drop(meetingStats, axis=1)

    return df

In [6]:
# Loop through weeks and fill in information on when people completed the following activities
# during their Week0 and Week1.  The variable x here is which week you want this info for.
def calculateWeekXEventsInfo(df, weekList, weekStringList, X):
    """Copy each cohort's week-X event counts into '<event>Week<X>' columns.

    For every index w in range(len(weekList) - 1 - X), rows whose
    'registrationWeek' equals weekStringList[w] take their values from the
    per-week source columns '<event><week>', where <week> is
    weekStringList[w+X] with dashes replaced by underscores. The events
    handled are added_meeting, added_agenda_item and assigned_action_item
    (added_note and viewed_onboarding__tour_4_page are not copied here).

    Rows that end up without a value get 0. The frame is modified in place
    and also returned.
    """
    eventNames = ('added_meeting', 'added_agenda_item', 'assigned_action_item')
    weekSuffix = 'Week' + str(X)

    for cohortIdx in range(len(weekList) - 1 - X):
        cohortWeek = weekStringList[cohortIdx]
        # Source columns carry the week date with underscores, not dashes.
        sourceSuffix = weekStringList[cohortIdx + X].replace('-', '_')

        # Rows belonging to the cohort that registered in cohortWeek.
        inCohort = df['registrationWeek'] == cohortWeek

        for eventName in eventNames:
            df.loc[inCohort, eventName + weekSuffix] = df.loc[inCohort, eventName + sourceSuffix]

    # Rows never matched by any cohort are still null: treat them as zero events.
    for eventName in eventNames:
        targetColumn = eventName + weekSuffix
        df.loc[df[targetColumn].isnull(), targetColumn] = 0

    return df

In [7]:
# Create a binary variable for each of the variables we're interested in.  (This
# will come in handy later when we want to plot counts of each variable.)
def createBinaryVariable(df, variableList, wk):
    """Add a '<name><wk>Bin' presence flag for every name in variableList.

    The flag is 0 where '<name><wk>' is null and 1 where it holds any value.
    A column whose nulls were already replaced (for example with 0) therefore
    gets 1 on every row. The frame is modified in place and also returned.
    """
    for baseName in variableList:
        sourceColumn = baseName + str(wk)
        flagColumn = sourceColumn + 'Bin'

        isMissing = df[sourceColumn].isnull()
        df.loc[isMissing, flagColumn] = 0
        df.loc[~isMissing, flagColumn] = 1

    return df

In [8]:
# Calculate the difference in events from week X to week Y
def calculateWeekX_YDiff(df, variableList, X, Y):
    """Add '<name><Y>_<X>diff' = '<name><Y>' - '<name><X>' for each name.

    The frame is modified in place and also returned.
    """
    laterWeek, earlierWeek = str(Y), str(X)
    for baseName in variableList:
        diffColumn = baseName + laterWeek + '_' + earlierWeek + 'diff'
        df[diffColumn] = df[baseName + laterWeek].sub(df[baseName + earlierWeek])
    return df

In [9]:
def deleteUnnamedCols(df):
    """Drop the 'Unnamed: 0_x' / 'Unnamed: 0_y' columns and report other 'Unnamed' ones.

    The two named columns are removed when present; a frame that lacks one or
    both is accepted rather than raising KeyError. Any remaining column whose
    label contains 'Unnamed' is printed but kept, so it can be inspected
    before deciding whether to drop it.

    Returns a new DataFrame; the input frame is not modified.
    """
    df = df.drop(['Unnamed: 0_x', 'Unnamed: 0_y'], axis=1, errors='ignore')

    for col in df.columns:
        # str() keeps the check safe for non-string column labels.
        if 'Unnamed' in str(col):
            print(col)

    return df

# Main Function

Run through each function to manipulate the retention dataframe to include the variables you want.

In [10]:
# Main function.
# Read in the retention dataframe and run it through each helper in turn.
# Every step rebinds `retention`, so this cell is meant to be run top to bottom
# from a freshly loaded CSV.

# Read in the dataframe that has retention information for each user.
#retention = pd.read_csv(path+'Retention.csv')
retention = pd.read_csv(path+'04_UserRegistrationDateMoreDetailsRetention.csv')

# Add a variable for weeks visited out of 12
retention =  calculateWeeksVisitedOutOf12(retention)

# Add the RetentionStatus* columns derived from the weekly visit flags.
retention = calculateRetentionStatus(retention)

# Add new columns for meeting data and events data in Week 0.
# We'll fill these columns in later.
# Add columns for information on user meetings and user interaction with product at Week0
# variableList = ['MeetingsTotalWeek', 'organizerTotalWeek','PortionOfOrganizerWeek','num_attendeesWeek',
#                 'AveNumAttendeesWeek', 'added_meetingWeek', 'added_agenda_itemWeek', 'added_noteWeek',
#                 'assigned_action_itemWeek', 'viewed_onboarding__tour_4_pageWeek']
# The names below are prefixes: the later steps append the week number
# (and 'Bin' / '_<X>diff') to them.
variableList = ['MeetingsTotalWeek', 'organizerTotalWeek','PortionOfOrganizerWeek','num_attendeesWeek',
                'AveNumAttendeesWeek', 'added_meetingWeek', 'added_agenda_itemWeek',
                'assigned_action_itemWeek']

# NOTE(review): addNewColumns creates columns named exactly as listed (no week
# number) plus '<name>Bin'. The steps below write to '<name>0' / '<name>1'
# instead, so these placeholders appear to stay all-None -- confirm whether
# they are still wanted in the output.
retention = addNewColumns(retention, variableList)

# Make a list of weeks that people started (cohorts)
start_date = '2015-05-24'; end_date = '2016-01-26'
weekList, weekStringList = makeWeekList(start_date, end_date)

# Fill in weekly meetings info for each cohort's Week0 and Week1
retention = calculateWeekXMeetingsInfo(retention, weekList, weekStringList, 0)
retention = calculateWeekXMeetingsInfo(retention, weekList, weekStringList, 1)

# Fill in weekly events info for each cohort's Week0 and Week1
#eventsInfo = pd.read_csv(path+'UserRegistrationDateMoreDetails.csv')
retention = calculateWeekXEventsInfo(retention, weekList, weekStringList,0)
retention = calculateWeekXEventsInfo(retention, weekList, weekStringList,1)
# NOTE(review): calculateWeekXEventsInfo has already replaced nulls with 0 in
# the three event columns, so their 'Bin' flags come out as 1 on every row;
# only the meetings columns can still produce a 0 here.
retention = createBinaryVariable(retention, variableList, 0)
retention = createBinaryVariable(retention, variableList, 1)

# Calculate the events velocity (difference in events from Week0 to Week1)
retention = calculateWeekX_YDiff(retention, variableList, 0, 1)

# Drop the 'Unnamed: 0_x' / 'Unnamed: 0_y' columns and print any other
# 'Unnamed' column that is still present.
retention = deleteUnnamedCols(retention)

In [11]:
# Persist the fully populated retention frame.
# NOTE(review): to_csv writes the row index by default, which presumably shows
# up as an 'Unnamed: 0' column when the file is read back -- confirm whether
# downstream readers expect it.
retention.to_csv(path+'05_RetentionMeetingsBehaviorFull.csv')

In [None]:
# Quick visual check of the first rows of the final frame.
retention.head()