In [None]:
import pandas as pd
import numpy as np
#from datetime import date, datetime, timedelta # to manipulate dates

import matplotlib.pyplot as plt
% matplotlib inline
import matplotlib
matplotlib.style.use('ggplot')
from pylab import savefig
from __future__ import division

import seaborn as sns

# Base directory for the input CSVs and for the images/CSV this notebook writes.
# It must end with '/' because every use of `path` builds file names by plain
# string concatenation (e.g. path+'images/...').
path='/Users/brianna/Documents/WL_DBdeets/'

In [None]:
def makeDFsubset(df, varList):
    """Return a copy of ``df`` limited to ``varList`` plus two derived columns.

    Derived columns:
      * ``EmailCorporateVsPrivate`` -- contrast code built from ``emailDomain``:
        0.5 for 'company', -0.5 for 'personal' or 'school', and 0.0 for
        anything else (including missing values). Always a float column.
      * ``dummy`` -- the constant 1 on every row.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame. It is not modified.
    varList : list of str
        Columns to keep. Must contain 'emailDomain'.

    Returns
    -------
    pandas.DataFrame
        A new frame with the selected columns followed by the derived ones.
    """
    # Not Including: 
    # added_noteWeek0 - stopped being collected in September
    # viewed_onboarding__tour_4_pageWeek - only collected starting in September

    # Explicit copy so the assignments below write to an independent frame
    # (avoids SettingWithCopyWarning and never touches the caller's data).
    subset = df[varList].copy()

    # Start from 0.0 rather than 0: assigning +/-0.5 into an integer column is
    # an upcast that recent pandas versions warn about or reject.
    subset['EmailCorporateVsPrivate'] = 0.0
    domain = subset['emailDomain']
    subset.loc[domain == 'company', 'EmailCorporateVsPrivate'] = .5
    subset.loc[domain.isin(['personal', 'school']), 'EmailCorporateVsPrivate'] = -.5

    subset['dummy'] = 1

    return subset

In [None]:
def cleanVars(df, varListContinuous, numStdDev=5):
    """Cap extreme values of each listed column at mean +/- ``numStdDev`` SDs.

    Limits are computed once per column from the values before capping, using
    the population standard deviation (ddof=0) and ignoring missing values.
    Values above the upper limit are set to the upper limit, values below the
    lower limit are set to the lower limit; missing values stay missing.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is modified in place (and also returned).
    varListContinuous : list of str
        Names of the numeric columns to cap.
    numStdDev : number, optional
        Half-width of the allowed band in standard deviations (default 5).

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object. Integer columns may come back as float when a
        value had to be capped, because the limits are floats.
    """
    for v in varListContinuous:
        column = df[v]
        center = column.mean()
        spread = column.std(ddof=0)

        # Empty or all-missing column: there is no limit to apply.
        if pd.isnull(center) or pd.isnull(spread):
            continue

        # clip() returns a new Series (upcasting if needed), so float limits
        # are never written element-wise into an integer column.
        df[v] = column.clip(lower=center - numStdDev*spread,
                            upper=center + numStdDev*spread)

    return df

In [None]:
def plotCorrMatrix(df):
    """Draw the lower-triangle correlation heatmap of ``df`` and save it.

    Only numeric and boolean columns are correlated; other columns (such as
    string categories) are left out. The figure is written to
    ``path + 'images/correlationMatrix.jpg'``, where ``path`` is the
    notebook-level base directory.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose pairwise column correlations are plotted.
    """
    sns.set(style="white")

    # Compute the correlation matrix. Non-numeric columns are dropped
    # explicitly: older pandas skipped them silently, pandas >= 2.0 raises.
    corr = df.select_dtypes(include=['number', 'bool']).corr()

    # Generate a mask for the upper triangle (diagonal included).
    # Builtin bool is used because the np.bool alias no longer exists in NumPy.
    mask = np.zeros_like(corr, dtype=bool)
    mask[np.triu_indices_from(mask)] = True

    # Set up the matplotlib figure
    f, ax = plt.subplots(figsize=(20, 18))

    # Generate a custom diverging colormap
    cmap = sns.diverging_palette(220, 10, as_cmap=True)

    # Draw the heatmap with the mask and correct aspect ratio
    sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.3,
                square=True, xticklabels=True, yticklabels=True,
                linewidths=.5, cbar_kws={"shrink": .5}, ax=ax)

    # Save through the figure handle rather than the implicit current figure.
    f.savefig(path+'images/correlationMatrix.jpg')

In [None]:
def plotVarHist(df, variableName):
    """Plot a histogram of ``df[variableName]`` and save it as a JPEG.

    The number of bins equals the number of distinct values in the column
    (a missing value counts as one distinct value). The image is written to
    ``path + 'images/' + variableName + 'Histogram.jpg'``.
    """
    values = df[variableName]
    binCount = values.nunique(dropna=False)

    fig, ax = plt.subplots(figsize=(20, 10))
    values.hist(bins=binCount, ax=ax)

    ax.set_xlabel(variableName, size=20)
    ax.set_ylabel('Count', size=20)
    fig.savefig(path+'images/'+variableName+'Histogram.jpg')

In [None]:
def plotVarByWeek(df, variableName):
    """Bar-plot the per-week total of ``variableName``.

    Rows are grouped by the 'registrationWeek' column and ``variableName`` is
    summed within each week; one bar is drawn per week. Nothing is saved to
    disk and nothing is returned.
    """
    plt.figure(figsize=(15,15))
    weeklyTotals = df.groupby('registrationWeek')[variableName].sum()

    # Seaborn defaults first, then the white grid style on top of them.
    sns.set()
    sns.set_style("whitegrid")

    ax = sns.barplot(x=weeklyTotals.index, y=weeklyTotals,
                     label = 'medium', color='blue')
    ax.set_xlabel('Week')
    ax.set_ylabel('Instances of '+variableName)

    # Tilt the week labels so neighbouring ticks do not overlap.
    plt.setp(ax.get_xticklabels(), rotation=45)

In [None]:
def imputeVars(df, varListContinuous, maxPctMissing=.25, strategy='mean'):
    """Add imputed '<name>_i' copies of sparsely-missing columns.

    For every column in ``varListContinuous`` the fraction of missing values
    is printed. When that fraction is strictly below ``maxPctMissing`` a new
    column named ``<name>_i`` is added: a copy of the original with missing
    values replaced by the column's mean (default) or median. The original
    column is left untouched. Columns at or above the threshold get no
    ``_i`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to extend. It is modified in place (and also returned).
    varListContinuous : list of str
        Names of the numeric columns to consider.
    maxPctMissing : float, optional
        Exclusive upper bound on the missing fraction (default 0.25).
    strategy : {'mean', 'median'}, optional
        Statistic used as the fill value. The default 'mean' is what this
        function has always computed.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.

    Raises
    ------
    ValueError
        If ``strategy`` is neither 'mean' nor 'median'.
    """
    if strategy not in ('mean', 'median'):
        raise ValueError("strategy must be 'mean' or 'median', got %r" % (strategy,))

    for v in varListContinuous:
        column = df[v]
        totUser = len(column)
        totMissing = int(column.isnull().sum())
        # An empty frame has no defined missing fraction; NaN compares False
        # against the threshold, so nothing is imputed (and nothing divides by 0).
        pctMissing = float(totMissing) / totUser if totUser else float('nan')
        print(v+': '+str(pctMissing))

        # Append a '_i' on the end of the variable so you know it's been imputed.
        if pctMissing < maxPctMissing:
            fillValue = column.mean() if strategy == 'mean' else column.median()
            df[v+'_i'] = column.fillna(fillValue)
    return df

# Main Function

In [None]:
# Columns carried through the analysis. Identifier/label-style columns and
# continuous measures are listed separately because only the continuous ones
# are capped, plotted and imputed further down.
varListCategorical = [
    'user_id',
    'registrationWeek',
    'sharedEmail',
    'emailDomain',
    'RetentionStatus2Levels',
    'RetentionStatus4Levels',
]

varListContinuous = [
    'NumberSignedUpFromCompany',
    # Each measure below comes as a '...Week0' / '...Week1_0diff' pair.
    'MeetingsTotalWeek0',
    'MeetingsTotalWeek1_0diff',
    'organizerTotalWeek0',
    'organizerTotalWeek1_0diff',
    'PortionOfOrganizerWeek0',
    'PortionOfOrganizerWeek1_0diff',
    'num_attendeesWeek0',
    'num_attendeesWeek1_0diff',
    'AveNumAttendeesWeek0',
    'AveNumAttendeesWeek1_0diff',
    'added_meetingWeek0',
    'added_meetingWeek1_0diff',
    'added_agenda_itemWeek0',
    'added_agenda_itemWeek1_0diff',
    'assigned_action_itemWeek0',
    'assigned_action_itemWeek1_0diff',
    'WeeksVisitedOutOf12',
]

# Full column selection: categorical names first, then the continuous ones.
varList = list(varListCategorical)
varList.extend(varListContinuous)

In [None]:
# Load the full retention/meetings table, keep only the columns of interest
# (plus the derived ones makeDFsubset adds), then cap outliers in the
# continuous columns.
df = (
    pd.read_csv(path+'05_RetentionMeetingsBehaviorFull.csv')
    .pipe(makeDFsubset, varList)
    .pipe(cleanVars, varListContinuous)
)

In [None]:
# Plot the correlation matrix of the cleaned frame; plotCorrMatrix also saves
# the figure under path+'images/'.
plotCorrMatrix(df)

In [None]:
# Draw and save one histogram per continuous variable.
for v in varListContinuous:
    plotVarHist(df,v)

In [None]:
# Draw one bar chart per continuous variable showing its total per
# registration week.
for v in varListContinuous:
    plotVarByWeek(df, v)

In [None]:
# For each continuous variable with less than 25% of its values missing, add a
# '<name>_i' column in which the missing values are replaced by the column mean.
df = imputeVars(df, varListContinuous)

In [None]:
# Preview the first ten rows to check the columns produced so far.
df.head(10)

In [None]:
# Persist the final frame next to the input data.
# NOTE(review): index=False is not passed, so pandas also writes the row index
# as an unnamed first column -- confirm downstream readers expect that.
df.to_csv(path+'06_DataFinal.csv')