### Background:
Dataset from a personal health tracking app, including system/device information and personalized app attributes for each user and app installation.
### Purpose of this notebook:
- Exploratory data analysis
- Clean up raw data
- Add new features

In [None]:
# import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [None]:
# Read the raw Excel export once and cache it as a pickle so that later runs
# can start from the (much faster) pickle instead of re-parsing the workbook.
# The keyword is `sheet_name`: the old `sheetname` spelling was deprecated in
# pandas 0.21 and later removed, so it raises TypeError on current pandas.
df = pd.read_excel('App Stats 2018-03-16.xlsx', sheet_name='Sheet1')
df.to_pickle('df_2018-03-16')

In [None]:
# Load the dataframe from the pickle file 'df_2018-03-16'
df = pd.read_pickle('df_2018-03-16')

In [None]:
# conversion rate 0.34%, according to PM, this is average/good for this type of app
# (number of rows flagged IsPaid divided by the number of non-null IsPaid rows)
paid_users = (df['IsPaid'] == True).sum()
paid_users / df['IsPaid'].count()

In [None]:
# Summary of the dataframe's columns (dtypes and non-null counts)
df.info()

In [None]:
# Heatmap of the null mask: one column per feature, one row per record.
# Only nulls are highlighted; zero values are not treated as missing here.
missing_mask = df.isna()
plt.figure(figsize=(12, 6))
sns.heatmap(missing_mask, cmap='viridis', cbar=False, yticklabels=False)

# Exploratory data analysis
Which features might be used in a pricing model and/or predicting user conversion? Looking for any signals in the data.

## Device features:

### Platform
- iOS users 6x more likely to purchase product compared to Android users

In [None]:
# Row count and mean of IsPaid for each ReminderFrequency value
df.pivot_table(index='ReminderFrequency', values='IsPaid', aggfunc=('count','mean'))

### ShowMealTime
- whether or not this feature is chosen is a signal

In [None]:
# Row count and mean of IsPaid for each ShowMealTime value
df.pivot_table(index='ShowMealTime', values='IsPaid', aggfunc=('count','mean'))

### TargetWeight
- whether or not target weight is inputted is a signal
- correlation with age
- target weight > 200 kg looks suspicious; possible software bug?
- minimum age required for downloading app is officially 15 years old

In [None]:
# Split users by whether TargetWeight is non-zero, then report the row count
# and the mean of IsPaid for each of the two groups.
has_target_weight = df['TargetWeight'].ne(0)
pivot_by_weight = df.pivot_table(index=has_target_weight, values='IsPaid',
                                 aggfunc=('count', 'mean'))

pivot_by_weight

In [None]:
# Second column of pivot_by_weight, scaled to percent; positional row 1 is
# reported as "input TargetWeight", positional row 0 as "DID NOT input".
pct_by_group = pivot_by_weight.iloc[:, 1] * 100
rate1 = pct_by_group.iloc[1]
rate2 = pct_by_group.iloc[0]
print("Conversion rate, input TargetWeight: %.2f%%" % rate1)
print("Conversion rate, DID NOT input TargetWeight: %.2f%%" % rate2)

In [None]:
# Histogram of the strictly positive target weights, x-axis limited to 0-300
positive_target_weight = df.loc[df['TargetWeight'] > 0, 'TargetWeight']
plt.figure()
positive_target_weight.plot.hist(bins=100)
plt.xlim(0, 300)
plt.xlabel('Target weight (kg)')

In [None]:
# Empirical cumulative distribution of the strictly positive target weights,
# approximated from a fixed-width histogram.
df_tw = df.loc[df['TargetWeight'] > 0, 'TargetWeight']

num_bins = 100
counts, bin_edges = np.histogram(df_tw.values, bins=num_bins)
# per-bin fraction of all positive weights, accumulated left to right
bin_fraction = counts / df_tw.count()
cdf = np.cumsum(bin_fraction)

# plot each cumulative value against the right edge of its bin
plt.step(bin_edges[1:], cdf)
plt.xlim(-10, 210)
plt.ylim(-0.05, 1.05)
plt.xlabel('Target weight (kg)')
plt.ylabel('CDF')

In [None]:
# Summary statistics of the strictly positive target weights
df.loc[df['TargetWeight'] > 0, 'TargetWeight'].describe()

In [None]:
# Scatter of TargetWeight against AgeWhenGoalsSet, coloured by IsPaid
# (fit_reg=False: points only, no regression line)
sns.lmplot(data=df, x='AgeWhenGoalsSet', y='TargetWeight',
           hue='IsPaid', fit_reg=False)

## Cleaning the data and adding new features

### DaysSinceInstall: all zeros, calculate # days since install including first day 

In [None]:
# The 20 UICulture values with the most rows (value_counts sorts descending)
top_20_UICultures = df['UICulture'].value_counts().index[:20].tolist()

In [None]:
# Row count and mean of IsPaid per UICulture, restricted to the top-20 cultures
in_top_20 = df['UICulture'].isin(top_20_UICultures)
pivot_UICulture = df.loc[in_top_20].pivot_table(index='UICulture', values='IsPaid',
                                                aggfunc=('count','mean'))


In [None]:
# Reorder the rows so the cultures with the largest 'count' come first
install_order = pivot_UICulture['count'].sort_values(ascending=False).index
pivot_UICulture = pivot_UICulture.reindex(install_order)

In [None]:
# Express the mean of IsPaid as a percentage rounded to two decimals of a percent,
# then relabel the two columns for display.
pivot_UICulture['mean'] = pivot_UICulture['mean'].round(4).mul(100)
pivot_UICulture.columns = ['total installs', 'conversion rate']

In [None]:
# Display the per-UICulture table
pivot_UICulture

### Variation

In [None]:
# Row count and mean of IsPaid for each Variation value
df.pivot_table(index='Variation', values='IsPaid', aggfunc=('count', 'mean'))

### LockedBottom 

In [None]:
# Row count and mean of IsPaid for each LockedBottom value
df.pivot_table(index='LockedBottom', values='IsPaid', aggfunc=('count', 'mean'))

### Session count
- even amongst paid users, large drop after 20 app sessions -- suggests app retention and/or user compliance issue

In [None]:
# Histogram of SessionCount for paid users only, x-axis limited to 0-600
paid_session_counts = df.loc[df['IsPaid'] == True, 'SessionCount']
paid_session_counts.plot.hist(bins=500)
plt.xlim(0, 600)

### InstallDate
- number of installs jumped up at end of year/beginning of new year attributed to better download funnel (higher ratings, higher placing in search results) and seasonal differences (new year's resolutions)
- drop in conversion rate over time suggests difference between early and later users and maybe app competition
- data after 4-2018 is invalid, dataset only contains data up to 3-2018

In [None]:
# Histogram of AgeWhenGoalsSet over all rows (zero values included)
df['AgeWhenGoalsSet'].plot(kind='hist', bins=100)

In [None]:
# Side-by-side age histograms for paid and non-paid users, restricted to rows
# with a positive AgeWhenGoalsSet.
#
# The masks are built separately and combined with explicit operands because
# `&` binds tighter than `==`: the previous `age_mask & df['IsPaid']==False`
# evaluated as `(age_mask & df['IsPaid']) == False`, which selected every row
# that was NOT (has age AND paid) -- i.e. it pulled rows without an age and
# paid users without an age into the "Non-paid users" panel.
has_age = df['AgeWhenGoalsSet'] > 0
is_paid = df['IsPaid'] == True
is_unpaid = df['IsPaid'] == False

fig, ax = plt.subplots(1, 2, figsize=(10, 6))

sns.distplot(df.loc[has_age & is_paid, 'AgeWhenGoalsSet'],
             kde=False, bins=60, ax=ax[0])
ax[0].set_title('Paid users')

sns.distplot(df.loc[has_age & is_unpaid, 'AgeWhenGoalsSet'],
             kde=False, bins=60, ax=ax[1])
ax[1].set_title('Non-paid users')

plt.suptitle('Age distribution for paid vs unpaid users', fontsize=16)

In [None]:
# Conversion rate (percent of paid users) within each group: users who entered
# an age (AgeWhenGoalsSet != 0) and users who did not (AgeWhenGoalsSet == 0).
#
# Each rate is divided by the size of its OWN group. Previously both numerators
# were divided by the count of all users, which yields each group's share of
# the total population rather than that group's conversion rate.
age = df['AgeWhenGoalsSet']
is_paid = df['IsPaid'] == True
has_age = age != 0
no_age = age == 0

rate = age[has_age & is_paid].count() / age[has_age].count() * 100
print("Conversion rate, input age: %.2f%%" % rate)

rate = age[no_age & is_paid].count() / age[no_age].count() * 100
print("Conversion rate, DID NOT input age: %.2f%%" % rate)

### Gender
- conversion rate (listed as mean) for males is 3x that of females or none (no response)

In [None]:
# Row count and mean of IsPaid for each Gender value
df.pivot_table(index='Gender', values='IsPaid', aggfunc=('count','mean'))

### Program
- this feature does not provide a strong signal and is inherently very subjective

In [None]:
# Row count and mean of IsPaid for each Program value
df.pivot_table(index='Program', values='IsPaid', aggfunc=('count','mean'))

### Onboarding goal
- most users want to lose/maintain/gain weight

In [None]:
# Row count and mean of IsPaid for each OnboardingGoal value
df.pivot_table(index='OnboardingGoal', values='IsPaid', aggfunc=('count','mean'))

### ReminderFrequency

In [None]:
# Per-Platform row count, mean and standard deviation of IsPaid;
# the later t-test-from-summary-statistics cell reads these values.
pivot_by_platform = df.pivot_table(index='Platform', values='IsPaid',
                                   aggfunc=('count','mean','std'))
pivot_by_platform

In [None]:
# Sample size is large but we can use this as an example.
# Example: check for statistical significance using t-test
# Ho (null hypothesis) = means for Android and iOS groups are the same
# Ha (alternative hypothesis) = means are different
# two-tailed test

In [None]:
from scipy.stats import ttest_ind, ttest_ind_from_stats

In [None]:
# Two-sample t-test on IsPaid between Android and iOS rows.
# equal_var=False: the two groups are not assumed to share a variance.
a = df.loc[df['Platform'] == 'Android', 'IsPaid']
b = df.loc[df['Platform'] == 'iOS', 'IsPaid']
t, p = ttest_ind(a, b, equal_var=False)
print('t = ', t)
print('p = ', p)

In [None]:
# Same test as above, computed from the summary statistics in
# pivot_by_platform (mean, std, number of observations per group).
# Rows and columns are looked up by label rather than by position so the
# comparison stays Android-vs-iOS even if another Platform value sorts between
# them or the aggregate columns come back in a different order.
android_stats = pivot_by_platform.loc['Android']
ios_stats = pivot_by_platform.loc['iOS']
t2, p2 = ttest_ind_from_stats(android_stats['mean'], android_stats['std'], android_stats['count'],
                              ios_stats['mean'], ios_stats['std'], ios_stats['count'],
                              equal_var=False)
print('t: ', t2)
print('p: ', p2)

### Free offer product
- conversion rate for those who were offered 'free' product (trial w/ full access to app features) is 0.71% compared to 0.24% for those not offered trial

In [None]:
# Row count and mean of IsPaid per OfferFree value; the mean is shown as a
# percentage and the columns are relabelled for display.
pivot_by_offer = df.pivot_table(index='OfferFree', values='IsPaid', aggfunc=('count','mean'))
pivot_by_offer['mean'] = pivot_by_offer['mean'].round(4).mul(100)
pivot_by_offer.columns = ['total installs', 'conversion rate']
pivot_by_offer

### UICulture
 - different conversion (subscription) rates for countries likely arises from difference in average app spending and relative competition (availability of similar apps in each language)

### TargetWeightInput: whether or not user input any data

In [None]:
# Divide the column by itself: non-zero weights give 1, while 0/0 gives NaN
df['TargetWeightInput'] = df['TargetWeight'].div(df['TargetWeight'])

In [None]:
# Turn the indicator into a boolean flag: missing values become 0 (False),
# everything non-zero becomes True.
# Assign the result back instead of calling fillna(inplace=True) on a column
# selection: that chained in-place form warns on current pandas and does not
# modify `df` under Copy-on-Write.
df['TargetWeightInput'] = df['TargetWeightInput'].fillna(0).astype(bool)

In [None]:
# Number of rows for each TargetWeightInput value
df['TargetWeightInput'].value_counts()

### SessionsPerDay
- for paid users, appears as if most have stopped using the app (assuming active user would access app 1-5x per day)
- possible issue with recording sessions in app data
- need to define what constitutes an active user

In [None]:
# SessionsPerDay = SessionCount / DaysSinceInstall.
# A DaysSinceInstall of 0 is masked to NaN first so the ratio is NaN (unknown)
# instead of +/-inf; infinite values would break histogram binning downstream.
# NOTE(review): this assumes DaysSinceInstall has already been recomputed from
# InstallDate when this cell runs -- confirm the cell order.
days_since_install = df['DaysSinceInstall'].replace(0, np.nan)
df['SessionsPerDay'] = df['SessionCount'] / days_since_install

In [None]:
# Histogram of SessionsPerDay for paid users only, x-axis limited to 0-5
paid_sessions_per_day = df.loc[df['IsPaid'] == True, 'SessionsPerDay']
paid_sessions_per_day.plot.hist(bins=100)
plt.xlim(0, 5)

### WeightDiff
- user's goal (weight loss or gain)

In [None]:
# Target minus last recorded weight: negative values mean the target is below
# the last weight, positive values mean it is above.
df['WeightDiff'] = df['TargetWeight'].sub(df['LastWeight'])

### LastBMI and TargetBMI
- combine height and weight into one variable
- indicator of "health" or "fitness"

In [None]:
# BMI (kg/m^2) = weight (kg) / height^2 (m^2).
# Height is divided by 100 before squaring (presumably stored in cm -- confirm).
height_m_squared = (df['Height'] / 100) ** 2

# current BMI from the last recorded weight
df['LastBMI'] = df['LastWeight'] / height_m_squared

# target BMI from the target weight
df['TargetBMI'] = df['TargetWeight'] / height_m_squared

### AgeGroup: add new feature for age groups based on distribution
- address sparse data issue with few paid users for a given age year
- as anticipated, older users are more willing to pay and/or more likely to have the means to pay than younger users

In [None]:
# Build AgeGroup: 0 for rows where AgeWhenGoalsSet is 0 (no input), and the
# quintile number 1-5 of AgeWhenGoalsSet for every other row.

# rows without an age get group 0 straight away
df_age_zero = df[df['AgeWhenGoalsSet']==0].assign(AgeGroup=0)

# rows with an age are cut into five equal-frequency bins labelled 1..5;
# the categorical labels are converted to plain integers before being attached
df_age_nonzero = df[df['AgeWhenGoalsSet']!=0]
quintiles = pd.qcut(df_age_nonzero['AgeWhenGoalsSet'], 5, labels=[1,2,3,4,5])
df_age_nonzero = df_age_nonzero.assign(AgeGroup=quintiles.astype(int).values)

# stitch the two parts back together (zero-age rows first, then the rest)
df = pd.concat([df_age_zero, df_age_nonzero])

In [None]:
# Row count and mean of IsPaid for each AgeGroup
df.pivot_table(index='AgeGroup', values='IsPaid', aggfunc=('count','mean'))

### LastProduct, LastProductValue, SingleProduct, TwoProducts
- change nan values to 'not paid' or zero

In [None]:
# Replace missing values in the product columns with explicit placeholders.
# Each column is reassigned instead of calling fillna(inplace=True) on a column
# selection: that chained in-place form warns on current pandas and does not
# modify `df` under Copy-on-Write.
product_fill_values = {
    'LastProduct': 'not_paid',
    'LastProductValue': 0,
    'SingleProduct': 'not_offered',
    'TwoProducts': 'not_offered',
}
for column_name, fill_value in product_fill_values.items():
    df[column_name] = df[column_name].fillna(fill_value)

### Save cleaned data and new features

In [None]:
# Write the current dataframe to the pickle file 'df_clean-2018-03-16-newfeat'
df.to_pickle('df_clean-2018-03-16-newfeat')

### QProductAmtOffer: add new feature to show dollar amount offered for a Q subscription product

In [None]:
# Row count and mean of IsPaid grouped by the year and month of InstallDate
install_year = df['InstallDate'].dt.year
install_month = df['InstallDate'].dt.month
Conversion_by_date = df.pivot_table(index=[install_year, install_month],
                                    values='IsPaid', aggfunc=('count', 'mean'))

In [None]:
# Express the mean of IsPaid as a percentage (rounded to 4 decimals before scaling)
Conversion_by_date['mean'] = Conversion_by_date['mean'].round(4).mul(100)

In [None]:
# Relabel the two columns by position
# (assumes the existing order is count, mean -- TODO confirm)
Conversion_by_date.columns = ['total installs', 'conversion rate']

In [None]:
# Display the per-year/month table
Conversion_by_date

## App personalization features input by user:

### User age distribution

In [None]:
# Load the pickle file 'df_clean-2018-03-16-newfeat' into df_clean
df_clean = pd.read_pickle('df_clean-2018-03-16-newfeat')

In [None]:
# Keep only the rows whose SingleProduct contains 'q' (the Q product offers)
# as a separate, independent dataframe.
# The mask is built from df_clean itself -- not from the separate `df` frame --
# so the selection does not depend on `df` being present and index-aligned.
# na=False treats a missing SingleProduct as "no Q product"; .copy() makes
# df_Q safe to add columns to.
is_q_product = df_clean['SingleProduct'].str.contains('q', na=False)
df_Q = df_clean.loc[is_q_product].copy()

In [None]:
# Renumber the rows from 0 and discard the old index labels (drop=True)
df_Q = df_Q.reset_index(drop=True)

In [None]:
# Number of rows for each SingleProduct value in the Q subset
df_Q['SingleProduct'].value_counts()

In [None]:
# The number following 'q' is the price offered.
# Pull out the digits that directly follow a 'q'; rows without a match get 0.
offered_digits = df_Q['SingleProduct'].str.extract(r'(?<=q)(\d+)', expand=False)
df_Q['QProductAmtOffer'] = offered_digits.fillna(0).astype(int)

### QProductAmtPaid: amount paid for a Q product subscription offer

In [None]:
# If user pays, LastProduct records the subscription.
# Find all subscriptions with 'q' and extract the number following it.

In [None]:
# Pull out the digits that directly follow a 'q' in LastProduct;
# rows without a match get 0.
paid_digits = df_Q['LastProduct'].str.extract(r'(?<=q)(\d+)', expand=False)
df_Q['QProductAmtPaid'] = paid_digits.fillna(0).astype(int)

In [None]:
# Number of rows for each QProductAmtPaid value
df_Q['QProductAmtPaid'].value_counts()

### Save cleaned data

In [None]:
# DaysSinceInstall does not have values
# (list the distinct values and how often each occurs to check this)
df['DaysSinceInstall'].value_counts()

In [None]:
from datetime import datetime
from datetime import timedelta

In [None]:
# Time elapsed between InstallDate and 2018-03-16, plus one day so that the
# install day itself is included.
reference_end = datetime(2018, 3, 16) + timedelta(days=1)
df['DaysSinceInstall'] = reference_end - df['InstallDate']

In [None]:
# change to integer type: express the timedelta in days, then cast to int
days_as_float = df['DaysSinceInstall'] / np.timedelta64(1, 'D')
df['DaysSinceInstall'] = days_as_float.astype(int)

In [None]:
# something wrong with InstallDate, has many dates in the future
# (count the rows whose DaysSinceInstall is zero or negative)
(df['DaysSinceInstall'] <= 0).sum()

In [None]:
# No paid entries with negative DaysSinceInstall
# (count the IDs of paid rows whose DaysSinceInstall is zero or negative)
nonpositive_and_paid = (df['DaysSinceInstall'] <= 0) & (df['IsPaid'] == True)
df.loc[nonpositive_and_paid, 'ID'].count()

In [None]:
# Reset the entries with zero or negative DaysSinceInstall to 1 day
# (raising everything below 1 up to 1 leaves all other values untouched)
df['DaysSinceInstall'] = df['DaysSinceInstall'].clip(lower=1)

In [None]:
# Re-count the rows whose DaysSinceInstall is zero or negative
(df['DaysSinceInstall'] <= 0).sum()

## Add new features
- create features that allow for comparison
- create features that indicate whether or not user input any data (Age, TargetWeight)

### AgeInput: whether or not user input any data

In [None]:
# Divide the column by itself: non-zero ages give 1, while 0/0 gives NaN
df['AgeInput'] = df['AgeWhenGoalsSet'].div(df['AgeWhenGoalsSet'])

In [None]:
# Missing indicator values become 0.
# Assign the result back instead of calling fillna(inplace=True) on a column
# selection: that chained in-place form warns on current pandas and does not
# modify `df` under Copy-on-Write.
df['AgeInput'] = df['AgeInput'].fillna(0)

In [None]:
# Cast the indicator to bool: zero becomes False, non-zero becomes True
df['AgeInput'] = df['AgeInput'].astype(bool)

In [None]:
# Number of rows for each AgeInput value
df['AgeInput'].value_counts()