In [1]:
%matplotlib inline

In [2]:
import pandas as pd
import numpy as np
from pathlib import Path
import json
import matplotlib.pyplot as plt

# Directory holding the data files read by this notebook.
PATH = Path('data')
# iterdir() yields entries in arbitrary order; sort them so the displayed
# listing is reproducible from run to run.
sorted(PATH.iterdir())

[WindowsPath('data/cc_info.csv'),
 WindowsPath('data/conversion_data.csv'),
 WindowsPath('data/employee_retention_data.csv'),
 WindowsPath('data/Fraud_Data.csv'),
 WindowsPath('data/Fraud_Data_Country.csv'),
 WindowsPath('data/IpAddress_to_Country.csv'),
 WindowsPath('data/song.json'),
 WindowsPath('data/test_table.csv'),
 WindowsPath('data/user_table.csv')]

In [58]:
def get_data():
    """Load the song-play log and return it cleaned and sorted.

    Reads ``song.json`` from ``PATH`` and returns a DataFrame in which:

    * ``signup_day`` is the day-of-month of ``user_sign_up_date``
    * ``time_played`` is parsed to datetime
    * ``id`` and ``user_sign_up_date`` are removed
    * ``song_played`` and ``user_state`` are categorical
    * rows are ordered by ``time_played``, ``signup_day``, ``user_id`` and the
      index is renumbered from 0

    NOTE(review): ``signup_day`` keeps only the day of the month, so comparing
    sign-ups by it presumably assumes they all fall in one calendar month --
    confirm against the data.
    """
    raw = pd.read_json(PATH/'song.json')
    return (
        raw
        .assign(
            signup_day=pd.to_datetime(raw.user_sign_up_date).dt.day,
            time_played=pd.to_datetime(raw.time_played),
        )
        .drop(['id','user_sign_up_date'], axis=1)
        .astype({'song_played': 'category', 'user_state': 'category'})
        .sort_values(['time_played','signup_day','user_id'])
        .reset_index(drop=True)
    )

In [59]:
# Build the cleaned play log once; later cells read from `df`.
df = get_data()
df.shape

(4000, 5)

In [63]:
# Inspect the column dtypes of the loaded frame.
df.dtypes

song_played          category
time_played    datetime64[ns]
user_id                 int64
user_state           category
signup_day              int64
dtype: object

In [60]:
# Last 10 rows; get_data() sorts by time_played, so these are the latest plays.
df.tail(10)

Unnamed: 0,song_played,time_played,user_id,user_state,signup_day
3990,Ticket to Ride,2015-06-28 22:21:53,189,Florida,20
3991,Let It Be,2015-06-28 22:22:25,14,Ohio,2
3992,While My Guitar Gently Weeps,2015-06-28 22:31:14,91,North Carolina,10
3993,Strawberry Fields Forever,2015-06-28 22:36:10,146,Indiana,18
3994,A Day In The Life,2015-06-28 22:54:54,150,Oregon,18
3995,Revolution,2015-06-28 22:58:23,195,Alaska,20
3996,Paperback Writer,2015-06-28 22:59:27,189,Florida,20
3997,We Can Work It Out,2015-06-28 23:12:51,137,Wisconsin,17
3998,Come Together,2015-06-28 23:26:38,158,North Carolina,19
3999,Lucy In The Sky With Diamonds,2015-06-28 23:46:06,96,South Carolina,12


In [88]:
# Number of distinct users in the play log.
df.user_id.nunique()

196

In [96]:
# Each user_id has exactly one signup_day: the number of distinct
# (user_id, signup_day) pairs equals the number of distinct users.
# The mapping is many-to-one rather than one-to-one, because several users
# can share the same signup_day.
user_day_pairs = df[['user_id','signup_day']].drop_duplicates()
assert len(user_day_pairs) == df.user_id.nunique(), 'a user_id maps to more than one signup_day'
len(user_day_pairs)

196

In [97]:
# Each user_id has exactly one user_state: the number of distinct
# (user_id, user_state) pairs equals the number of distinct users.
# The mapping is many-to-one rather than one-to-one, because several users
# can live in the same state.
user_state_pairs = df[['user_id','user_state']].drop_duplicates()
assert len(user_state_pairs) == df.user_id.nunique(), 'a user_id maps to more than one user_state'
len(user_state_pairs)

196

What are the top 3 and the bottom 3 states in terms of number of users?

In [98]:
# Count distinct users per state and keep the three largest counts.
(
    df.groupby('user_state')['user_id']
    .nunique()
    .nlargest(3)
)

user_state
New York      23
California    21
Texas         15
Name: user_id, dtype: int64

In [104]:
# The bottom of the ranking is a tie, so a fixed "bottom 3" would be arbitrary.
# Show every state that shares the minimum user count, instead of hard-coding
# how many such states there are.
users_per_state = df.groupby(['user_state']).user_id.nunique()
users_per_state[users_per_state == users_per_state.min()]

user_state
Arizona         1
Connecticut     1
Idaho           1
Iowa            1
Kansas          1
Nebraska        1
New Mexico      1
North Dakota    1
Rhode Island    1
Name: user_id, dtype: int64

What are the top 3 and the bottom 3 states in terms of user engagement? You can choose how to mathematically define user engagement. What the CEO cares about here is in which states users are using the product a lot/very little.

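Define user engagement for each state as (# of times the site is used to play a song) / (# of users in that state), i.e. the average number of plays per user. If this equals 1, each user has used the site only once, which means low engagement; larger values mean higher engagement.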

In [113]:
# Per-state totals: number of play records and number of distinct users,
# placed side by side with their column names supplied via `keys`.
df_state_eng = pd.concat(
    [
        df.groupby('user_state').size(),
        df.groupby('user_state')['user_id'].nunique(),
    ],
    axis=1,
    keys=['total_records','total_users'],
)
df_state_eng.head()

Unnamed: 0_level_0,total_records,total_users
user_state,Unnamed: 1_level_1,Unnamed: 2_level_1
Alabama,104,4
Alaska,58,2
Arizona,22,1
Arkansas,34,2
California,425,21


In [115]:
# Engagement = play records per user for each state; order the states from
# the highest to the lowest engagement.
df_state_eng = (
    df_state_eng
    .assign(engagement=df_state_eng.total_records / df_state_eng.total_users)
    .sort_values('engagement', ascending=False)
)
df_state_eng.head()

Unnamed: 0_level_0,total_records,total_users,engagement
user_state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Nebraska,36,1,36.0
Alaska,58,2,29.0
Mississippi,85,3,28.333333
South Carolina,85,3,28.333333
Rhode Island,27,1,27.0


In [117]:
# Top 3 states by engagement; relies on df_state_eng being sorted by 'engagement' in descending order.
df_state_eng.head(3)

Unnamed: 0_level_0,total_records,total_users,engagement
user_state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Nebraska,36,1,36.0
Alaska,58,2,29.0
Mississippi,85,3,28.333333


In [118]:
# Bottom 3 states by engagement; relies on df_state_eng being sorted by 'engagement' in descending order.
df_state_eng.tail(3)

Unnamed: 0_level_0,total_records,total_users,engagement
user_state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Minnesota,42,4,10.5
Virginia,17,2,8.5
Kansas,8,1,8.0


First user to sign up for each state

In [194]:
def get_first_user(df):
    """Return the ``user_id`` of the row with the smallest ``signup_day``.

    Parameters
    ----------
    df : DataFrame
        Must contain ``signup_day`` and ``user_id`` columns.

    Returns
    -------
    The ``user_id`` stored at the index label where ``signup_day`` is minimal.
    When several rows share the minimum, ``idxmin`` selects the first of them.
    """
    signup_days = df['signup_day']
    return df.loc[signup_days.idxmin(), 'user_id']

In [195]:
# For each state, report the user_id found on the row with the earliest signup_day.
# Only the two columns get_first_user reads are selected, so the grouping
# column is not passed into apply (pandas >= 2.2 deprecates that).
# NOTE(review): if several users in a state share the earliest signup_day,
# only one of them is reported -- confirm whether ties matter here.
df.groupby('user_state')[['signup_day','user_id']].apply(get_first_user)

user_state
Alabama             5
Alaska            106
Arizona           105
Arkansas           78
California         39
Colorado          173
Connecticut       127
Florida            41
Georgia            20
Idaho             165
Illinois           45
Indiana           102
Iowa              178
Kansas            177
Kentucky           34
Louisiana          50
Maryland           18
Massachusetts      15
Michigan           13
Minnesota           8
Mississippi        23
Missouri           85
Nebraska          134
New Jersey          6
New Mexico          4
New York           12
North Carolina      2
North Dakota      135
Ohio                3
Oklahoma          119
Oregon              1
Pennsylvania       11
Rhode Island      174
South Carolina     64
Tennessee          70
Texas               7
Utah               29
Virginia          142
Washington        125
West Virginia      60
Wisconsin          32
dtype: int64