# Pandas merge tutorial with cellphone usage data
In this case, we will examines three datasets that are related.
1. examine the user_usage.csv, user_device.csv and android_devices.csv:
    1. what is the shape of each data
    2. what is use_id and user_id 
    3. which variable uniquely identify each row in each dataset
    (can each user have multiple usages and multiple devices?)
2. how to get average usages for each type of devices
    1. merge user_usage with user_device
    2. compare the shape of the results with the shape of original datasets
    3. left merge, right merge, inner merge and outer merge
    4, identify origins of records in an outer merge
3. what if the merge key (primary key) is not unique?
4. exercise: get average usage for each phone brand

In [1]:
import pandas as pd
pd.set_option('max_rows', None)
pd.set_option('max_columns', None)

### read and examine data

In [2]:
user_usage = pd.read_csv("user_usage.csv")
user_device = pd.read_csv("user_device.csv")
devices = pd.read_csv("android_devices.csv")

In [3]:
for data in [user_usage, user_device, devices]:
    print(' ')
    print(data.head())
    print('the data has {} rows'.format(data.shape[0]) )
    print('the number of unique values in each row:')
    print( data.nunique().reset_index()  )

 
   outgoing_mins_per_month  outgoing_sms_per_month  monthly_mb  use_id
0                    21.97                    4.82     1557.33   22787
1                  1710.08                  136.88     7267.55   22788
2                  1710.08                  136.88     7267.55   22789
3                    94.46                   35.17      519.12   22790
4                    71.59                   79.26     1557.33   22792
the data has 240 rows
the number of unique values in each row:
                     index    0
0  outgoing_mins_per_month  178
1   outgoing_sms_per_month  169
2               monthly_mb   83
3                   use_id  240
 
   use_id  user_id platform  platform_version     device  use_type_id
0   22782    26980      ios              10.2  iPhone7,2            2
1   22783    29628  android               6.0    Nexus 5            3
2   22784    28473  android               5.1   SM-G903F            1
3   22785    15200      ios              10.2  iPhone7,2           

### merge user_usage with user_device

In [4]:
merged = pd.merge(user_usage, user_device, on='use_id')
merged.head()

Unnamed: 0,outgoing_mins_per_month,outgoing_sms_per_month,monthly_mb,use_id,user_id,platform,platform_version,device,use_type_id
0,21.97,4.82,1557.33,22787,12921,android,4.3,GT-I9505,1
1,1710.08,136.88,7267.55,22788,28714,android,6.0,SM-G930F,1
2,1710.08,136.88,7267.55,22789,28714,android,6.0,SM-G930F,1
3,94.46,35.17,519.12,22790,29592,android,5.1,D2303,1
4,71.59,79.26,1557.33,22792,28217,android,5.1,SM-G361F,1


In [5]:
merged.shape

(159, 9)

In [6]:
merged = pd.merge(user_usage, user_device, how = 'left', on='use_id')
merged.head()

Unnamed: 0,outgoing_mins_per_month,outgoing_sms_per_month,monthly_mb,use_id,user_id,platform,platform_version,device,use_type_id
0,21.97,4.82,1557.33,22787,12921.0,android,4.3,GT-I9505,1.0
1,1710.08,136.88,7267.55,22788,28714.0,android,6.0,SM-G930F,1.0
2,1710.08,136.88,7267.55,22789,28714.0,android,6.0,SM-G930F,1.0
3,94.46,35.17,519.12,22790,29592.0,android,5.1,D2303,1.0
4,71.59,79.26,1557.33,22792,28217.0,android,5.1,SM-G361F,1.0


In [7]:
merged.shape

(240, 9)

In [8]:
merged = pd.merge(user_usage, user_device, how = 'right', on='use_id')
merged.head()

Unnamed: 0,outgoing_mins_per_month,outgoing_sms_per_month,monthly_mb,use_id,user_id,platform,platform_version,device,use_type_id
0,,,,22782,26980,ios,10.2,"iPhone7,2",2
1,,,,22783,29628,android,6.0,Nexus 5,3
2,,,,22784,28473,android,5.1,SM-G903F,1
3,,,,22785,15200,ios,10.2,"iPhone7,2",3
4,,,,22786,28239,android,6.0,ONE E1003,1


In [9]:
merged.shape

(272, 9)

### duplicate merge key

In [10]:
user_usage2 = user_usage.copy()
user_device2 = user_device.copy()
user_usage2.loc[user_usage2['use_id']==22787, 'use_id']=22788
user_device2.loc[user_device2['use_id']==22787, 'use_id']=22788

In [11]:
merged = pd.merge(user_usage2, user_device2, on='use_id')
merged.head()

Unnamed: 0,outgoing_mins_per_month,outgoing_sms_per_month,monthly_mb,use_id,user_id,platform,platform_version,device,use_type_id
0,21.97,4.82,1557.33,22788,12921,android,4.3,GT-I9505,1
1,21.97,4.82,1557.33,22788,28714,android,6.0,SM-G930F,1
2,1710.08,136.88,7267.55,22788,12921,android,4.3,GT-I9505,1
3,1710.08,136.88,7267.55,22788,28714,android,6.0,SM-G930F,1
4,1710.08,136.88,7267.55,22789,28714,android,6.0,SM-G930F,1


In [12]:
merged.shape

(161, 9)

In [13]:
merged[merged['use_id']==22788]

Unnamed: 0,outgoing_mins_per_month,outgoing_sms_per_month,monthly_mb,use_id,user_id,platform,platform_version,device,use_type_id
0,21.97,4.82,1557.33,22788,12921,android,4.3,GT-I9505,1
1,21.97,4.82,1557.33,22788,28714,android,6.0,SM-G930F,1
2,1710.08,136.88,7267.55,22788,12921,android,4.3,GT-I9505,1
3,1710.08,136.88,7267.55,22788,28714,android,6.0,SM-G930F,1


### merge phone brand

In [14]:
merged1 = pd.merge(user_usage, user_device, how = 'left', on='use_id')
merged2 = pd.merge(merged1, devices[['Device', 'Retail Branding']], how = 'inner', left_on = 'device', right_on = 'Device')
merged2.head()

Unnamed: 0,outgoing_mins_per_month,outgoing_sms_per_month,monthly_mb,use_id,user_id,platform,platform_version,device,use_type_id,Device,Retail Branding
0,94.46,35.17,519.12,22790,29592.0,android,5.1,D2303,1.0,D2303,Sony
1,99.23,35.58,519.12,22854,29592.0,android,5.1,D2303,1.0,D2303,Sony
2,283.3,107.47,15573.33,22806,21615.0,android,6.0,A0001,1.0,A0001,OnePlus
3,283.3,107.47,15573.33,22806,21615.0,android,6.0,A0001,1.0,A0001,OnePlus
4,57.49,16.73,15573.33,22839,29655.0,android,6.0,A0001,1.0,A0001,OnePlus
