In [5]:
import pandas as pd
# Load the three tutorial tables in one pass: per-use usage figures, the
# use-to-device mapping, and the Android device catalogue.
user_usage, user_device, devices = (
    pd.read_csv(csv_url)
    for csv_url in (
        'https://raw.githubusercontent.com/shanealynn/Pandas-Merge-Tutorial/master/user_usage.csv',
        'https://raw.githubusercontent.com/shanealynn/Pandas-Merge-Tutorial/master/user_device.csv',
        'https://raw.githubusercontent.com/shanealynn/Pandas-Merge-Tutorial/master/android_devices.csv',
    )
)

In [6]:
# Inspect the column labels of the devices table as loaded from the CSV.
devices.columns

Index(['Retail Branding', 'Marketing Name', 'Device', 'Model'], dtype='object')

In [8]:
# Rename 'Retail Branding' to 'manufacturer'. The renamed frame is rebound to
# `devices` instead of using inplace=True, so no existing object is mutated
# behind the back of a cell that already displayed it.
devices = devices.rename(columns={'Retail Branding': "manufacturer"})
devices.columns

Index(['manufacturer', 'Marketing Name', 'Device', 'Model'], dtype='object')

In [9]:
# Display the usage table (one row per use_id with monthly minutes, SMS and MB).
user_usage

Unnamed: 0,outgoing_mins_per_month,outgoing_sms_per_month,monthly_mb,use_id
0,21.97,4.82,1557.33,22787
1,1710.08,136.88,7267.55,22788
2,1710.08,136.88,7267.55,22789
3,94.46,35.17,519.12,22790
4,71.59,79.26,1557.33,22792
...,...,...,...,...
235,260.66,68.44,896.96,25008
236,97.12,36.50,2815.00,25040
237,355.93,12.37,6828.09,25046
238,632.06,120.46,1453.16,25058


In [10]:
# Display the table that maps each use_id to a user, platform and device code.
user_device

Unnamed: 0,use_id,user_id,platform,platform_version,device,use_type_id
0,22782,26980,ios,10.2,"iPhone7,2",2
1,22783,29628,android,6.0,Nexus 5,3
2,22784,28473,android,5.1,SM-G903F,1
3,22785,15200,ios,10.2,"iPhone7,2",3
4,22786,28239,android,6.0,ONE E1003,1
...,...,...,...,...,...,...
267,23049,29725,android,6.0,SM-G900F,1
268,23050,29726,ios,10.2,"iPhone7,2",3
269,23051,29726,ios,10.2,"iPhone7,2",3
270,23052,29727,ios,10.1,"iPhone8,4",3


In [11]:
# Display the device catalogue (manufacturer, marketing name, device, model).
devices

Unnamed: 0,manufacturer,Marketing Name,Device,Model
0,,,AD681H,Smartfren Andromax AD681H
1,,,FJL21,FJL21
2,,,T31,Panasonic T31
3,,,hws7721g,MediaPad 7 Youth 2
4,3Q,OC1020A,OC1020A,OC1020A
...,...,...,...,...
14541,pendo,PNDPP44QC10,PNDPP44QC10,PNDPP44QC10
14542,pendo,PNDPP44QC7,PNDPP44QC7,PNDPP44QC7
14543,sugar_aums,QPOINT,QPI-1,QPI-1
14544,tecmobile,OmnisOne,OmnisOne,Omnis One


# First merge
We're trying to get the average usage figures for different types of devices. So we need to get the user's device code from user_device as a column on user_usage, and then get the device's manufacturer from devices as a column on the result.

First, we merge user_usage with user_device with "use_id" as our common column

In [12]:
# Inner join (merge's default): only use_id values present in both tables
# are kept in the result.
result = user_usage.merge(
    user_device[['use_id', 'platform', 'device']],
    how='inner',
    on='use_id',
)
result.head()

Unnamed: 0,outgoing_mins_per_month,outgoing_sms_per_month,monthly_mb,use_id,platform,device
0,21.97,4.82,1557.33,22787,android,GT-I9505
1,1710.08,136.88,7267.55,22788,android,SM-G930F
2,1710.08,136.88,7267.55,22789,android,SM-G930F
3,94.46,35.17,519.12,22790,android,D2303
4,71.59,79.26,1557.33,22792,android,SM-G361F


In [15]:
# Report the (rows, columns) shape of both inputs and of the merged result,
# one message per line.
print(
    "user_usage dimensions: {}".format(user_usage.shape),
    "user_device dimensions: {}".format(user_device.shape),
    "result dimensions: {}".format(result.shape),
    sep="\n",
)

user_usage dimensions: (240, 4)
user_device dimensions: (272, 6)
result dimensions: (159, 6)


**Is the above result correct?**

In [20]:
# How many times does each use_id occur in the usage table?
user_usage['use_id'].value_counts()

23039    1
23548    1
22816    1
22815    1
23838    1
        ..
22922    1
22921    1
22920    1
24455    1
23040    1
Name: use_id, Length: 240, dtype: int64

In [21]:
# How many times does each use_id occur in the merged result?
result['use_id'].value_counts()

23039    1
22864    1
22876    1
22875    1
22874    1
        ..
22955    1
22951    1
22950    1
22949    1
23040    1
Name: use_id, Length: 159, dtype: int64

# Left merge example
A left merge, or left join, between two dataframes keeps all of the rows and values from the left dataframe, in this case "user_usage". Rows from the right dataframe are kept in the result only where they match a value of the merge variable in the left dataframe; where there is no match, the right-originating columns are filled with NaN.

In [23]:
# Left join: every row of user_usage is kept; rows without a matching use_id
# in user_device get NaN in the platform/device columns.
result = user_usage.merge(
    user_device[['use_id', 'platform', 'device']],
    how='left',
    on='use_id',
)
print(
    "user_usage dimensions: {}".format(user_usage.shape),
    "result dimensions: {}".format(result.shape),
    sep="\n",
)


user_usage dimensions: (240, 4)
result dimensions: (240, 6)


In [24]:
# Count the rows of result whose 'device' value is missing.
missing_device_count = result['device'].isna().sum()
print("There are {} missing values in the result.".format(missing_device_count))

There are 81 missing values in the result.


In [26]:
# Preview the first five rows of result.
result.head()

Unnamed: 0,outgoing_mins_per_month,outgoing_sms_per_month,monthly_mb,use_id,platform,device
0,21.97,4.82,1557.33,22787,android,GT-I9505
1,1710.08,136.88,7267.55,22788,android,SM-G930F
2,1710.08,136.88,7267.55,22789,android,SM-G930F
3,94.46,35.17,519.12,22790,android,D2303
4,71.59,79.26,1557.33,22792,android,SM-G361F


In [25]:
# Preview the last five rows of result.
result.tail()

Unnamed: 0,outgoing_mins_per_month,outgoing_sms_per_month,monthly_mb,use_id,platform,device
235,260.66,68.44,896.96,25008,,
236,97.12,36.5,2815.0,25040,,
237,355.93,12.37,6828.09,25046,,
238,632.06,120.46,1453.16,25058,,
239,488.7,906.92,3089.85,25220,,


# Right merge example
A right merge, or right join, between two dataframes keeps all of the rows and values from the right dataframe, in this case "user_device". Rows from the left dataframe are kept only where there is a match in the merge variable; where there is no match, the left-originating columns are filled with NaN.

In [29]:
# Right join: every row of the user_device subset is kept; usage columns are
# NaN where user_usage has no matching use_id.
result = user_usage.merge(
    user_device[['use_id', 'platform', 'device']],
    how='right',
    on='use_id',
)
print(
    "user_device dimensions: {}".format(user_device.shape),
    "result dimensions: {}".format(result.shape),
    sep="\n",
)
result

user_device dimensions: (272, 6)
result dimensions: (272, 6)


Unnamed: 0,outgoing_mins_per_month,outgoing_sms_per_month,monthly_mb,use_id,platform,device
0,,,,22782,ios,"iPhone7,2"
1,,,,22783,android,Nexus 5
2,,,,22784,android,SM-G903F
3,,,,22785,ios,"iPhone7,2"
4,,,,22786,android,ONE E1003
...,...,...,...,...,...,...
267,344.53,20.53,519.12,23049,android,SM-G900F
268,,,,23050,ios,"iPhone7,2"
269,,,,23051,ios,"iPhone7,2"
270,,,,23052,ios,"iPhone8,4"


In [30]:
# Missing-value counts for one left-originating column ('monthly_mb') and one
# right-originating column ('platform') of the merged result.
monthly_mb_missing = result['monthly_mb'].isna().sum()
platform_missing = result['platform'].isna().sum()
print("There are {} missing values in the 'monthly_mb' column in the result.".format(monthly_mb_missing))
print("There are {} missing values in the 'platform' column in the result.".format(platform_missing))

There are 113 missing values in the 'monthly_mb' column in the result.
There are 0 missing values in the 'platform' column in the result.


# Outer merge example
A full outer join, or outer merge, keeps all rows from the left and right dataframes in the result. Rows are aligned where the left and right share a join value; where there is no shared join value, the row is still kept, with NaN in either the left-originating or the right-originating columns.

In the final result, a subset of rows should have no missing values. These are the rows where there was a match between the merge column in the left and right dataframes, and they contain the same values as those found by our earlier inner merge.

In [80]:
# Number of distinct use_id values across both tables, for comparison with
# the row count of the outer merge below.
print("There are {} unique values of use_id in our dataframes.".format(
        pd.concat([user_usage['use_id'], user_device['use_id']]).unique().shape[0]))

# Outer join: keep every use_id from either side. indicator=True adds a
# `_merge` column recording whether a row came from the left table only, the
# right table only, or both.
result = pd.merge(user_usage,
                 user_device[['use_id', 'platform', 'device']],
                 on='use_id', how='outer', indicator=True)

# result.shape is a (rows, columns) tuple; the message talks about rows, so
# report only the first element instead of the whole tuple.
print("Outer merge result has {} rows.".format(result.shape[0]))



There are 353 unique values of use_id in our dataframes.
Outer merge result has (353, 7) rows.


In [77]:
# Cross-check the number of distinct use_id values across both tables with a
# plain set union. This avoids depending on numpy, which is not imported in
# the visible cells (the original `np.` call fails on a fresh kernel).
len(set(user_usage['use_id']).union(user_device['use_id']))

353

In [78]:
# A row is complete when every one of its columns is non-null.
# notna().all(axis=1) is the vectorised equivalent of counting nulls per row
# with apply() and comparing the count to zero.
print("There are {} rows with no missing values.".format(
    result.notna().all(axis=1).sum()))

There are 159 rows with no missing values.


In [32]:
# Pick six rows by position: two from the start, two from the middle and two
# near the end of result.
result.iloc[[0, 1, 200, 201, 350, 351]]

Unnamed: 0,outgoing_mins_per_month,outgoing_sms_per_month,monthly_mb,use_id,platform,device,_merge
0,21.97,4.82,1557.33,22787,android,GT-I9505,both
1,1710.08,136.88,7267.55,22788,android,SM-G930F,both
200,28.79,29.42,3114.67,23988,,,left_only
201,616.56,99.85,5414.14,24006,,,left_only
350,,,,23050,ios,"iPhone7,2",right_only
351,,,,23051,ios,"iPhone7,2",right_only


# Final merge - adding device manufacturer

In [81]:
# First, add the platform and device to the user usage.
result = user_usage.merge(
    user_device[['use_id', 'platform', 'device']],
    on='use_id',
    how='left',
)

# Make sure the branding column is called "manufacturer". rename() ignores
# labels that are not present, so this is a no-op when the column was already
# renamed; rebinding the result avoids mutating `devices` in place.
devices = devices.rename(columns={"Retail Branding": "manufacturer"})

# Now, based on the "device" column in result, match the "Model" column in devices.
# Because left_on and right_on name different columns, both "device" and
# "Model" are kept in the merged output.
result = result.merge(
    devices[['manufacturer', 'Model']],
    left_on='device',
    right_on='Model',
    how='left',
)

result.head()
              

Unnamed: 0,outgoing_mins_per_month,outgoing_sms_per_month,monthly_mb,use_id,platform,device,manufacturer,Model
0,21.97,4.82,1557.33,22787,android,GT-I9505,Samsung,GT-I9505
1,1710.08,136.88,7267.55,22788,android,SM-G930F,Samsung,SM-G930F
2,1710.08,136.88,7267.55,22789,android,SM-G930F,Samsung,SM-G930F
3,94.46,35.17,519.12,22790,android,D2303,Sony,D2303
4,71.59,79.26,1557.33,22792,android,SM-G361F,Samsung,SM-G361F


In [82]:
# Look up the catalogue entry whose Model code is exactly 'SM-G930F'.
devices.loc[devices['Model'] == 'SM-G930F']

Unnamed: 0,manufacturer,Marketing Name,Device,Model
10381,Samsung,Galaxy S7,herolte,SM-G930F


In [83]:
# Catalogue entries whose Device code starts with 'GT'. na=False makes a
# missing Device value count as "no match", so the mask is always boolean and
# can safely be used for row selection.
devices[devices['Device'].str.startswith('GT', na=False)]

Unnamed: 0,manufacturer,Marketing Name,Device,Model
1095,Bitmore,GTAB700,GTAB700,NID_7010
1096,Bitmore,GTAB900,GTAB900,S952
2402,Grundig,GTB1050,GTB1050,GTB 1050
2403,Grundig,GTB850,GTB850,GTB 850
2404,Grundig,TC69CA2,GTB801,GTB 801
...,...,...,...,...
10821,Samsung,Galaxy Y Pro,GT-B5510L,GT-B5510L
10822,Samsung,Galaxy Y Pro Duos,GT-B5512,GT-B5512
10823,Samsung,Galaxy Y Pro Duos,GT-B5512B,GT-B5512B
10824,Samsung,Galaxy Y TV,GT-S5367,GT-S5367


# Calculating statistics on final result
With merges complete, we can simply calculate statistics for users grouped by the manufacturer of their device.

In [84]:
# Preview the first five rows of the merged result before grouping.
result.head()

Unnamed: 0,outgoing_mins_per_month,outgoing_sms_per_month,monthly_mb,use_id,platform,device,manufacturer,Model
0,21.97,4.82,1557.33,22787,android,GT-I9505,Samsung,GT-I9505
1,1710.08,136.88,7267.55,22788,android,SM-G930F,Samsung,SM-G930F
2,1710.08,136.88,7267.55,22789,android,SM-G930F,Samsung,SM-G930F
3,94.46,35.17,519.12,22790,android,D2303,Sony,D2303
4,71.59,79.26,1557.33,22792,android,SM-G361F,Samsung,SM-G361F


**Exercise using groupby**
1. For every manufacturer, find the average outgoing SMS and minutes per month, the total count of `use_id`, and the median `monthly_mb` usage.