In [2]:
import os
import glob
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

Define a function that imports all the data from the provided csv files

In [3]:
csv_paths = glob.glob("data/*.csv")
csv_filenames = [os.path.splitext(os.path.basename(x))[0] for x in csv_paths]

def load_data_from_csv():
    """Loads the csv files to pandas DataFrames and saves
    them in a dictionary with their filenames as keys"""
    data = dict()
    for csv_filename, csv_path in zip(csv_filenames, csv_paths):
        data[csv_filename] = pd.read_csv(csv_path)
    return data

# Load the data to a Dictionary
data = load_data_from_csv()

Inspect the data, the schema is shown below:

<img src="docs/schema.png">

In [4]:
for key, values in data.items():
    print(key, "\n")
    print(values.head(), "\n")
    print(values.describe(), "\n")

app_categories 

  grouped_categories  Unnamed: 1
0              Games           1
1           Business           2
2          Education           3
3          Lifestyle           4
4          Utilities           5 

       Unnamed: 1
count   19.000000
mean    10.000000
std      5.627314
min      1.000000
25%      5.500000
50%     10.000000
75%     14.500000
max     19.000000 

app_events 

   event_id               app_id  is_installed  is_active
0         2  5927333115845830913             1          1
1         2 -5720078949152207372             1          0
2         2 -1633887856876571208             1          0
3         2  -653184325010919369             1          1
4         2  8693964245073640147             1          1 

           event_id        app_id  is_installed     is_active
count  3.247307e+07  3.247307e+07    32473067.0  3.247307e+07
mean   1.625564e+06  1.182779e+18           1.0  3.921094e-01
std    9.384682e+05  5.360173e+18           0.0  4.882209e-01
min    2

In [15]:
print(f'There are {len(data["events"]["device_id"].unique())} unique devices')

There are 60865 unique devices


In [17]:
print(f'There are {len(data["events"])} unique events')

There are 3252950 unique events


In [19]:
print(f'There are {len(data["gender_age_train"]["device_id"].unique())} unique devices in the training data')

There are 74645 unique devices in the training data


First replace the chinese brands with their english names

In [21]:
# Join chinese brand names to their english equivalents
phone_brand_device_model = pd.merge(data["phone_brand_device_model"],
                                    data["chinese_to_english_brand"],
                                    left_on="phone_brand", right_on="chinese_brand", how="left")

# Fill nans for brands already in english
phone_brand_device_model["brand"] =\
    phone_brand_device_model["brand"].where(phone_brand_device_model["brand"].str.isalpha(),
                                                    phone_brand_device_model["phone_brand"].str.lower())

# We will just work with device brand for the time being
device_data = phone_brand_device_model[["device_id", "brand", "device_model"]]
device_data

Unnamed: 0,device_id,brand,device_model
0,-8890648629457979026,xiaomi,红米
1,1277779817574759137,xiaomi,MI 2
2,5137427614288105724,samsung,Galaxy S4
3,3669464369358936369,sugar,时尚手机
4,-5019277647504317457,samsung,Galaxy Note 2
...,...,...,...
187240,7979541072208733273,xiaomi,MI 4
187241,-187404680852357705,xiaomi,红米2
187242,-2718274279595622821,xiaomi,MI 3
187243,3098391762071677791,vivo,X1


In [32]:
print(f'There are {len(device_data["device_id"].unique())} unique devices in the device data')

There are 186716 unique devices in the device data


In [34]:
# Get rid of duplicates in the device data
device_data_cln = device_data.drop_duplicates()

We'll proceed to join the device brand data to the train data

In [23]:
data["gender_age_train"]["group"].sort_values().unique()

array(['F23-', 'F24-26', 'F27-28', 'F29-32', 'F33-42', 'F43+', 'M22-',
       'M23-26', 'M27-28', 'M29-31', 'M32-38', 'M39+'], dtype=object)

In [35]:
combined_data = data["gender_age_train"]
print(len(combined_data))
combined_data = pd.merge(combined_data, device_data_cln, on="device_id", how="left")
combined_data

74645


Unnamed: 0,device_id,gender,age,group,brand,device_model
0,-8076087639492063270,M,35,M32-38,xiaomi,MI 2
1,-2897161552818060146,M,35,M32-38,xiaomi,MI 2
2,-8260683887967679142,M,35,M32-38,xiaomi,MI 2
3,-4938849341048082022,M,30,M29-31,xiaomi,红米note
4,245133531816851882,M,30,M29-31,xiaomi,MI 3
...,...,...,...,...,...,...
74641,4682031842235089751,M,30,M29-31,xiaomi,MI 3
74642,-9178703742877135986,M,30,M29-31,xiaomi,MI 3
74643,180946546684162312,M,20,M22-,xiaomi,红米note
74644,1390702386071991851,M,37,M32-38,huawei,Ascend P8


Put together the event data before merging all

In [147]:
# Join app labels to their label categories
app_labels = data["app_labels"]
app_labels = app_labels.merge(data["label_categories"], on="label_id")
app_labels

Unnamed: 0,app_id,label_id,category
0,7324884708820027918,251,Finance
1,-4494216993218550286,251,Finance
2,8756705988821000489,251,Finance
3,1061207043315821111,251,Finance
4,-1491198667294647703,251,Finance
...,...,...,...
459938,-5364349117796323466,985,Word games
459939,6320161160475087425,985,Word games
459940,7641656572760613661,985,Word games
459941,-2037260970074339790,985,Word games


In [156]:
app_labels["category"].unique()

array(['Finance', 'unknown', 'DS_P2P net loan', 'Securities',
       'Lottery ticket', 'IMF', 'Precious Metals', 'pursue',
       'Science and Technology', 'Customization', 'service',
       'Smart Shopping', 'Personal Effectiveness', 'free', 'vitality',
       'Trendy / cool', 'Relatives', 'comfortable', 'Simple',
       'Debit and credit', 'Crowdfunding financing', 'Custom label',
       'Ping', 'safety Insurance', 'Peace - Search', 'mobile bank',
       'Direct Bank', 'futures', 'Property Industry 1.0', 'Industry tag',
       'Pay', 'Wealth Management', 'Customized 1', '1 free', 'Cozy 1',
       'Services 1', 'Pursuit 1', 'Noble 1', 'Trendy / cool 1',
       'Passion 1', 'Personal Effectiveness 1', 'Total Cost 1',
       'Smart Shopping 1', 'Science and Technology 1', 'Relatives 1',
       '1 vitality', 'Irritation / Fun 1', 'Quality 1', '1 reputation',
       'Nature 1', 'Simple 1', 'Classical 1', 'Property Industry 2.0',
       'Occupational identity', 'College Students',
       '

In [157]:
# Join events to their app events
events = data["events"]
events = events.merge(data["app_events"], on="event_id", how="left")
events

Unnamed: 0,event_id,device_id,timestamp,longitude,latitude,app_id,is_installed,is_active
0,1,29182687948017175,2016-05-01 00:55:25,121.38,31.24,,,
1,2,-6401643145415154744,2016-05-01 00:54:12,103.65,30.97,5.927333e+18,1.0,1.0
2,2,-6401643145415154744,2016-05-01 00:54:12,103.65,30.97,-5.720079e+18,1.0,0.0
3,2,-6401643145415154744,2016-05-01 00:54:12,103.65,30.97,-1.633888e+18,1.0,0.0
4,2,-6401643145415154744,2016-05-01 00:54:12,103.65,30.97,-6.531843e+17,1.0,1.0
...,...,...,...,...,...,...,...,...
34237916,3252948,7111660304904287709,2016-05-07 23:35:33,121.40,31.18,4.348660e+18,1.0,1.0
34237917,3252948,7111660304904287709,2016-05-07 23:35:33,121.40,31.18,-9.957269e+17,1.0,1.0
34237918,3252948,7111660304904287709,2016-05-07 23:35:33,121.40,31.18,2.306067e+18,1.0,1.0
34237919,3252949,2786044170600788970,2016-05-07 23:36:03,111.19,31.29,,,


In [158]:
# Use app events to join events to the app labels
events = events.merge(app_labels, on="app_id", how="left")
events.sort_values(by="category")

Unnamed: 0,event_id,device_id,timestamp,longitude,latitude,app_id,is_installed,is_active,label_id,category
20895255,1916674,755772167887724190,2016-05-02 22:27:56,0.00,0.00,-4.986197e+15,1.0,0.0,714.0,1 free
5683103,519585,-423363963231382188,2016-05-06 15:26:29,123.47,41.80,-4.986197e+15,1.0,0.0,714.0,1 free
1444833,129409,6139419753828940858,2016-05-02 11:21:14,0.00,0.00,5.322487e+16,1.0,0.0,714.0,1 free
5682615,519541,-3459418399482569870,2016-05-05 15:35:17,0.00,0.00,5.322487e+16,1.0,1.0,714.0,1 free
20445321,1873865,3517190067519579513,2016-05-04 00:40:30,117.02,32.64,-8.447991e+18,1.0,0.0,714.0,1 free
...,...,...,...,...,...,...,...,...,...,...
35455129,3252948,7111660304904287709,2016-05-07 23:35:33,121.40,31.18,4.348660e+18,1.0,1.0,,
35455130,3252948,7111660304904287709,2016-05-07 23:35:33,121.40,31.18,-9.957269e+17,1.0,1.0,,
35455131,3252948,7111660304904287709,2016-05-07 23:35:33,121.40,31.18,2.306067e+18,1.0,1.0,,
35455132,3252949,2786044170600788970,2016-05-07 23:36:03,111.19,31.29,,,,,
