In [1]:
import os
import glob
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.cluster import DBSCAN
import datetime

## Load Data
Define a function that imports all the data from the provided csv files

In [2]:
csv_paths = glob.glob("data/*.csv")
csv_filenames = [os.path.splitext(os.path.basename(x))[0] for x in csv_paths]

def load_data_from_csv():
    """Loads the csv files to pandas DataFrames and saves
    them in a dictionary with their filenames as keys"""
    data = dict()
    for csv_filename, csv_path in zip(csv_filenames, csv_paths):
        data[csv_filename] = pd.read_csv(csv_path)
    return data

# Load the data to a Dictionary
data = load_data_from_csv()

Inspect the data, the schema is shown below:

<img src="docs/schema.png">

### Inspect data

In [3]:
for key, values in data.items():
    print(key, "\n")
    print(values.head(), "\n")
    print(values.describe(), "\n")

app_categories 

  grouped_categories  Unnamed: 1
0              Games           1
1           Business           2
2          Education           3
3          Lifestyle           4
4          Utilities           5 

       Unnamed: 1
count   19.000000
mean    10.000000
std      5.627314
min      1.000000
25%      5.500000
50%     10.000000
75%     14.500000
max     19.000000 

app_events 

   event_id               app_id  is_installed  is_active
0         2  5927333115845830913             1          1
1         2 -5720078949152207372             1          0
2         2 -1633887856876571208             1          0
3         2  -653184325010919369             1          1
4         2  8693964245073640147             1          1 

           event_id        app_id  is_installed     is_active
count  3.247307e+07  3.247307e+07    32473067.0  3.247307e+07
mean   1.625564e+06  1.182779e+18           1.0  3.921094e-01
std    9.384682e+05  5.360173e+18           0.0  4.882209e-01
min    2

In [4]:
print(f'There are {len(data["events"]["device_id"].unique())} unique devices in the event data')
print(f'There are {len(data["events"])} unique events')
print(f'There are {len(data["gender_age_train"]["device_id"].unique())} unique devices in the training data')

There are 60865 unique devices in the event data
There are 3252950 unique events
There are 74645 unique devices in the training data


First replace the chinese brands with their english names

In [5]:
# Join chinese brand names to their english equivalents
phone_brand_device_model = pd.merge(data["phone_brand_device_model"],
                                    data["chinese_to_english_brand"],
                                    left_on="phone_brand", right_on="chinese_brand", how="left")

# Fill nans for brands already in english
phone_brand_device_model["brand"] =\
    phone_brand_device_model["brand"].where(phone_brand_device_model["brand"].str.isalpha(),
                                                    phone_brand_device_model["phone_brand"].str.lower())

# We will just work with device brand for the time being
device_data = phone_brand_device_model[["device_id", "brand", "device_model"]]
device_data

Unnamed: 0,device_id,brand,device_model
0,-8890648629457979026,xiaomi,红米
1,1277779817574759137,xiaomi,MI 2
2,5137427614288105724,samsung,Galaxy S4
3,3669464369358936369,sugar,时尚手机
4,-5019277647504317457,samsung,Galaxy Note 2
...,...,...,...
187240,7979541072208733273,xiaomi,MI 4
187241,-187404680852357705,xiaomi,红米2
187242,-2718274279595622821,xiaomi,MI 3
187243,3098391762071677791,vivo,X1


In [6]:
print(f'There are {len(device_data["device_id"].unique())} unique devices in the device data')
# Get rid of duplicates in the device data
device_data_cln = device_data.drop_duplicates("device_id")
print(len(device_data_cln))

There are 186716 unique devices in the device data
186716


There are 6 age group categories for each gender

In [7]:
data["gender_age_train"]["group"].sort_values().unique()

array(['F23-', 'F24-26', 'F27-28', 'F29-32', 'F33-42', 'F43+', 'M22-',
       'M23-26', 'M27-28', 'M29-31', 'M32-38', 'M39+'], dtype=object)

These categories already include the gender but for the purpose of this analysis we will create <br>
our own buckets use the actual age of the users to reduce the number of predicted classes

In [8]:
data["gender_age_train"].rename(columns={"group": "pre_grouped"}, inplace=True)
bins = [0, 22, 26, 28, 31, 38, 100]
labels = ["22-", "23-26", "27-28", "29-31", "32-38", "39+"]
data["gender_age_train"]["group"] = pd.cut(data["gender_age_train"]["age"], bins=bins, labels=labels)
data["gender_age_train"]

Unnamed: 0,device_id,gender,age,pre_grouped,group
0,-8076087639492063270,M,35,M32-38,32-38
1,-2897161552818060146,M,35,M32-38,32-38
2,-8260683887967679142,M,35,M32-38,32-38
3,-4938849341048082022,M,30,M29-31,29-31
4,245133531816851882,M,30,M29-31,29-31
...,...,...,...,...,...
74640,4682031842235089751,M,30,M29-31,29-31
74641,-9178703742877135986,M,30,M29-31,29-31
74642,180946546684162312,M,20,M22-,22-
74643,1390702386071991851,M,37,M32-38,32-38


We'll proceed to join the device brand data to the train data

In [9]:
combined_data = data["gender_age_train"]
print(len(combined_data))
combined_data = pd.merge(combined_data, device_data_cln, on="device_id", how="left")
combined_data.drop(columns=["pre_grouped"], inplace=True)

74645


Put together the event data before merging all.
Start with app data:

In [10]:
# Join app labels to their label categories
app_labels = data["app_labels"].drop_duplicates()
app_labels = app_labels.merge(data["label_categories"], on="label_id")
app_labels

Unnamed: 0,app_id,label_id,category
0,7324884708820027918,251,Finance
1,-4494216993218550286,251,Finance
2,8756705988821000489,251,Finance
3,1061207043315821111,251,Finance
4,-1491198667294647703,251,Finance
...,...,...,...
459447,-5364349117796323466,985,Word games
459448,6320161160475087425,985,Word games
459449,7641656572760613661,985,Word games
459450,-2037260970074339790,985,Word games


For each entry in the events data there is a number of apps in the app events table that the user used

In [11]:
# Join events to their app events
events = data["events"]
events = events.merge(data["app_events"], on="event_id", how="left")

# Use app events to join events to the app labels
events = events.merge(app_labels, on="app_id", how="left")
events.sort_values(by="category")

Unnamed: 0,event_id,device_id,timestamp,longitude,latitude,app_id,is_installed,is_active,label_id,category
27741865,2545490,2241767092438554986,2016-05-01 06:48:43,119.94,37.17,5.322487e+16,1.0,0.0,714.0,1 free
25742018,2360757,-8892004110917383868,2016-05-06 10:09:48,0.00,0.00,-4.986197e+15,1.0,1.0,714.0,1 free
25741714,2360713,-8642981685332370359,2016-05-06 10:01:40,0.00,0.00,-4.986197e+15,1.0,0.0,714.0,1 free
11596043,1064261,-2613769140653670990,2016-05-05 12:24:46,0.00,0.00,-4.986197e+15,1.0,0.0,714.0,1 free
25739672,2360561,7683263732254566184,2016-05-02 22:54:45,0.00,0.00,5.322487e+16,1.0,0.0,714.0,1 free
...,...,...,...,...,...,...,...,...,...,...
35452563,3252948,7111660304904287709,2016-05-07 23:35:33,121.40,31.18,4.348660e+18,1.0,1.0,,
35452564,3252948,7111660304904287709,2016-05-07 23:35:33,121.40,31.18,-9.957269e+17,1.0,1.0,,
35452565,3252948,7111660304904287709,2016-05-07 23:35:33,121.40,31.18,2.306067e+18,1.0,1.0,,
35452566,3252949,2786044170600788970,2016-05-07 23:36:03,111.19,31.29,,,,,


In [12]:
combined_data = pd.merge(combined_data, events, on="device_id", how="left")
combined_data

Unnamed: 0,device_id,gender,age,group,brand,device_model,event_id,timestamp,longitude,latitude,app_id,is_installed,is_active,label_id,category
0,-8076087639492063270,M,35,32-38,xiaomi,MI 2,,,,,,,,,
1,-2897161552818060146,M,35,32-38,xiaomi,MI 2,,,,,,,,,
2,-8260683887967679142,M,35,32-38,xiaomi,MI 2,2479656.0,2016-05-01 14:23:37,0.00,0.00,4.287147e+18,1.0,0.0,,
3,-8260683887967679142,M,35,32-38,xiaomi,MI 2,2479656.0,2016-05-01 14:23:37,0.00,0.00,-4.085687e+18,1.0,0.0,,
4,-8260683887967679142,M,35,32-38,xiaomi,MI 2,2479656.0,2016-05-01 14:23:37,0.00,0.00,6.324195e+18,1.0,0.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13405307,89181010588227347,M,25,23-26,xiaomi,MI 2S,2218742.0,2016-05-05 19:28:40,0.00,0.00,1.193348e+18,1.0,0.0,,
13405308,89181010588227347,M,25,23-26,xiaomi,MI 2S,2243803.0,2016-05-06 17:12:46,0.00,0.00,5.516228e+18,1.0,1.0,,
13405309,89181010588227347,M,25,23-26,xiaomi,MI 2S,2336186.0,2016-05-04 14:08:05,119.26,26.05,,,,,
13405310,89181010588227347,M,25,23-26,xiaomi,MI 2S,3139128.0,2016-05-04 14:08:35,119.26,26.05,,,,,


In [None]:
# Drop id columns
combined_data.drop(columns=["event_id", "app_id", "label_id","age"], inplace=True)

In [None]:
combined_data_grouped = combined_data.groupby(["device_id", "gender", "group", "brand", "device_model",
                                               "longitude", "latitude"])[["is_installed", "is_active", "category"]].agg(lambda x: list(x)).reset_index()

#### Filling nan values
Drop event_id, app_id and label_id as they are not required for predictions

In [13]:

# Fill nan category values with "unknown"
combined_data["category"].fillna("unknown", inplace=True)
# Fill is_installed and is_active category values with 3 for "unknown"
combined_data[["is_installed", "is_active"]] = combined_data[["is_installed", "is_active"]].apply(lambda x: x.fillna(3))
# Fill in missing timestamps with 00:00:00 time
combined_data["timestamp"].fillna("2016-05-01 00:00:00", inplace=True)
# Get time from timestamp object
combined_data['timestamp'] = pd.to_datetime(combined_data['timestamp'])
combined_data["time"] = combined_data['timestamp'].dt.time

Group data by id and get a list of all active and installed apps and category

In [14]:
combined_data

Unnamed: 0,device_id,gender,group,brand,device_model,timestamp,longitude,latitude,is_installed,is_active,category,time
0,-8076087639492063270,M,32-38,xiaomi,MI 2,2016-05-01 00:00:00,,,3.0,3.0,unknown,00:00:00
1,-2897161552818060146,M,32-38,xiaomi,MI 2,2016-05-01 00:00:00,,,3.0,3.0,unknown,00:00:00
2,-8260683887967679142,M,32-38,xiaomi,MI 2,2016-05-01 14:23:37,0.00,0.00,1.0,0.0,unknown,14:23:37
3,-8260683887967679142,M,32-38,xiaomi,MI 2,2016-05-01 14:23:37,0.00,0.00,1.0,0.0,unknown,14:23:37
4,-8260683887967679142,M,32-38,xiaomi,MI 2,2016-05-01 14:23:37,0.00,0.00,1.0,0.0,unknown,14:23:37
...,...,...,...,...,...,...,...,...,...,...,...,...
13405307,89181010588227347,M,23-26,xiaomi,MI 2S,2016-05-05 19:28:40,0.00,0.00,1.0,0.0,unknown,19:28:40
13405308,89181010588227347,M,23-26,xiaomi,MI 2S,2016-05-06 17:12:46,0.00,0.00,1.0,1.0,unknown,17:12:46
13405309,89181010588227347,M,23-26,xiaomi,MI 2S,2016-05-04 14:08:05,119.26,26.05,3.0,3.0,unknown,14:08:05
13405310,89181010588227347,M,23-26,xiaomi,MI 2S,2016-05-04 14:08:35,119.26,26.05,3.0,3.0,unknown,14:08:35


In [19]:
combined_data_grouped

Unnamed: 0,device_id,gender,is_installed,is_active,category
0,-9223067244542181226,M,[3.0],[3.0],[unknown]
1,-9222956879900151005,M,"[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ...","[unknown, unknown, unknown, unknown, unknown, ..."
2,-9222754701995937853,M,[3.0],[3.0],[unknown]
3,-9222352239947207574,M,[3.0],[3.0],[unknown]
4,-9222173362545970626,F,[3.0],[3.0],[unknown]
...,...,...,...,...,...
74640,9220914901466458680,M,"[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, ...","[Property Industry 1.0, Industry tag, Personal..."
74641,9221152396628736959,M,[3.0],[3.0],[unknown]
74642,9221608286127666096,F,[3.0],[3.0],[unknown]
74643,9221843411551060582,M,[3.0],[3.0],[unknown]


In [None]:
combined_data_grouped.isna().sum()

In order to use the coordinates we will group them into clusters
This [tutorial](https://levelup.gitconnected.com/clustering-gps-co-ordinates-forming-regions-4f50caa7e4a1) demonstrates clustering latitude-longitude spatial data with DBSCAN/haversine and avoids Euclidean-distance problems.

In [None]:
# Clean coordinates from nan values
# coords = combined_data[['longitude', 'latitude']].dropna()
# coord_values = coords.values

In [None]:
# db = DBSCAN(eps=0.5, min_samples=10000, algorithm='ball_tree', metric='haversine').fit(np.radians(coord_values))

#### Split dataset into train, validation and test sets.

In [None]:
FEATURE_COLUMNS = ["brand", "device_model", "timestamp", "is_installed", "is_active", "category"]
LABELS = ["gender", "group"]

# Prepare the labels
y = combined_data[LABELS]
y = pd.get_dummies(y, prefix='', prefix_sep='')
y.sort_values(by="F")

Use number of installed apps?????????

In [None]:
# Prepare the features
X = combined_data[FEATURE_COLUMNS]
X

In [None]:
# Split into train and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
# Split into train set further into train and validation set
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=1)

#### Predictions

We will first try to predict the gender and then use that to predict the age group since the age group bucket will depend on the gender