In [1]:
import pandas_profiling as pp
import pandas as pd
import numpy as np
from tqdm import tqdm

import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline

# Use seaborn's white grid theme for every plot drawn in this notebook.
sns.set_style('whitegrid')
pd.set_option('display.max_columns', None) # display all columns

import warnings
# NOTE(review): this silences every warning category, including pandas
# deprecation warnings -- consider narrowing the filter.
warnings.filterwarnings('ignore')
import glob
import re

In [2]:
# Load every CSV export found in the data folder and stack them into one frame.
path = r'../data/dataset/A_DB_NRJ' # use your path
# glob returns matches in arbitrary (filesystem-dependent) order; sorting keeps
# the concatenated row order -- and therefore the rebuilt integer index --
# identical between runs and machines.
all_files = sorted(glob.glob(path + "/*.csv"))
if not all_files:
    # Fail early and name the folder, instead of letting pd.concat raise its
    # generic "No objects to concatenate" error further down.
    raise FileNotFoundError("No CSV files found in {!r}".format(path))
li = []

for filename in all_files:
    # Append the per-file frame directly so the name `df` is only ever bound
    # to the final, concatenated result.
    li.append(pd.read_csv(filename,
                          delimiter=',',
                          encoding='utf-8'))

# ignore_index=True discards the per-file indexes and numbers rows 0..n-1.
df = pd.concat(li, axis=0, ignore_index=True)

In [57]:
# Print the frame's index range, column dtypes and memory footprint.
# NOTE(review): the execution count (In [57]) shows this cell was last run after
# the cleaning cells further down, so the saved output lists the cleaned
# columns; on a fresh top-to-bottom run it will describe the raw load instead.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 19333056 entries, 0 to 20704509
Data columns (total 11 columns):
googleId        object
hotjarId        object
dateHour        datetime64[ns]
UserAgent       object
Latitude        float64
Longitude       float64
moved_mobile    bool
rotationX       float64
rotationY       float64
rotationZ       float64
Key_IP          int64
dtypes: bool(1), datetime64[ns](1), float64(5), int64(1), object(3)
memory usage: 1.6+ GB


In [42]:
# Show the dtype of every column. DataFrame.ftypes was deprecated in pandas 0.25
# and removed in pandas 1.0, so it breaks this cell on a current kernel;
# .dtypes works on every version and still reveals sparse storage, because
# sparse columns carry a Sparse[...] dtype.
df.dtypes

googleId                object:dense
hotjarId                object:dense
dateHour        datetime64[ns]:dense
UserAgent               object:dense
Latitude               float64:dense
Longitude              float64:dense
moved_mobile              bool:dense
rotationX              float64:dense
rotationY              float64:dense
rotationZ              float64:dense
Key_IP                 float64:dense
dtype: object

In [53]:
# Peek at four random rows. The fixed random_state makes the same rows come
# back on every run, so the saved output stays reproducible.
df.sample(4, random_state=0)

Unnamed: 0,googleId,hotjarId,dateHour,UserAgent,Latitude,Longitude,moved_mobile,rotationX,rotationY,rotationZ,Key_IP
9497348,654285929.1570965,c0091724-912d-46da-884d-b3e606a47b33,2019-10-13 11:00:00,Mozilla/5.0 (Linux; Android 6.0.1; SM-A500FU) ...,50.260204,5.753629,True,,,,584302.0
11527288,957614937.157147,0de5cbf3-45f6-45c5-b3ec-57b4ee77d61c,2019-10-19 09:00:00,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6...,0.0,0.0,False,,,,632796.0
12997026,1739700094.1540537,794964bb-89eb-40aa-b05b-e5cc4a16186c,2019-10-15 13:00:00,Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:6...,0.0,0.0,False,,,,411106.0
9468375,1060492325.1570888,5bb7d205-1267-4b9d-8d04-7adff958a4ba,2019-10-13 10:00:00,Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:6...,0.0,0.0,False,,,,571801.0


In [6]:
# Count the distinct non-null values in the "accelerationX" column.
df["accelerationX"].nunique()

1325311

In [18]:
# Collapse the reading into a presence flag: True where a value exists, False where it is NaN
df["accelerationX"] = df["accelerationX"].notna()

In [19]:
# Re-count the distinct values in "accelerationX" now that it has been reduced to a presence flag.
df["accelerationX"].nunique()

2

In [20]:
# Rename the presence flag column from "accelerationX" to "moved_mobile"
df.rename({"accelerationX": "moved_mobile"}, axis="columns", inplace=True)

In [22]:
# Make sure the flag column is stored with a plain boolean dtype
df = df.astype({"moved_mobile": bool})

In [21]:
# Columns removed from the frame at this step
unused_columns = [
    "accelerationInclGravityX",
    "accelerationInclGravityY",
    "accelerationInclGravityZ",
    "accelerationY",
    "accelerationZ",
    "id",
    "orientationAlpha",
    "orientationBeta",
    "orientationGamma",
    "url",
]
df.drop(columns=unused_columns, inplace=True)

In [24]:
# Parse "dateHour" into real timestamps; the format string expects year, month, day and hour packed as YYYYMMDDHH
date_hour_format = "%Y%m%d%H"
df["dateHour"] = pd.to_datetime(df["dateHour"], format=date_hour_format)

In [32]:
# Count the rows with no googleId. Series.sum() is vectorized, whereas the
# built-in sum() walks the boolean mask element by element in Python, which is
# slow on a frame with millions of rows.
df["googleId"].isna().sum()

845621

In [35]:
# Inspect four of the rows whose googleId is missing. The fixed random_state
# keeps the displayed rows the same from run to run.
df.loc[df["googleId"].isna()].sample(4, random_state=0)

Unnamed: 0,googleId,hotjarId,dateHour,UserAgent,Latitude,Longitude,moved_mobile,rotationX,rotationY,rotationZ,Key_IP
12819699,,,2019-10-15 11:00:00,Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebK...,50.8333,4.3333,False,,,,
4328318,,,2019-10-11 09:00:00,Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...,50.871018,4.254664,False,,,,
6090114,,,2019-10-18 08:00:00,Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:69...,0.0,0.0,False,,,,
1644971,,,2019-10-17 07:00:00,Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7....,50.5011,3.6627,False,,,,410528.0


In [37]:
# Discard every row that lacks either identifier (rows without googleId tend to lack hotjarId too)
id_columns = ["googleId", "hotjarId"]
df.dropna(axis="index", how="any", subset=id_columns, inplace=True)

In [52]:
# Strip the leading Google Analytics cookie prefix (e.g. "GA1.2.") from "googleId".
# An anchored pattern is used instead of blindly slicing off the first 6
# characters: re-running the cell is then a no-op rather than eating 6 more
# characters of the id, and ids that do not carry the prefix are left intact.
# NOTE(review): assumes every prefix has the form GA<digits>.<digits>. -- confirm
# against the raw exports.
df["googleId"] = df["googleId"].str.replace(r"^GA\d+\.\d+\.", "", regex=True)

In [56]:
# Key_IP cannot be cast to an integer dtype while it contains NaN, so drop those rows first
df.dropna(axis="index", subset=["Key_IP"], inplace=True)
df = df.astype({"Key_IP": int})

In [60]:
# Drop the "UserAgent" column from the frame
df.drop(columns=["UserAgent"], inplace=True)

In [None]:
# Write the cleaned frame to disk, leaving the row index out of the file
clean_csv_path = "../data/dataset/clean_A_DB_NRJ.csv"
df.to_csv(clean_csv_path, index=False)

In [None]:
# Build a pandas-profiling report for the frame and save it as an HTML file.
# NOTE(review): profiling the full frame is presumably slow at this row count --
# consider profiling a sample first.
pp.ProfileReport(df).to_file('../data/report/clean_A_DB_NRJ-full.html')