In [1]:
import pandas_profiling as pp
import pandas as pd
import numpy as np
from tqdm import tqdm

import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline

sns.set_style('whitegrid')
pd.set_option('display.max_columns', None) # display all columns

import warnings
warnings.filterwarnings('ignore')
import glob
import re

In [2]:
path = r'../data/dataset/A_DB_NRJ' # use your path
all_files = glob.glob(path + "/*.csv")
li = []

for filename in all_files:
    df = pd.read_csv(filename,
                     delimiter = ',',
                     encoding='utf-8')
    li.append(df)

df = pd.concat(li, axis=0, ignore_index=True)

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20704510 entries, 0 to 20704509
Data columns (total 21 columns):
id                          int64
googleId                    object
hotjarId                    object
url                         object
dateHour                    int64
UserAgent                   object
Latitude                    float64
Longitude                   float64
orientationAlpha            float64
orientationBeta             float64
orientationGamma            float64
accelerationX               float64
accelerationY               float64
accelerationZ               float64
accelerationInclGravityX    float64
accelerationInclGravityY    float64
accelerationInclGravityZ    float64
rotationX                   float64
rotationY                   float64
rotationZ                   float64
Key_IP                      float64
dtypes: float64(15), int64(2), object(4)
memory usage: 3.2+ GB


In [4]:
# show sparse/dense
df.ftypes

id                            int64:dense
googleId                     object:dense
hotjarId                     object:dense
url                          object:dense
dateHour                      int64:dense
UserAgent                    object:dense
Latitude                    float64:dense
Longitude                   float64:dense
orientationAlpha            float64:dense
orientationBeta             float64:dense
orientationGamma            float64:dense
accelerationX               float64:dense
accelerationY               float64:dense
accelerationZ               float64:dense
accelerationInclGravityX    float64:dense
accelerationInclGravityY    float64:dense
accelerationInclGravityZ    float64:dense
rotationX                   float64:dense
rotationY                   float64:dense
rotationZ                   float64:dense
Key_IP                      float64:dense
dtype: object

In [5]:
df.sample(4)

Unnamed: 0,id,googleId,hotjarId,url,dateHour,UserAgent,Latitude,Longitude,orientationAlpha,orientationBeta,orientationGamma,accelerationX,accelerationY,accelerationZ,accelerationInclGravityX,accelerationInclGravityY,accelerationInclGravityZ,rotationX,rotationY,rotationZ,Key_IP
18161373,964937,GA1.2.110452929.1562495723,5c114fbf-1a95-40d5-80cf-b073a6bd74a8,/radioplayer,2019100813,Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:6...,50.835582,4.311017,,,,,,,,,,,,,499353.0
3436600,491143,GA1.2.2004138789.1564560840,26f9002e-68f4-49bc-bb3a-32d6763ba6ae,/radioplayer,2019101213,Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,480284.0
10798423,910738,GA1.2.1704884302.1516050824,6251b74f-7ad9-4f1c-a4a1-5f4d7baf1796,/radioplayer/NRJhits2000,2019101413,Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:6...,0.0,0.0,,,,,,,,,,,,,453455.0
12583785,514395,GA1.2.1948254940.1555333031,6f9efd28-1abd-4028-8ece-3e9fc201c08d,/radioplayer,2019101509,Mozilla/5.0 (Windows NT 6.1; Win64; x64) Apple...,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,


In [6]:
df["accelerationX"].nunique()

1325311

In [7]:
# if value make 1, if NaN make 0
df["accelerationX"] = ~df["accelerationX"].isna()

In [8]:
df["accelerationX"].nunique()

2

In [9]:
# rename
df.rename(columns={'accelerationX': 'moved_mobile'}, inplace=True)

In [10]:
df = df.astype({'moved_mobile':'bool'})

In [11]:
# drop these features:
df.drop(["accelerationInclGravityX",
         "accelerationInclGravityY",
         "accelerationInclGravityZ",
         "accelerationY",
         "accelerationZ",
         "rotationX",
         "rotationY",
         "rotationZ",
         "Latitude",
         "Longitude",
         "id",
         "orientationAlpha",
         "orientationBeta",
         "orientationGamma",
         "url"], axis=1, inplace=True)

In [12]:
# change to datetime format
df["dateHour"] = pd.to_datetime(df["dateHour"], format="%Y%m%d%H")

In [13]:
sum(df["googleId"].isna())

845621

In [14]:
df[df["googleId"].isna()].sample(4)

Unnamed: 0,googleId,hotjarId,dateHour,UserAgent,moved_mobile,Key_IP
19467920,,,2019-10-09 12:00:00,Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...,False,492774.0
17574827,,,2019-10-08 08:00:00,Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:6...,False,409909.0
13696934,,,2019-10-07 07:00:00,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15) ...,False,478829.0
19668973,,,2019-10-09 14:00:00,Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...,False,492774.0


In [15]:
# remove the missing rows, hotjarid is also missing...
df.dropna(subset=['googleId', 'hotjarId'], inplace=True, axis=0)

In [16]:
# let's remove the first few characters ... GA1.2. from "googleId" trick is to drop the first 6 characters
df["googleId"] = df["googleId"].str[6:]

In [17]:
# remove all missing values for the keys, then change the values in integers
df.dropna(subset=['Key_IP'], inplace=True, axis=0)
df = df.astype({"Key_IP":'int'})

In [18]:
# we can remove useragent:
df.drop(["UserAgent"], axis=1, inplace=True)

In [26]:
df.to_csv("../data/dataset/clean_A_DB_NRJ.csv", index=False)

In [19]:
pp.ProfileReport(df.sample(3500)).to_file('../data/report/clean_A_DB_NRJ-full.html')