In [75]:
import pandas_profiling as pp
import pandas as pd
import numpy as np
from tqdm import tqdm

import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline

# Use seaborn's 'whitegrid' style for the plots in this notebook.
sns.set_style('whitegrid')
pd.set_option('display.max_columns', None) # None removes the column limit, so wide frames are shown in full

import warnings
# NOTE(review): with no category or module argument this hides every warning for
# the rest of the session, including pandas' SettingWithCopyWarning -- consider
# narrowing the filter to the specific warnings that are known to be harmless.
warnings.filterwarnings('ignore')
import glob
import re

In [76]:
# Directory that holds the raw Google Analytics CSV exports.
path = r'../data/dataset/Google_Analytics' # use your path

# glob returns matches in arbitrary order; sorting makes the row order of the
# concatenated frame reproducible from run to run.
all_files = sorted(glob.glob(path + "/*.csv"))
if not all_files:
    # Fail early with a readable message instead of pandas'
    # "No objects to concatenate" raised by pd.concat on an empty list.
    raise FileNotFoundError("No CSV files found in " + path)

# Explicit dtypes for every column. The timestamp is read as a string here and
# parsed with an explicit format in a later cell.
ga_dtypes = {"ga:clientId":"str",
             "ga:pagepath":"str",
             "ga:dateHourMinute":"str",
             "ga:latitude":"float",
             "ga:longitude":"float",
             "ga:sourceMedium":"str",
             "ga:timeOnPage":"int"}

# One frame per export file, kept in `li` so the individual parts stay inspectable.
li = [
    pd.read_csv(filename, delimiter=',', encoding='utf-8', dtype=ga_dtypes)
    for filename in all_files
]

# Stack all exports into a single frame with a fresh 0..n-1 index.
df = pd.concat(li, axis=0, ignore_index=True)

In [88]:
# Summarise the frame: index range, per-column non-null counts, dtypes and memory use.
# NOTE(review): this cell carries execution count 88 while the cells around it are
# 76 and 78, and the captured output lists bool / category / datetime64 dtypes.
# It was therefore run after the cleaning cells further down, not right after
# loading -- re-run the notebook top to bottom to get a consistent picture.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 218404 entries, 4 to 295801
Data columns (total 7 columns):
ga:clientId          218404 non-null object
ga:pagepath          218404 non-null bool
ga:dateHourMinute    218404 non-null datetime64[ns]
ga:latitude          218404 non-null float64
ga:longitude         218404 non-null float64
ga:sourceMedium      218404 non-null category
ga:timeOnPage        218404 non-null int64
dtypes: bool(1), category(1), datetime64[ns](1), float64(2), int64(1), object(1)
memory usage: 10.4+ MB


In [78]:
# parse the minute-resolution timestamp strings (YYYYMMDDHHMM) into datetime64 values
df = df.assign(
    **{"ga:dateHourMinute": pd.to_datetime(df["ga:dateHourMinute"], format="%Y%m%d%H%M")}
)

In [79]:
# Only keep rows whose coordinates fall inside the Belgium bounding box.
# Zero coordinates lie outside these bounds, so they are removed by the same filter.
BE_LAT_MIN, BE_LAT_MAX = 49.398182, 51.824562
BE_LON_MIN, BE_LON_MAX = 2.120257, 6.398146

# between() is inclusive on both ends, matching the original >= / <= comparisons.
in_belgium = (
    df["ga:latitude"].between(BE_LAT_MIN, BE_LAT_MAX)
    & df["ga:longitude"].between(BE_LON_MIN, BE_LON_MAX)
)

# .copy() yields an independent frame, so the column assignments in later cells
# do not operate on a slice of the unfiltered data (SettingWithCopyWarning).
df = df[in_belgium].copy()

In [80]:
# Count the distinct raw page paths before they are grouped into categories.
df["ga:pagepath"].nunique()

6172

In [81]:
# Collapse the raw page paths into two labels, "live" and "static".
#   * a path containing "article" is always "static" (this check comes first);
#   * otherwise a path containing "player", "replay" or "radio" is "live";
#   * everything else is "static" -- this covers the former explicit rules for
#     "boost", "/index", "/classements", "grille", "emission" and "frequen",
#     which all produced the same label as the default.
# str.contains does a regex search like re.search; na=False treats missing
# paths as "static" instead of raising.
paths = df["ga:pagepath"]
is_article = paths.str.contains("article", na=False)
is_live = paths.str.contains("player|replay|radio", na=False)

# assign() returns a new frame, so this also works when df is a filtered slice.
df = df.assign(**{"ga:pagepath": np.where(is_live & ~is_article, "live", "static")})

In [82]:
# list every distinct page-path label, one per line
distinct_labels = df["ga:pagepath"].unique()
for label in distinct_labels:
    print(label)

static
live


In [83]:
# Encode the page-path label as a boolean flag: True for "live", False for "static".
# A plain astype('bool') cannot be used here: every non-empty string is truthy,
# so both labels would become True and the distinction would be lost.
df = df.assign(**{"ga:pagepath": df["ga:pagepath"].eq("live")})

In [84]:
# Count the distinct raw source/medium values before they are grouped into channels.
df["ga:sourceMedium"].nunique()

152

In [85]:
# Collapse the raw source/medium strings into three channel labels.
# The rules are evaluated in order and the first matching pattern wins.
channel_rules = [
    ("faceb", "referral"),
    ("googl", "google"),
    ("direct", "direct"),
    ("referral", "referral"),
]

source = df["ga:sourceMedium"]
# str.contains does a regex search like re.search; na=False sends missing
# values to the default label instead of raising.
conditions = [source.str.contains(pattern, na=False) for pattern, label in channel_rules]
labels = [label for pattern, label in channel_rules]

# Anything that matches no rule -- including "organic" sources -- is labelled "google".
# NOTE(review): the catch-all therefore counts every unmatched source as google;
# confirm that this is the intended grouping.
# assign() returns a new frame, so this also works when df is a filtered slice.
df = df.assign(**{"ga:sourceMedium": np.select(conditions, labels, default="google")})

In [86]:
# Show which channel labels are present after the mapping.
df["ga:sourceMedium"].unique()

array(['referral', 'google', 'direct'], dtype=object)

In [87]:
# store the channel labels as a pandas categorical column
df = df.assign(**{"ga:sourceMedium": df["ga:sourceMedium"].astype("category")})

In [89]:
# persist the cleaned frame as CSV, leaving out the row index
df.to_csv(
    path_or_buf="../data/dataset/clean_google_analytics.csv",
    index=False,
)

In [91]:
# build the profiling report for the full cleaned frame and write it out as HTML
full_report = pp.ProfileReport(df)
full_report.to_file('../data/report/cleaned_google_analytics-full.html')

In [90]:
# Optional alternative: profile a random sample of 2,500 rows instead of the full frame.
#pp.ProfileReport(df.sample(2500)).to_file('../data/report/cleaned_google_analytics.html')