# Mysteries of the Cats : let's explore the data

In [1]:
# Imports
import pandas as pd
import numpy as np

# Plotly
import plotly.offline as py
import plotly.graph_objs as go
py.init_notebook_mode(connected=True)

In [2]:
# General settings
# Raw string: keeps every Windows backslash literal instead of relying on
# unrecognised escape sequences (e.g. "\S", "\m"), which newer Python
# versions report with a warning. The resulting value is unchanged.
dataPath = r'D:\ScalaProjects\b-yond\microservice\challenge-4\data'
file = 'moods_1000cats1sec1hour.csv'
# Full path of the CSV file, joined with a Windows separator
pathFile = f"{dataPath}\\{file}"
print(pathFile)

D:\ScalaProjects\b-yond\microservice\challenge-4\data\moods_1000cats1sec1hour.csv


## Pre-processing
To do the calculations, we first need to compute the number of cats in each mood at every time step.

As each cat is independent, mood changes can be shifted by a few seconds from one cat to another. Thus, we can't simply count the cats in each mood at second S; instead, we need to proceed second by second, applying the changes as they appear.

For example :

At t0, [mood 1 : 10 cats, mood 2 : 15 cats]

At t1, changes are {mood 1 -> mood 2 : 4, mood 2 -> mood 1 : 7} => [mood 1 : 13 cats, mood 2 : 12 cats]
- mood 1 -> mood 2 : 4 => 4 cats are leaving mood 1 (mood 1 : 10 - 4 = 6) and 4 cats are now mood 2 (mood 2 : 15 + 4 = 19)
- mood 2 -> mood 1 : 7 => 7 cats are leaving mood 2 (mood 2 : 19 - 7 = 12) and 7 cats are now mood 1 (mood 1 : 6 + 7 = 13)

**Idea for improvement:** this methodology could be run as a streaming process to avoid batch loading (which is very time-consuming and memory-hungry).

In [3]:
%%time
# Load the CSV, then express 'datetime' as whole seconds elapsed since the first row
raw_data = pd.read_csv(pathFile)
elapsed_ms = raw_data['datetime'] - raw_data['datetime'].iloc[0]
raw_data['datetime'] = (elapsed_ms / 1000).round()
raw_data.head()

Wall time: 2.32 s


In [4]:
# Number of rows loaded
raw_data.shape[0]

3600000

In [5]:
# Group data by (datetime, mood) in two matrices
## new_data : (datetime, mood) represents the cats going to mood (add X cats to mood)
## prev_data : (datetime, prev) represents the cats leaving the mood (remove X cats from mood)

def data2matrix(data, groupby_col, col_del):
    """Count rows per group and pivot the last grouping key into columns.

    Rows of ``data`` are grouped by ``groupby_col`` and counted, the count
    column ``col_del`` is discarded, and the innermost grouping key is
    unstacked so that each of its values becomes a column. Combinations that
    never occur are filled with 0. The column axis is named ``'mood_mvt'``.

    NOTE(review): this expects exactly one count column to remain once
    ``col_del`` is dropped, since the top column level is removed afterwards.
    """
    counts = data.groupby(groupby_col).count().drop(columns=col_del)
    wide = counts.unstack().fillna(0)
    # Keep only the unstacked key as column labels
    wide.columns = wide.columns.droplevel()
    wide.columns.name = 'mood_mvt'
    return wide

In [6]:
%%time
# New Data: rows counted per (datetime, mood), i.e. the cats entering each mood
new_data = data2matrix(data=raw_data, groupby_col=['datetime', 'mood'], col_del='prev')
new_data.head()

Wall time: 933 ms


In [7]:
%%time
# Prev Data: rows counted per (datetime, prev), i.e. the cats leaving each mood
prev_data = data2matrix(raw_data, ['datetime', 'prev'], 'mood')
# Initial state : no negative movement.
# Assign through .iloc rather than .values: writing into the array returned
# by .values only reaches the frame when that array is a writable view, and
# it is read-only under pandas Copy-on-Write.
prev_data.iloc[0] = 0
prev_data.head()

Wall time: 905 ms


In [10]:
%%time
# mood_mvt_data is the net change of each mood's head-count at each second:
# cats arriving (new_data) minus cats leaving (prev_data).
# A timestamp or mood present in only one of the two matrices counts as 0
# movement on the other side, so alignment never introduces NaN that would
# carry into the cumulative sum computed afterwards.
mood_mvt_data = new_data.sub(prev_data, fill_value=0).fillna(0)
mood_mvt_data.head()

Wall time: 997 µs


In [11]:
%%time
# Applying the per-second changes one after another amounts to a cumulative
# sum down the rows.
data = mood_mvt_data.cumsum(axis=0)
data.head()

Wall time: 988 µs


## Draw statistics

### Graph 1 : number of cats over time
A stacked bar chart is much more representative than a scatter plot here because of the randomness.

In [12]:
%%time
# One bar series per mood column, stacked on top of each other
g1_data = [
    go.Bar(x=data.index.values, y=data[mood].values, name=mood)
    for mood in data.columns
]

g1_layout = go.Layout(barmode='stack')

g1_fig = go.Figure(data=g1_data, layout=g1_layout)
py.iplot(g1_fig)

Wall time: 1.5 s


### Graph 2 : mean over time
For each time t, we calculate the mean over [0-t] interval (this is the evolution of the mean over time).
To speed up the computation and avoid re-summing the previous values at every step, we use cumsum().

In [13]:
%%time
# Running mean: cumulative sum up to each row divided by the number of rows
# seen so far (1, 2, 3, ...)
n_rows = np.arange(1, len(data) + 1)
avg_data = data.cumsum().div(n_rows, axis=0)
avg_data.head()

Wall time: 7.99 ms


In [16]:
%%time
# One line per mood showing the running mean
g2_data = [
    go.Scatter(x=avg_data.index.values, y=avg_data[mood].values, name=mood)
    for mood in avg_data.columns
]

g2_layout = go.Layout()

g2_fig = go.Figure(data=g2_data, layout=g2_layout)
py.iplot(g2_fig)

Wall time: 869 ms


### Graph 3 : variance over time
For each time t, we calculate the variance over [0-t] interval (this is the evolution of the variance over time).

In [17]:
%%time
# Running variance, using Variance = 1/n * SUM(data**2) - mean**2
n_rows = np.arange(1, len(avg_data) + 1)
# Mean of the squares: cumulative sum of squares divided by the row count
mean_of_squares = (data * data).cumsum().div(n_rows, axis=0)
# Subtract the square of the running mean
var_data = mean_of_squares - avg_data * avg_data
var_data.head()

Wall time: 11 ms


In [18]:
%%time
# One line per mood showing the running variance
g3_data = [
    go.Scatter(x=var_data.index.values, y=var_data[mood].values, name=mood)
    for mood in var_data.columns
]

g3_layout = go.Layout()

g3_fig = go.Figure(data=g3_data, layout=g3_layout)
py.iplot(g3_fig)

Wall time: 634 ms


### Graph 4 : boxplot overview

In [19]:
%%time
# One box per mood over the whole series; boxmean='sd' is passed to each box
g4_data = [
    go.Box(y=data[mood].values, name=mood, boxmean='sd')
    for mood in data.columns
]

g4_layout = go.Layout()

g4_fig = go.Figure(data=g4_data, layout=g4_layout)
py.iplot(g4_fig)

Wall time: 258 ms
