In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# load data
# NOTE: the path is relative, so it is resolved against the kernel's working directory
original_df = pd.read_csv('../../data/data_exploration_df.csv')

# make a copy
# later cells pass `df` to the helper functions; `original_df` is not referenced again in the cells shown here
df = original_df.copy()

### functions

In [None]:
def create_event_dataframes(events_to_analyze, df):
    """Split ``df`` into one filtered dataframe per requested event.

    Parameters
    ----------
    events_to_analyze : iterable
        Values to look for in the ``event.key`` column.
    df : pandas.DataFrame
        Dataframe that contains an ``event.key`` column.

    Returns
    -------
    dict
        Maps each event to the rows of ``df`` whose ``event.key`` equals it,
        in the order the events were given. An event with no matching rows
        maps to an empty dataframe.

    A ``<event> dataframe created`` line is printed after each dataframe is built.
    """
    frames_by_event = {}
    for event_name in events_to_analyze:
        matches_event = df['event.key'] == event_name
        frames_by_event[event_name] = df[matches_event]
        print(event_name, 'dataframe created')
    return frames_by_event

def create_nan_event_dataframes(df):
    """Return the rows of ``df`` whose ``event.key`` value is missing (NaN).

    Prints ``NaN event dataframe created`` once the rows have been selected.
    """
    missing_event_mask = df['event.key'].isna()
    rows_without_event = df[missing_event_mask]
    print('NaN event dataframe created')
    return rows_without_event

def analyze_event_dataframes(event_dataframes, nan_event_df):
    """Print a short summary of every event dataframe and of the NaN-event dataframe.

    For each dataframe the summary contains, in order: a heading, the number of
    rows, the number of distinct ``imei`` values, and the per-column NaN counts.
    Event dataframes are followed by a blank-line separator; the NaN-event
    dataframe is printed last without one.

    Parameters
    ----------
    event_dataframes : dict
        Maps an event name to its dataframe; each must have an ``imei`` column.
    nan_event_df : pandas.DataFrame
        Dataframe of rows without an event; must have an ``imei`` column.
    """
    def report(frame, *heading):
        # `heading` is forwarded to print() unchanged so the header line is
        # formatted exactly as a direct print(...) call would format it
        print(*heading)
        print('Lenght:', len(frame))
        print('Number of unique imei values:', frame['imei'].nunique())
        print('-----------------------------------')
        print('Number of NaN values:')
        print(frame.isna().sum())

    for event, frame in event_dataframes.items():
        report(frame, event, 'dataframe')
        print('\n')
    report(nan_event_df, 'NaN event dataframe')

def describe_event_dataframes(event_dataframes, nan_event_df):
    """Print ``describe()`` for every event dataframe, then for the NaN-event dataframe.

    Each event block is a ``<event> dataframe`` heading, the ``describe()``
    table and a blank-line separator; the NaN-event block comes last and has
    no trailing separator.
    """
    for event, frame in event_dataframes.items():
        print(event, 'dataframe')
        print(frame.describe())
        print('\n')

    print('NaN event dataframe')
    print(nan_event_df.describe())

# function that plots the histograms of the columns_to_analyze for each event and also for the nan_event_df, but does not create one if the column has no values
def _show_column_histograms(frame, columns_to_analyze):
    """Show one histogram figure per listed column of ``frame`` that has at least one non-NaN value."""
    for column in columns_to_analyze:
        values = frame[column]
        # An all-NaN (or empty) column has nothing to bin, so no figure is made for it.
        if not values.notna().any():
            continue
        # Give every histogram its own figure/axes instead of relying on the
        # implicit current figure, so plots cannot stack on top of each other
        # on backends that keep the figure alive after show().
        fig, ax = plt.subplots()
        values.hist(ax=ax)
        ax.set_title(column)
        plt.show()
        # Release the figure; many are created in a row and they are not needed afterwards.
        plt.close(fig)


def plot_histograms(event_dataframes, nan_event_df, columns_to_analyze):
    """Plot histograms of the selected columns for every event dataframe and the NaN-event dataframe.

    A heading (``<event> dataframe`` / ``NaN event dataframe``) is printed
    before the figures of each dataframe. A column that holds only NaN values
    in a given dataframe is skipped for that dataframe; this now applies to
    ``nan_event_df`` as well, which previously got a figure for every column
    regardless of its content.

    Parameters
    ----------
    event_dataframes : dict
        Maps an event name to its dataframe.
    nan_event_df : pandas.DataFrame
        Dataframe of rows without an event.
    columns_to_analyze : iterable of str
        Column names to plot; each must exist in every dataframe.
    """
    for event, frame in event_dataframes.items():
        print(event, 'dataframe')
        _show_column_histograms(frame, columns_to_analyze)
    print('NaN event dataframe')
    _show_column_histograms(nan_event_df, columns_to_analyze)


### variables

In [None]:
# event.key values that each get their own dataframe
events_to_analyze = [
    'battery_info',
    'crash_data',
    'backup_to_main_battery',
    'main_to_backup_battery',
    'rest_to_motion',
    'motion_to_rest',
]

# columns that are plotted as histograms for every dataframe
columns_to_analyze = [
    'tracker.loc.sp',
    'tracker.metric.bbatp',
    'tracker.metric.bbatv',
    'tracker.metric.rssi',
    'device.metric.bmv',
    'device.metric.dactualsp',
    'device.metric.btemp',
]

### execute

In [None]:
# build a dict with one filtered dataframe per event in events_to_analyze
event_dataframes = create_event_dataframes(events_to_analyze, df)

In [None]:
# collect the rows whose event.key is NaN
nan_event_df = create_nan_event_dataframes(df)

In [None]:
# print row count, unique imei count and per-column NaN counts for each event dataframe and the NaN event dataframe
analyze_event_dataframes(event_dataframes, nan_event_df)

In [None]:
# print describe() for each event dataframe and the NaN event dataframe
describe_event_dataframes(event_dataframes, nan_event_df)

In [None]:
# plot histograms of columns_to_analyze for each event dataframe and the NaN event dataframe
plot_histograms(event_dataframes, nan_event_df, columns_to_analyze)

In [None]:
# show the row of the motion_to_rest dataframe that holds the highest tracker.loc.sp
# (.loc is given a callable, so the dataframe expression is written only once)
print('motion_to_rest dataframe')
print(event_dataframes['motion_to_rest'].loc[lambda frame: frame['tracker.loc.sp'].idxmax()])

In [None]:
# show the row of the NaN event dataframe that holds the highest tracker.loc.sp
# (.loc is given a callable that returns the index label of that row)
print('NaN event dataframe')
print(nan_event_df.loc[lambda frame: frame['tracker.loc.sp'].idxmax()])

In [None]:
# show the row of the NaN event dataframe that holds the highest device.metric.dactualsp
# (.loc is given a callable that returns the index label of that row)
print('NaN event dataframe')
print(nan_event_df.loc[lambda frame: frame['device.metric.dactualsp'].idxmax()])