<a href="https://colab.research.google.com/github/bvm84/open_colab_ml/blob/master/BP_data_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
from IPython.display import clear_output
from google.colab import drive
drive.mount('/content/drive')
import sys
import os
import numpy as np
import pandas as pd
import seaborn as sns
import feather
import logging
from pathlib import PurePath
from pandas import DataFrame
sys.path.append('/content/drive/My Drive/Colab/l2_ML/')
% matplotlib inline
np.random.seed(0)
clear_output()

In [0]:
class Db():

    def __init__(self):
        self._df = None

    @staticmethod
    def create_df(wav_folder, l2_folder, pid, features_list):
        allowed_features = ('HR', 'MAB2', 'MAD3P', 'MADP', 'MASEP', 'MASEPMax', 'MASRP', 'MB0',
                            'MB1', 'MB2', 'MD3P', 'MDP', 'MFDP', 'MFSEP', 'MFSRP', 'MPI', 'MRR',
                            'MS4S1', 'MS5S2', 'MSEP', 'MSEPMax', 'MSNR', 'MSRP', 'MWaveType',
                            'MdD3P', 'MdDP', 'MdSEP', 'MdSEPMax', 'MdSRP', 'BR')
        for feature in features_list:
            if feature not in allowed_features:
                raise ValueError("Unknown feature: {0}, expected one of {1}".format(features_list, allowed_features))

        pid_cid_sys_dia_features = []
        wav_glob_ext = '*.wav'
        for filename in glob.iglob(os.path.join(wav_folder, '') + wav_glob_ext, recursive=True):
            pid = pid
            cid = os.path.splitext(PurePath(filename).parts[-1])[0].split('_')[0]
            sys = int(os.path.splitext(PurePath(filename).parts[-1])[0].split('_')[1])
            dia = int(os.path.splitext(PurePath(filename).parts[-1])[0].split('_')[2])
            json_filename = l2_folder.joinpath(cid).with_suffix('.json')
            with open(json_filename, 'r') as json_file:
                json_data = json.load(json_file)
            # pprint.pprint(json_data)
            features = []
            for feature in features_list:
                try:
                    features.append(json_data['value'][feature])
                except KeyError:
                    features.append(np.nan)
            pid_cid_sys_dia_features.append([pid, cid, sys, dia] + features)
            # print(cid_sys_dia_features)
        df = DataFrame(pid_cid_sys_dia_features,
                       columns=['pid', 'cid', 'sys', 'dia'] + features_list)
        return df

    @staticmethod
    def get_pid_dirs(db_folder):
        pid_dirs = []
        try:
            for item in db_folder.iterdir():
                # print(item)
                if item.is_dir():
                    pid_dirs.append(item)
            return pid_dirs
        except FileNotFoundError:
            print('Invalid directory')

    def create_dataframe(self, db_folder, features_list):
        dfs = []
        result_df = DataFrame()
        for item in self.get_pid_dirs(db_folder):
            wav_folder = item.joinpath('wav_out')
            l2_folder = item.joinpath('l2')
            pid = item.parts[-1]
            dfs.append(self.create_df(wav_folder, l2_folder, pid, features_list))
        result_df = pd.concat(dfs)
        self._df = result_df

    @property
    def df(self):
        return self._df

    @df.setter
    def set_df(self, df):
        if isinstance(df, pd.DataFrame):
            self._df = df
        else:
            print('Wrong dataframe')

    def save_dataframe(self, df_filename):
        try:
            feather.write_dataframe(self._df, df_filename)
        except Exception as e:
            logging.error(e)

    def read_dataframe(self, df_filename):
        try:
            self._df = feather.read_dataframe(df_filename)
        except Exception as e:
            logging.error(e)

    def get_pid_ncid_dict(self, threshold_n=20):
        pid_list = self._df['pid'].unique()
        print(pid_list)
        cid_count_list = []
        for pid_item in pid_list:
            temp_df = self._df.query('pid == @pid_item')
            cid_count_list.append(len(temp_df))
        pid_ncid_dict = {pid_list[i]: cid_count_list[i] for i in range(len(pid_list))}
        print(pid_ncid_dict)
        keys_to_delete = []
        for key, value in pid_ncid_dict.items():
            if value < threshold_n:
                keys_to_delete.append(key)
        for key in keys_to_delete:
            pid_ncid_dict.pop(key)
        print(pid_ncid_dict)
        return pid_ncid_dict

    def truncate_df(self, cid_quantity_tostore=20, do_filter=True):
        if do_filter:
            indexes_to_drop = []
        pid_ncid_dict = self.get_pid_ncid_dict()
        for key in pid_ncid_dict:
            temp_df = self._df.query('pid == @key')
            if len(temp_df) > cid_quantity_tostore:
                remove_n = len(temp_df) - cid_quantity_tostore
                drop_indexes = np.random.choice(temp_df.index, remove_n, replace=False)
                indexes_to_drop.append(drop_indexes)
        # print(len(np.concatenate(indexes_to_drop)))
        cleared_df = self._df.drop(np.concatenate(indexes_to_drop))
        print(len(pid_ncid_dict))
        print(cleared_df)

    def get_pid_df(self, pid):
        if pid not in self._df['pid'].unique():
            raise ValueError("No such pid in db: {0}, expected one of {1}".format(pid, self._df['pid'].unique()))
        else:
            return self._df.query('pid == @pid')

In [3]:
dataframe_filename = os.path.join('/content/drive/My Drive/Colab/l2_ML/all_json_df.file')
inst = Db()
inst.read_dataframe(dataframe_filename)
df = inst.df
print(df.columns)
print(df.head())
# df.describe()
df.drop(['MB1'], axis=1, inplace=True)
df.dropna(axis=0, how='any', inplace=True)
# df.describe()
index_0s = df.query('HR ==0').index
index_dia_min = df.query('dia < 40').index
index_dia_max = df.query('dia > 120').index
df.drop(index_0s, axis=0, inplace=True)
df.drop(index_dia_min, axis=0, inplace=True)
df.drop(index_dia_max, axis=0, inplace=True)

Index(['pid', 'cid', 'sys', 'dia', 'HR', 'MAB2', 'MAD3P', 'MADP', 'MASEP',
       'MASEPMax', 'MASRP', 'MB0', 'MB1', 'MB2', 'MD3P', 'MDP', 'MFDP',
       'MFSEP', 'MFSRP', 'MPI', 'MRR', 'MS4S1', 'MS5S2', 'MSEP', 'MSEPMax',
       'MSNR', 'MSRP', 'MWaveType', 'MdD3P', 'MdDP', 'MdSEP', 'MdSEPMax',
       'MdSRP', 'BR'],
      dtype='object')
    pid    cid  sys  dia     HR  ...   MdDP  MdSEP  MdSEPMax  MdSRP     BR
0  1003  46667  139   96  73.80  ...  818.0  816.0     825.0  819.0  0.327
1  1003  47085  123   87  69.28  ...  867.0  864.0     879.0  869.0  0.213
2  1003  47089  114   76  75.28  ...  797.0  797.0     797.0  797.0  0.423
3  1003  47164  120   63  74.91  ...  800.0  801.0     813.0  802.0  0.230
4  1003  47238  119   83  67.95  ...  890.0  883.0     890.0  887.0  0.253

[5 rows x 34 columns]


In [4]:
def sys_zone(value):
    if 110 <= value <= 130:
        return 'Normal'
    elif value <= 100:
        return 'Low'
    elif value >= 140:
        return 'High'
    else:
        return np.nan
def dia_zone(value):
    if 70 <= value <= 90:
        return 'Normal'
    elif value <= 60:
        return 'Low'
    elif value >= 100:
        return 'Low'
    else:
        return np.nan
# print(df.loc[df['dia']==50])
df['sys_labels'] = df['sys'].apply(sys_zone)
df['dia_labels'] = df['dia'].apply(dia_zone)
df_spaced = df.dropna()
df_spaced.describe()
df_4994 = df_spaced.query('pid=="4994"')
df_4994.describe()
print(df_4994)

       pid    cid  sys  dia  ...  MdSRP     BR  sys_labels  dia_labels
1324  4994  45888  120   80  ...  679.0  0.167      Normal      Normal
1325  4994  46277  140   85  ...  698.0  0.243        High      Normal
1326  4994  46314  142   74  ...  651.0  0.230        High      Normal
1328  4994  46456  146   81  ...  645.0  0.233        High      Normal
1329  4994  46467  143   81  ...  679.0  0.210        High      Normal
1331  4994  46638  142   76  ...  685.0  0.217        High      Normal
1332  4994  46682  143   82  ...  616.0  0.230        High      Normal
1334  4994  46752  124   73  ...  726.0  0.223      Normal      Normal
1335  4994  46771  140   74  ...  599.0  0.243        High      Normal
1336  4994  46790  141   78  ...  614.0  0.270        High      Normal
1337  4994  46799  141   86  ...  547.0  0.310        High      Normal
1338  4994  46855  111   70  ...  746.0  0.250      Normal      Normal
1340  4994  46886  110   75  ...  791.0  0.206      Normal      Normal
1343  

In [5]:
print(df_4994.columns)
df_4994.drop(['pid', 'cid', 'dia_labels'], axis=1, inplace=True)
# ax = sns.scatterplot(x="HR", y="sys_labels", data=df_4994)
# sns.pairplot(df_4994, hue="sys_labels")

Index(['pid', 'cid', 'sys', 'dia', 'HR', 'MAB2', 'MAD3P', 'MADP', 'MASEP',
       'MASEPMax', 'MASRP', 'MB0', 'MB2', 'MD3P', 'MDP', 'MFDP', 'MFSEP',
       'MFSRP', 'MPI', 'MRR', 'MS4S1', 'MS5S2', 'MSEP', 'MSEPMax', 'MSNR',
       'MSRP', 'MWaveType', 'MdD3P', 'MdDP', 'MdSEP', 'MdSEPMax', 'MdSRP',
       'BR', 'sys_labels', 'dia_labels'],
      dtype='object')


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


In [0]:
# sns.pairplot(df_4994, hue="sys_labels")

In [7]:
result = df_4994.corr()
print(type(result))
print(result)

<class 'pandas.core.frame.DataFrame'>
                sys       dia        HR  ...  MdSEPMax     MdSRP        BR
sys        1.000000  0.330680  0.481034  ... -0.467542 -0.482838  0.180842
dia        0.330680  1.000000  0.287617  ... -0.283931 -0.297247  0.072210
HR         0.481034  0.287617  1.000000  ... -0.980176 -0.990192  0.250975
MAB2      -0.252931 -0.232562 -0.076488  ...  0.103846  0.075981  0.080722
MAD3P     -0.260020 -0.230249 -0.071437  ...  0.099328  0.070953  0.081092
MADP      -0.279992 -0.255031 -0.287793  ...  0.353249  0.286550 -0.083141
MASEP     -0.267185 -0.252547 -0.060846  ...  0.066618  0.059166  0.121070
MASEPMax  -0.253556 -0.231737 -0.071404  ...  0.097253  0.070942  0.085773
MASRP     -0.238685 -0.217204 -0.142748  ...  0.205724  0.144966 -0.009329
MB0       -0.210133 -0.084087 -0.245954  ...  0.181951  0.221887  0.029780
MB2       -0.083832 -0.093720 -0.320189  ...  0.421064  0.313079 -0.258412
MD3P      -0.060188 -0.154353 -0.188882  ...  0.160169  0.2061