<a href="https://colab.research.google.com/github/bvm84/open_colab_ml/blob/master/Dataframe_prepare.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
from IPython.display import clear_output
from google.colab import drive
drive.mount('/content/drive')
import sys
import os
import numpy as np
import pandas as pd
import seaborn as sns
import feather
import logging
from pathlib import PurePath
from pandas import DataFrame
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2, f_regression
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.linear_model import LinearRegression
sys.path.append('/content/drive/My Drive/Colab/l2_ML/')
% matplotlib inline
np.random.seed(0)
clear_output()

In [0]:
class Db():

    def __init__(self):
        self._df = None

    @staticmethod
    def create_df(wav_folder, l2_folder, pid, features_list):
        allowed_features = ('HR', 'MAB2', 'MAD3P', 'MADP', 'MASEP', 'MASEPMax', 'MASRP', 'MB0',
                            'MB1', 'MB2', 'MD3P', 'MDP', 'MFDP', 'MFSEP', 'MFSRP', 'MPI', 'MRR',
                            'MS4S1', 'MS5S2', 'MSEP', 'MSEPMax', 'MSNR', 'MSRP', 'MWaveType',
                            'MdD3P', 'MdDP', 'MdSEP', 'MdSEPMax', 'MdSRP', 'BR')
        for feature in features_list:
            if feature not in allowed_features:
                raise ValueError("Unknown feature: {0}, expected one of {1}".format(features_list, allowed_features))

        pid_cid_sys_dia_features = []
        wav_glob_ext = '*.wav'
        for filename in glob.iglob(os.path.join(wav_folder, '') + wav_glob_ext, recursive=True):
            pid = pid
            cid = os.path.splitext(PurePath(filename).parts[-1])[0].split('_')[0]
            sys = int(os.path.splitext(PurePath(filename).parts[-1])[0].split('_')[1])
            dia = int(os.path.splitext(PurePath(filename).parts[-1])[0].split('_')[2])
            json_filename = l2_folder.joinpath(cid).with_suffix('.json')
            with open(json_filename, 'r') as json_file:
                json_data = json.load(json_file)
            # pprint.pprint(json_data)
            features = []
            for feature in features_list:
                try:
                    features.append(json_data['value'][feature])
                except KeyError:
                    features.append(np.nan)
            pid_cid_sys_dia_features.append([pid, cid, sys, dia] + features)
            # print(cid_sys_dia_features)
        df = DataFrame(pid_cid_sys_dia_features,
                       columns=['pid', 'cid', 'sys', 'dia'] + features_list)
        return df

    @staticmethod
    def get_pid_dirs(db_folder):
        pid_dirs = []
        try:
            for item in db_folder.iterdir():
                # print(item)
                if item.is_dir():
                    pid_dirs.append(item)
            return pid_dirs
        except FileNotFoundError:
            print('Invalid directory')

    def create_dataframe(self, db_folder, features_list):
        dfs = []
        result_df = DataFrame()
        for item in self.get_pid_dirs(db_folder):
            wav_folder = item.joinpath('wav_out')
            l2_folder = item.joinpath('l2')
            pid = item.parts[-1]
            dfs.append(self.create_df(wav_folder, l2_folder, pid, features_list))
        result_df = pd.concat(dfs)
        self._df = result_df

    @property
    def df(self):
        return self._df

    @df.setter
    def set_df(self, df):
        if isinstance(df, pd.DataFrame):
            self._df = df
        else:
            print('Wrong dataframe')

    def save_dataframe(self, df_filename):
        try:
            feather.write_dataframe(self._df, df_filename)
        except Exception as e:
            logging.error(e)

    def read_dataframe(self, df_filename):
        try:
            self._df = feather.read_dataframe(df_filename)
        except Exception as e:
            logging.error(e)

    def get_pid_ncid_dict(self, threshold_n=20):
        pid_list = self._df['pid'].unique()
        print(pid_list)
        cid_count_list = []
        for pid_item in pid_list:
            temp_df = self._df.query('pid == @pid_item')
            cid_count_list.append(len(temp_df))
        pid_ncid_dict = {pid_list[i]: cid_count_list[i] for i in range(len(pid_list))}
        print(pid_ncid_dict)
        keys_to_delete = []
        for key, value in pid_ncid_dict.items():
            if value < threshold_n:
                keys_to_delete.append(key)
        for key in keys_to_delete:
            pid_ncid_dict.pop(key)
        print(pid_ncid_dict)
        return pid_ncid_dict

    def truncate_df(self, cid_quantity_tostore=20, do_filter=True):
        if do_filter:
            indexes_to_drop = []
        pid_ncid_dict = self.get_pid_ncid_dict()
        for key in pid_ncid_dict:
            temp_df = self._df.query('pid == @key')
            if len(temp_df) > cid_quantity_tostore:
                remove_n = len(temp_df) - cid_quantity_tostore
                drop_indexes = np.random.choice(temp_df.index, remove_n, replace=False)
                indexes_to_drop.append(drop_indexes)
        # print(len(np.concatenate(indexes_to_drop)))
        cleared_df = self._df.drop(np.concatenate(indexes_to_drop))
        print(len(pid_ncid_dict))
        print(cleared_df)

    def get_pid_df(self, pid):
        if pid not in self._df['pid'].unique():
            raise ValueError("No such pid in db: {0}, expected one of {1}".format(pid, self._df['pid'].unique()))
        else:
            return self._df.query('pid == @pid')

In [27]:
dataframe_filename = os.path.join('/content/drive/My Drive/Colab/l2_ML/all_json_df.file')
inst = Db()
inst.read_dataframe(dataframe_filename)
df = inst.df
print(df.columns)
print(df.head())
# df.describe()
df.drop(['MB1'], axis=1, inplace=True)
df.dropna(axis=0, how='any', inplace=True)
# df.describe()
index_0s = df.query('HR ==0').index
index_dia_min = df.query('dia < 40').index
index_dia_max = df.query('dia > 120').index
df.drop(index_0s, axis=0, inplace=True)
df.drop(index_dia_min, axis=0, inplace=True)
df.drop(index_dia_max, axis=0, inplace=True)

Index(['pid', 'cid', 'sys', 'dia', 'HR', 'MAB2', 'MAD3P', 'MADP', 'MASEP',
       'MASEPMax', 'MASRP', 'MB0', 'MB1', 'MB2', 'MD3P', 'MDP', 'MFDP',
       'MFSEP', 'MFSRP', 'MPI', 'MRR', 'MS4S1', 'MS5S2', 'MSEP', 'MSEPMax',
       'MSNR', 'MSRP', 'MWaveType', 'MdD3P', 'MdDP', 'MdSEP', 'MdSEPMax',
       'MdSRP', 'BR'],
      dtype='object')
    pid    cid  sys  dia     HR  ...   MdDP  MdSEP  MdSEPMax  MdSRP     BR
0  1003  46667  139   96  73.80  ...  818.0  816.0     825.0  819.0  0.327
1  1003  47085  123   87  69.28  ...  867.0  864.0     879.0  869.0  0.213
2  1003  47089  114   76  75.28  ...  797.0  797.0     797.0  797.0  0.423
3  1003  47164  120   63  74.91  ...  800.0  801.0     813.0  802.0  0.230
4  1003  47238  119   83  67.95  ...  890.0  883.0     890.0  887.0  0.253

[5 rows x 34 columns]


In [28]:
def sys_zone(value):
    if 110 <= value <= 130:
        return 'Normal'
    elif value <= 100:
        return 'Low'
    elif value >= 140:
        return 'High'
    else:
        return np.nan
def dia_zone(value):
    if 70 <= value <= 90:
        return 'Normal'
    elif value <= 60:
        return 'Low'
    elif value >= 100:
        return 'Low'
    else:
        return np.nan
# print(df.loc[df['dia']==50])
df['sys_labels'] = df['sys'].apply(sys_zone)
df['dia_labels'] = df['dia'].apply(dia_zone)
df_spaced = df.dropna()
df_spaced.describe()
df_4994 = df_spaced.query('pid=="4994"')
df_4994.describe()
print(df_4994)

       pid    cid  sys  dia  ...  MdSRP     BR  sys_labels  dia_labels
1324  4994  45888  120   80  ...  679.0  0.167      Normal      Normal
1325  4994  46277  140   85  ...  698.0  0.243        High      Normal
1326  4994  46314  142   74  ...  651.0  0.230        High      Normal
1328  4994  46456  146   81  ...  645.0  0.233        High      Normal
1329  4994  46467  143   81  ...  679.0  0.210        High      Normal
1331  4994  46638  142   76  ...  685.0  0.217        High      Normal
1332  4994  46682  143   82  ...  616.0  0.230        High      Normal
1334  4994  46752  124   73  ...  726.0  0.223      Normal      Normal
1335  4994  46771  140   74  ...  599.0  0.243        High      Normal
1336  4994  46790  141   78  ...  614.0  0.270        High      Normal
1337  4994  46799  141   86  ...  547.0  0.310        High      Normal
1338  4994  46855  111   70  ...  746.0  0.250      Normal      Normal
1340  4994  46886  110   75  ...  791.0  0.206      Normal      Normal
1343  

In [29]:
print(df_4994.columns)
#df_4994.drop(['pid', 'cid', 'dia_labels'], axis=1, inplace=True)
# ax = sns.scatterplot(x="HR", y="sys_labels", data=df_4994)
# sns.pairplot(df_4994, hue="sys_labels")

Index(['pid', 'cid', 'sys', 'dia', 'HR', 'MAB2', 'MAD3P', 'MADP', 'MASEP',
       'MASEPMax', 'MASRP', 'MB0', 'MB2', 'MD3P', 'MDP', 'MFDP', 'MFSEP',
       'MFSRP', 'MPI', 'MRR', 'MS4S1', 'MS5S2', 'MSEP', 'MSEPMax', 'MSNR',
       'MSRP', 'MWaveType', 'MdD3P', 'MdDP', 'MdSEP', 'MdSEPMax', 'MdSRP',
       'BR', 'sys_labels', 'dia_labels'],
      dtype='object')


In [0]:
# sns.pairplot(df_4994, hue="sys_labels")

In [31]:
result = df_4994.corr()
features = df_4994.drop(['pid', 'cid', 'sys', 'dia', 'sys_labels', 'dia_labels'], axis=1)
result = features.corr()
print(type(result))
corr_threshold = 0.98
drop_list =  []
for index, row in result.iterrows():
    # print(row)
    for index_ser, value in row.iteritems():

        if index != index_ser and abs(value) > corr_threshold and index_ser not in ['HR']:
            drop_list.append(index_ser)
df_test = DataFrame({'names': drop_list})
uniques = df_test['names'].unique()
print(uniques)
cleared_features = features.drop(uniques, axis=1)
'''
for column_name, ser in result.iteritems():
    if column_name != index and 

print(cleared_features.corr())
cleared_features['sys'] = df_4994['sys']
cleared_features['sys_labels'] = df_4994['sys_labels']
sns.pairplot(cleared_features, hue="sys_labels")
'''

<class 'pandas.core.frame.DataFrame'>
['MRR' 'MdD3P' 'MdDP' 'MdSEP' 'MdSEPMax' 'MdSRP' 'MAD3P' 'MASEP'
 'MASEPMax' 'MAB2']


'\nfor column_name, ser in result.iteritems():\n    if column_name != index and \n\nprint(cleared_features.corr())\ncleared_features[\'sys\'] = df_4994[\'sys\']\ncleared_features[\'sys_labels\'] = df_4994[\'sys_labels\']\nsns.pairplot(cleared_features, hue="sys_labels")\n'

In [32]:
X = cleared_features
print(X.head())
y = df_4994['sys']
clf = SelectKBest(f_regression, k=5)
X_new = clf.fit_transform(X, y)
print(clf.get_support(indices=True))
X_new = DataFrame(data = X_new, columns = X.columns[clf.get_support(indices=True)])
print(X_new)
print(y)
# clf = LinearRegression()
clf = RandomForestRegressor(random_state=0)
cross_val_scores = cross_val_score(clf, X_new, y, cv=5, scoring='neg_mean_squared_error')
# import sklearn
# print(sorted(sklearn.metrics.SCORERS.keys()))
print(cross_val_scores)
print(cross_val_scores.mean())
params = {'n_estimators': range(10, 50, 10), 'max_depth': range(1, 12, 2), \
          'min_samples_leaf': range(1, 7, 1), 'min_samples_split': range(2, 9, 2)}
search = GridSearchCV(clf, param_grid = params, n_jobs=-1, cv=5)
search.fit(X_new, y)
cross_val_scores = cross_val_score(search.best_estimator_, X_new, y, cv=5, scoring='neg_mean_squared_error')
print(cross_val_scores)
print(cross_val_scores.mean())

         HR     MADP    MASRP   MB0  ...  MSNR   MSRP  MWaveType     BR
1324  89.55  141.716  249.529  58.0  ...  20.0  206.0        2.0  0.167
1325  85.84  481.296  554.263  58.0  ...  25.0  216.0        1.0  0.243
1326  92.02  537.754  662.637  55.0  ...  30.0  207.0        1.0  0.230
1328  92.88  194.704  243.614  57.0  ...  20.0  212.0        1.0  0.233
1329  88.50  460.682  521.475  56.0  ...  25.0  211.0        1.0  0.210

[5 rows x 19 columns]
[ 0  1  6 15 16]
        HR     MADP    MDP  MSNR   MSRP
0    89.55  141.716  321.0  20.0  206.0
1    85.84  481.296  321.0  25.0  216.0
2    92.02  537.754  311.0  30.0  207.0
3    92.88  194.704  318.0  20.0  212.0
4    88.50  460.682  312.0  25.0  211.0
5    87.59  510.256  314.0  30.0  208.0
6    97.09  191.977  311.0  20.0  203.0
7    82.87  174.312  318.0  20.0  221.0
8   100.17  462.508  310.0  30.0  192.0
9    98.36  385.840  299.0  25.0  191.0
10  109.69  397.465  315.0  30.0  196.0
11   80.65  475.427  337.0  30.0  228.0
12   75.



[-90.37347086 -74.59568277 -93.7600134  -97.73583517 -69.25589825]
-85.14418009051471




In [36]:
print(search.best_estimator_.get_params)
max_cross_val_scores = cross_val_score(search.best_estimator_, X_new, y, cv=5, scoring='max_error')
print(max_cross_val_scores)
print(max_cross_val_scores.mean())

<bound method BaseEstimator.get_params of RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=5,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=6, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=40,
                      n_jobs=None, oob_score=False, random_state=0, verbose=0,
                      warm_start=False)>
[-21.94153957 -18.89768726 -18.61098803 -18.5180254  -19.30833347]
-19.45531474571928
