In [2]:
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import scipy.io
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from tqdm import tqdm

# Plot styling: start from seaborn's 'talk' context ('paper' is the alternative
# the author noted) and override individual rc parameters on top of it.
sns.set_context(
    'talk',
    rc={
        # All of these text elements share the same 10pt size.
        **dict.fromkeys(
            (
                'axes.labelsize',
                'axes.titlesize',
                'xtick.labelsize',
                'ytick.labelsize',
                'legend.fontsize',
            ),
            10.0,
        ),
        'font.size': 12.0,
        'legend.title_fontsize': 12.0,
        'patch.linewidth': 2.0,
    },
)

# Names of the two data partitions.
data_sets = ['Train', 'Test']

In [3]:
# Show the kernel's current working directory; any relative path used in the
# notebook is resolved against this location.
os.getcwd()

'/'

In [4]:
# Location of the MATLAB dataset file. The default is the author's local path;
# set the SPEECH_DATASET_PATH environment variable to point elsewhere without
# editing the notebook.
data_path = os.environ.get(
    'SPEECH_DATASET_PATH',
    '/Users/ernestocolacrai/Documents/GitHub/StatisticalLearning/data/speech_dataset.mat'
)

try:
    data = scipy.io.loadmat(data_path)
except FileNotFoundError:
    # Only a missing file is reported as "not found"; any other failure (e.g. a
    # corrupt file) propagates with its own traceback. Re-raise so the notebook
    # stops here instead of failing later with a NameError on `data`.
    print(f"Not found data! ({data_path})")
    raise
else:
    print("Data ✓")
    print(f"Data Keys: {data.keys()}")

Data ✓
Data Keys: dict_keys(['__header__', '__version__', '__globals__', 'dataset'])


In [13]:
# Build the working frame from the first six columns of the loaded 'dataset'
# array: columns 0-4 become f1..f5 and column 5 becomes the label.
data_df = pd.DataFrame({
    column: data['dataset'][:, position]
    for position, column in enumerate(('f1', 'f2', 'f3', 'f4', 'f5', 'label'))
})

# Preview the first rows.
data_df.head()

Unnamed: 0,f1,f2,f3,f4,f5,label
0,0.489927,-0.451528,-1.04799,-0.598693,-0.020418,1.0
1,-0.641265,0.109245,0.29213,-0.916804,0.240223,1.0
2,0.870593,-0.459862,0.578159,0.806634,0.835248,1.0
3,-0.628439,-0.316284,1.934295,-1.427099,-0.136583,1.0
4,-0.596399,0.015938,2.043206,-1.688448,-0.948127,1.0


In [15]:
# Inspect the column dtypes of the freshly built frame.
data_df.dtypes

f1       float64
f2       float64
f3       float64
f4       float64
f5       float64
label    float64
dtype: object

In [19]:
# Store the class label as a small integer instead of a float.
data_df['label'] = data_df['label'].astype('int8')

In [21]:
# Re-check the dtypes to confirm the label column is now int8.
data_df.dtypes

f1       float64
f2       float64
f3       float64
f4       float64
f5       float64
label       int8
dtype: object

In [24]:
# Count missing values per column.
data_df.isnull().sum(axis='index')

f1       0
f2       0
f3       0
f4       0
f5       0
label    0
dtype: int64

In [35]:
# np.sqrt(np.sum((data_df.loc[0][:-1] - data_df.loc[1][:-1]) ** 2))
len(data_df)

5404

In [53]:
# Remove rows that are exact duplicates across every column (f1..f5 and label),
# keeping the first occurrence. The surviving rows keep their original index
# labels. To inspect the rows being removed, display
# data_df[data_df.duplicated(keep='first')] before running this step.
data_df = data_df.drop_duplicates()

# Number of rows left after de-duplication.
len(data_df)

5395

In [54]:
# Share of each class in the de-duplicated data, in percent.
data_df['label'].value_counts().div(len(data_df)).mul(100)

1    70.769231
2    29.230769
Name: label, dtype: float64

In [92]:
# Seed NumPy's global RNG; `sample` is called without `random_state`, so both
# draws below depend on this seed and on the order in which they run.
np.random.seed(1)

# Draw the training rows class by class: 71% of the label-1 rows first, then
# 29% of the label-2 rows.
# NOTE(review): the fractions mirror the overall class shares (~71% / ~29%), so
# the two classes are sampled at different rates and the training set does not
# keep the original class balance -- confirm this is intended.
train_set = pd.concat([
    data_df.loc[data_df['label'] == class_label].sample(frac=fraction)
    for class_label, fraction in ((1, 0.71), (2, 0.29))
])

In [93]:
# Class counts among the rows that were NOT drawn into the training set.
data_df.loc[~data_df.index.isin(train_set.index), 'label'].value_counts()

2    1120
1    1107
Name: label, dtype: int64

In [95]:
# Share of each class inside the training set, in percent.
train_set['label'].value_counts().mul(100).div(len(train_set))

1    85.574495
2    14.425505
Name: label, dtype: float64