In [1]:
%matplotlib inline
from collections import Counter
from pathlib import Path
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
import numpy as np
import os
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

# Set folder paths for outputs

In [2]:
import sys
# Add the directory three levels above the working directory to the module search
# path so that the `utils` and `analysis` packages imported in later cells resolve.
# NOTE(review): the path is relative, so it depends on the directory the kernel
# was started from.
sys.path.append("../../..")

In [3]:
# Display the module search path; the "../../.." entry appended above should be listed last.
sys.path

['/Users/armandoassembleia/Desktop/rsmotenc/analysis/data/car',
 '/opt/anaconda3/lib/python37.zip',
 '/opt/anaconda3/lib/python3.7',
 '/opt/anaconda3/lib/python3.7/lib-dynload',
 '',
 '/opt/anaconda3/lib/python3.7/site-packages',
 '/opt/anaconda3/lib/python3.7/site-packages/aeosa',
 '/opt/anaconda3/lib/python3.7/site-packages/locket-0.2.1-py3.7.egg',
 '/opt/anaconda3/lib/python3.7/site-packages/IPython/extensions',
 '/Users/armandoassembleia/.ipython',
 '../../..']

In [4]:
from utils.distMix import distmix
from utils.RSMOTENC import RSMOTENC
from utils.SMOTEENC import SMOTEENC
from utils.auxSamplingStudy import *

In [5]:
from analysis.data.car.config import DATA, MODELS, REPORTS, idbin, idcat, idnum

# Read the file

In [7]:
# Load the raw car dataset from the project's DATA directory.
car_csv_path = DATA / 'car.csv'
df = pd.read_csv(car_csv_path)

In [8]:
# List the column names of the freshly loaded frame.
df.columns

Index(['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety', 'target'], dtype='object')

In [9]:
# (rows, columns) of the raw dataset.
df.shape

(1728, 7)

# The target column has 4 labels

In [10]:
# Distinct class labels present in the target column.
df['target'].unique()

array(['unacc', 'acc', 'vgood', 'good'], dtype=object)

#### To feed the data to a machine learning method, it is convenient to convert strings to numeric values. So we are going to change these 4 target labels to values from 0 to 3.

In [11]:
# Encode the four string class labels as integers 0-3.
# The result is assigned back to the column instead of calling
# `replace(..., inplace=True)` on `df['target']`: that in-place call acts on an
# intermediate Series and is not guaranteed to update `df` (it has no effect
# under pandas Copy-on-Write).
target_codes = {'unacc': 0, 'acc': 1, 'vgood': 2, 'good': 3}
# `astype` makes the integer dtype explicit instead of relying on replace() to infer it;
# re-running the cell is harmless because already-encoded values are left untouched.
df['target'] = df['target'].replace(target_codes).astype('int64')


In [12]:
# Remove fully duplicated rows (the original index labels are kept).
df = df.drop_duplicates()

In [13]:
# Count missing values per column.
df.isna().sum()

buying      0
maint       0
doors       0
persons     0
lug_boot    0
safety      0
target      0
dtype: int64

In [14]:
# (rows, columns) after dropping duplicates; compare with the shape of the raw frame.
df.shape

(1728, 7)

In [15]:
# Column names after the target encoding and de-duplication steps.
df.columns

Index(['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety', 'target'], dtype='object')

# Inspect the feature values and the class distribution

In [16]:
# Print the distinct values of every categorical feature, one feature per line.
feature_names = ('buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety')
for feature in feature_names:
    print(df[feature].unique())

['vhigh' 'high' 'med' 'low']
['vhigh' 'high' 'med' 'low']
['2' '3' '4' '5more']
['2' '4' 'more']
['small' 'med' 'big']
['low' 'med' 'high']


In [17]:
# Number of rows per encoded class label.
Counter(df['target'])

Counter({0: 1210, 1: 384, 2: 65, 3: 69})

In [18]:
## Check the target label distribution in the dataset: class 0 is the majority class
## and there are very few observations for classes 2 and 3.
## That's why we are going to choose class 1 as the minority class for this study

In [19]:
# Keep only the rows of classes 0 and 1, which turns the task into a binary problem.
is_binary_class = df['target'].isin([0, 1])
df = df.loc[is_binary_class]

In [20]:
# Separate the predictors from the label.
# `columns=` replaces the positional axis argument (`df.drop('target', 1)`),
# which recent pandas versions reject.
df_feature = df.drop(columns='target')
# Double brackets keep the label as a one-column DataFrame rather than a Series.
df_target = df[['target']]

In [21]:
# (rows, columns) of the label frame after restricting to classes 0 and 1.
df_target.shape

(1594, 1)

In [22]:
# Column-wise sum; the target only holds 0/1 at this point, so this is the number of class-1 rows.
df_target.sum(axis=0)

target    384
dtype: int64

In [23]:
# Sum divided by the row count, i.e. the fraction of rows that belong to class 1.
df_target.sum(axis=0) / df_target.shape[0]

target    0.240903
dtype: float64

In [24]:
# Label-encode every categorical feature column into integer codes.
categorical_columns = ['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety']
feature_encoder = MultiColumnLabelEncoder(columns=categorical_columns)
encoded_df = feature_encoder.fit_transform(df_feature)


In [25]:
# Hold out 25% of the rows as the test set (75:25 split); the fixed seed keeps the split reproducible.
# NOTE(review): no `stratify` argument is passed, so class proportions may differ between the two sets.
split_parts = train_test_split(
    encoded_df,
    df_target,
    test_size=0.25,
    random_state=12,
)
X_train, X_test, y_train, y_test = split_parts

In [26]:
# Show the integer codes produced for each encoded feature column.
for column_name in encoded_df.columns:
    print(column_name, encoded_df[column_name].unique())

buying [3 0 2 1]
maint [3 0 2 1]
doors [0 1 2 3]
persons [0 1 2]
lug_boot [2 1 0]
safety [1 2 0]


In [27]:
# Class counts of the binary target (before the train/test split).
Counter(df_target['target'])

Counter({0: 1210, 1: 384})

In [28]:
# Remember the feature column names so they can be used when the frames are rebuilt later.
col_list = X_train.columns.tolist()

# Replace the shuffled row labels left by the split with a clean 0..n-1 index on every split.
for split_frame in (X_train, y_train, X_test, y_test):
    split_frame.index = pd.RangeIndex(len(split_frame.index))

In [29]:
## Apply a standard scaler to the features so that the Euclidean distance calculation in SMOTE is not biased
## (currently disabled: the scaling code below is commented out)

#sc = StandardScaler()
#X_train = sc.fit_transform(X_train)
#X_test = sc.transform(X_test)

In [30]:
# Rebuild both feature sets as DataFrames with a 0..n-1 index and the saved column names.
n_train_rows = len(X_train)
n_test_rows = len(X_test)
X_train = pd.DataFrame(X_train, index=range(n_train_rows), columns=col_list)
X_test = pd.DataFrame(X_test, index=range(n_test_rows), columns=col_list)

In [31]:
# Persist each split as a CSV file in the DATA directory, without writing the index column.
csv_outputs = [
    (X_train, "car_X_train.csv"),
    (y_train, "car_y_train.csv"),
    (X_test, "car_X_test.csv"),
    (y_test, "car_y_test.csv"),
]
for frame, file_name in csv_outputs:
    frame.to_csv(DATA / file_name, index=False)