In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder,StandardScaler

from sklearn import model_selection
from sklearn.model_selection import train_test_split

from sklearn.feature_selection import mutual_info_classif, f_classif, SelectKBest
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier

from xgboost import XGBClassifier
from pytorch_tabnet.tab_model import TabNetClassifier

from imblearn.over_sampling import SMOTE, ADASYN, BorderlineSMOTE

from src.custom_smote import CustomSMOTE
from src.experiments import tune_sampler_for_dataset, optimize_models_parameters, evaluate_models
from src.utils import csv_to_dict, TabTransformerClassifier, measure_default_times

In [2]:
# Load the raw car dataset. header=None because the first line of the file
# is read as data, so pandas assigns integer column labels; readable names
# are attached in a later cell.
car = pd.read_csv(filepath_or_buffer='./Datasets/car.data', header=None)

In [3]:
# The file was loaded without a header row, so attach the attribute names by
# hand. "class" is the column later split off as the prediction target.
car.columns = [
    "buying",
    "maint",
    "doors",
    "persons",
    "lug_boot",
    "safety",
    "class",
]

# Show the labelled frame.
car

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,class
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc
...,...,...,...,...,...,...,...
1723,low,low,5more,more,med,med,good
1724,low,low,5more,more,med,high,vgood
1725,low,low,5more,more,big,low,unacc
1726,low,low,5more,more,big,med,good


In [4]:
# Class balance of the target column: absolute frequency per class and the
# same figure expressed as a percentage of all rows.
counts = car["class"].value_counts()
percents = car["class"].value_counts(normalize=True).mul(100)

# One report line per class, in value_counts order (most frequent first).
for clase, count in zip(counts.index, counts):
    print(f"Clase {clase}: {count} instancias ({percents[clase]:.2f}%)")

Clase unacc: 1210 instancias (70.02%)
Clase acc: 384 instancias (22.22%)
Clase good: 69 instancias (3.99%)
Clase vgood: 65 instancias (3.76%)


In [5]:
# Print the frequency table of every column (features and target alike) to
# see which levels each attribute takes and how often.
for col, values in car.items():
    print(values.value_counts())

buying
vhigh    432
high     432
med      432
low      432
Name: count, dtype: int64
maint
vhigh    432
high     432
med      432
low      432
Name: count, dtype: int64
doors
2        432
3        432
4        432
5more    432
Name: count, dtype: int64
persons
2       576
4       576
more    576
Name: count, dtype: int64
lug_boot
small    576
med      576
big      576
Name: count, dtype: int64
safety
low     576
med     576
high    576
Name: count, dtype: int64
class
unacc    1210
acc       384
good       69
vgood      65
Name: count, dtype: int64


In [None]:
# Ordinal encodings: each categorical level is replaced by an integer that
# keeps the natural order of the levels (a higher code means a higher level).
# The open-ended levels '5more' (doors) and 'more' (persons) are coded as 5.
map_buying_maint = {'low': 0, 'med': 1, 'high': 2, 'vhigh': 3}
# Every value must be an int; a stray string such as '3' would make the
# mapped column a mixed object Series that only becomes numeric by coercion.
map_doors = {'2': 2, '3': 3, '4': 4, '5more': 5}
map_persons = {'2': 2, '4': 4, 'more': 5}
map_lug = {'small': 0, 'med': 1, 'big': 2}
map_safety = {'low': 0, 'med': 1, 'high': 2}
map_class = {'unacc': 0, 'acc': 1, 'good': 2, 'vgood': 3}

# Which mapping applies to which column ('buying' and 'maint' share one scale).
ordinal_maps = {
    'buying': map_buying_maint,
    'maint': map_buying_maint,
    'doors': map_doors,
    'persons': map_persons,
    'lug_boot': map_lug,
    'safety': map_safety,
    'class': map_class,
}

for column_name, level_codes in ordinal_maps.items():
    # A column that is already integer-coded was encoded by a previous run of
    # this cell. Mapping it again would turn every value into NaN and make
    # astype(int) fail, so skip it and keep the cell safe to re-run.
    if pd.api.types.is_integer_dtype(car[column_name]):
        continue
    car[column_name] = car[column_name].map(level_codes).astype(int)

# Show the encoded frame.
car

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,class
0,3,3,2,2,0,0,0
1,3,3,2,2,0,1,0
2,3,3,2,2,0,2,0
3,3,3,2,2,1,0,0
4,3,3,2,2,1,1,0
...,...,...,...,...,...,...,...
1723,0,0,5,5,1,1,2
1724,0,0,5,5,1,2,3
1725,0,0,5,5,2,0,0
1726,0,0,5,5,2,1,2


In [11]:
# Separate the target from the predictors: Y_car is the 'class' column,
# X_car is every other column of the frame.
Y_car = car.loc[:, 'class']
X_car = car.drop(labels='class', axis='columns')

In [12]:
# Hold-out split: 70% of the rows for training, 30% for testing.
test_size = 0.30
seed = 100  # fixed random_state so the split is reproducible

# Car dataset partitions (features / labels, train / test).
# NOTE(review): the split is not stratified. The class counts printed earlier
# show 'good' and 'vgood' at roughly 4% each, so consider passing
# stratify=Y_car to keep those proportions in both partitions. Confirm first
# that changing the split is acceptable for the downstream experiments.
X_train_car, X_test_car, Y_train_car, Y_test_car = train_test_split(
    X_car,
    Y_car,
    test_size=test_size,
    random_state=seed,
)