# Aplicando algoritmo basado en Árboles
Inicialmente se toma el archivo CSV obtenido en la fase de tratamiento de datos.

In [1]:
import pandas as pd

# Location and text format of the dataset produced by the data-preparation phase.
filename = 'bank_balanced.csv'
delimiter = ';'
encoding = 'iso-8859-1'

# Load the CSV into a DataFrame and preview its first rows.
bank_balanced = pd.read_csv(filename, encoding=encoding, delimiter=delimiter)
bank_balanced.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,campaign,previous,poutcome,y
0,58.0,management,married,tertiary,no,2143.0,yes,no,1.0,0.0,unknown,no
1,44.0,technician,single,secondary,no,29.0,yes,no,1.0,0.0,unknown,no
2,33.0,entrepreneur,married,secondary,no,2.0,yes,yes,1.0,0.0,unknown,no
3,47.0,blue-collar,married,unknown,no,1506.0,yes,no,1.0,0.0,unknown,no
4,33.0,unknown,single,unknown,no,1.0,no,no,1.0,0.0,unknown,no


## Aplicando Transformaciones

In [2]:
# List the distinct values of every categorical column before encoding them.
categorical_columns = ['job', 'marital', 'education', 'default',
                       'housing', 'loan', 'poutcome', 'y']
for column in categorical_columns:
    print(column + ': ', bank_balanced[column].unique())

job:  ['management' 'technician' 'entrepreneur' 'blue-collar' 'unknown'
 'retired' 'admin.' 'services' 'self-employed' 'unemployed' 'housemaid'
 'student' 'other']
marital:  ['married' 'single' 'divorced']
education:  ['tertiary' 'secondary' 'unknown' 'primary']
default:  ['no' 'yes']
housing:  ['yes' 'no']
loan:  ['no' 'yes']
poutcome:  ['unknown' 'failure' 'other' 'success']
y:  ['no' 'yes']


In [3]:
def bool_to_numeric(x):
    """Encode a 'no'/'yes' flag as 0/1.

    Returns None when ``x`` is neither 'no' nor 'yes'.
    """
    # The position of each label in the tuple is its numeric code.
    for code, label in enumerate(('no', 'yes')):
        if x == label:
            return code
    return None

def job_to_numeric(x):
    """Encode a job label as an integer code from 0 to 12.

    'unknown' maps to 0 and 'other' to 12; the remaining labels take the
    position they have in ``job_labels``. Returns None for any value that
    is not one of the known labels.
    """
    # The position of each label in the tuple is its numeric code.
    job_labels = (
        'unknown', 'management', 'technician', 'entrepreneur',
        'blue-collar', 'retired', 'admin.', 'services',
        'self-employed', 'unemployed', 'housemaid', 'student', 'other',
    )
    for code, label in enumerate(job_labels):
        if x == label:
            return code
    return None
    
def marital_to_numeric(x):
    """Encode marital status: 'married' -> 0, 'single' -> 1, 'divorced' -> 2.

    Returns None for any other value.
    """
    code = None
    if x == 'married':
        code = 0
    elif x == 'single':
        code = 1
    elif x == 'divorced':
        code = 2
    return code

def education_to_numeric(x):
    """Encode education level: 'unknown' -> 0, 'primary' -> 1,
    'secondary' -> 2, 'tertiary' -> 3.

    Returns None for any other value.
    """
    # The position of each label in the tuple is its numeric code.
    education_labels = ('unknown', 'primary', 'secondary', 'tertiary')
    for code, label in enumerate(education_labels):
        if x == label:
            return code
    return None

def poutcome_to_numeric(x):
    """Encode the 'poutcome' label: 'unknown' -> 0, 'failure' -> 1,
    'success' -> 2, 'other' -> 3.

    Returns None for any other value.
    """
    code = None
    if x == 'unknown':
        code = 0
    elif x == 'failure':
        code = 1
    elif x == 'success':
        code = 2
    elif x == 'other':
        code = 3
    return code

# Pair every categorical column with the converter that encodes it.
column_encoders = [
    ('job', job_to_numeric),
    ('marital', marital_to_numeric),
    ('education', education_to_numeric),
    ('default', bool_to_numeric),
    ('housing', bool_to_numeric),
    ('loan', bool_to_numeric),
    ('poutcome', poutcome_to_numeric),
    ('y', bool_to_numeric),
]

# NOTE: each column is overwritten in place. Re-running this cell without
# reloading the CSV would feed already-encoded numbers to the converters,
# which return None for values they do not recognise.
for column, encoder in column_encoders:
    bank_balanced[column] = bank_balanced[column].apply(encoder)

bank_balanced.head()


Unnamed: 0,age,job,marital,education,default,balance,housing,loan,campaign,previous,poutcome,y
0,58.0,1,0,3,0,2143.0,1,0,1.0,0.0,0,0
1,44.0,2,1,2,0,29.0,1,0,1.0,0.0,0,0
2,33.0,3,0,2,0,2.0,1,1,1.0,0.0,0,0
3,47.0,4,0,0,0,1506.0,1,0,1.0,0.0,0,0
4,33.0,0,1,0,0,1.0,0,0,1.0,0.0,0,0


In [4]:
# Re-inspect the same columns now that the converters have been applied.
for column in ('job', 'marital', 'education', 'default',
               'housing', 'loan', 'poutcome', 'y'):
    print('{}: '.format(column), bank_balanced[column].unique())

job:  [ 1  2  3  4  0  5  6  7  8  9 10 11 12]
marital:  [0 1 2]
education:  [3 2 0 1]
default:  [0 1]
housing:  [1 0]
loan:  [0 1]
poutcome:  [0 1 3 2]
y:  [0 1]


## Creando los conjuntos de entrenamiento y prueba
Se separan los datos en dos conjuntos: el primero contiene los datos utilizados durante el entrenamiento y el segundo se reserva para realizar las pruebas (75 % y 25 % de las filas, respectivamente).

In [5]:
from sklearn.model_selection import train_test_split

# Predictor columns fed to the model; 'y' is the label to predict.
feature_columns = [
    'age', 'job', 'marital', 'education', 'default', 'balance',
    'housing', 'loan', 'campaign', 'previous', 'poutcome',
]
bank_data = bank_balanced[feature_columns]
bank_target = bank_balanced['y']

# Split rows into training and test subsets; the fixed seed keeps the
# split repeatable across runs.
bank_train, bank_test, y_train, y_test = train_test_split(
    bank_data, bank_target, random_state=0
)

### Verificando tamaños

In [6]:
# Report the dimensions of each subset; the train and test row counts
# should match between features and labels.
shape_reports = (
    ("Bank_train shape: {}", bank_train),
    ("Y_train shape: {}", y_train),
    ("Bank_test shape: {}", bank_test),
    ("Y_test shape: {}", y_test),
)
for template, subset in shape_reports:
    print(template.format(subset.shape))

Bank_train shape: (59850, 11)
Y_train shape: (59850,)
Bank_test shape: (19950, 11)
Y_test shape: (19950,)


## Aplicando algoritmo

In [10]:
from sklearn import tree
# Fit a decision tree on the training subset.
# A fixed random_state makes the fitted tree (and the metrics computed from
# it) reproducible: scikit-learn permutes the candidate features at each
# split, so ties between equally good splits can otherwise be broken
# differently from one run to the next. The seed mirrors the one used for
# the train/test split.
clf = tree.DecisionTreeClassifier(random_state=0)
clf = clf.fit(bank_train, y_train)

In [23]:
# Predict the label of every row in the held-out test features.
y_predict = clf.predict(bank_test)
# Number of predictions produced (displayed as the cell output).
y_predict.size

19950

In [24]:
# Number of true test labels; it should equal the number of predictions.
y_test.size

19950

In [26]:
from sklearn.metrics import confusion_matrix
# Compare true test labels against the predictions. Per scikit-learn's
# documented layout, rows are the true classes and columns the predicted
# ones, ordered by label value (0 = 'no', 1 = 'yes' in this notebook).
confusion_matrix(y_test, y_predict)

array([[7956, 1865],
       [1356, 8773]], dtype=int64)