In [1]:
# Proyecto Final del Curso Statistical Learning
# Maestría en Ciencias de Datos
# Universidad Galileo, Guatemala

# Omar Meza
# Julio 2019

In [215]:
# Import Libraries

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import time
import tensorflow as tf
from sklearn import metrics
from sklearn import tree
import sklearn
import os
from datetime import datetime
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import LabelEncoder
from sklearn.externals import joblib
import math

In [498]:
# Read the input csv file using pandas
input = pd.read_csv("C:\python\SL\data_titanic_proyecto.csv")

In [499]:
# For reference, these are the columns' headers with their id

#0  'PassengerId': id 
#1  'Name': name
#2  'Age': age
#3  'SibSp': siblings/Spouse
#4  'Parch': Parent/Child
#5  'Ticket':
#6  'Fare',
#7  'Cabin', 
#8  'Embarked', 
#9  'passenger_class', 
#10 'passenger_sex',
#11 'passenger_survived']

In [500]:
# This will help us to know what variables are more significant for the model
# we can see the % of not present values per feature and the % of unique values per feature

pd.DataFrame({'Null %': input.isnull().sum() * 100 / len(input), 
              'Unique values %': input.apply(lambda x: x.unique().size/x.size*100)})

Unnamed: 0,Null %,Unique values %
PassengerId,0.0,100.0
Name,0.0,100.0
Age,19.86532,9.988777
SibSp,0.0,0.785634
Parch,0.0,0.785634
Ticket,0.0,76.430976
Fare,0.0,27.833895
Cabin,77.104377,16.61055
Embarked,0.224467,0.448934
passenger_class,0.0,0.3367


We can say that, the lowest it is the Unique values %, the better the feature will be.
  And, the lowest the Null % is, the better the feature is.

From the above we can see that:

(Col0): PassengerId is always different for each passenger, so it would not be useful for predicting.

(Col1): Name is always different for each passenger, not useful for predicting

(Col7): Cabin has 77% of nulls, so can't help too much

We will need to remove the columns that are not helpful for the prediction, and for the ones that are useful, transform them in case there are Null values on them.

In [501]:
# For the features with Null % >0, fill the missing values 
# Null Age will be filled with the Age's mean in the input file
# Embarked will be marked with an 'X' for the missing values

def handle_nulls(df):
    df.Age.fillna(value=round(df.Age.mean(),1),inplace=True)
    df.Embarked.fillna(value='X',inplace=True)
    df.Age.astype(int)
    
    return(df)

In [502]:
# Transform categorical features into numerical codes
def handle_categorical(df):
    df.passenger_sex = pd.Categorical(df.passenger_sex)
    df.passenger_sex = df.passenger_sex.cat.codes
    
    df.passenger_class = pd.Categorical(df.passenger_class)
    df.passenger_class = df.passenger_class.cat.codes
    
    df.Embarked = pd.Categorical(df.Embarked)
    df.Embarked = df.Embarked.cat.codes
    
    df.passenger_survived = pd.Categorical(df.passenger_survived)
    df.passenger_survived = df.passenger_survived.cat.codes
    
    # Create categories for Age's ranges: 0-8, 9-15, 16-18, 19-25, 26-40, 41-60, 61-100
    bins = [0, 8, 15, 18, 25, 40, 60, 100]
    #names = ['1', '2', '3', '4', '5', '6', '7']
    names = [1,2,3,4,5,6,7]
    df['Age_rng'] = pd.cut(df['Age'], bins, labels = names)
    
    return(df)

In [503]:
# Feature Engineering to add more features that could describe better the target

def feature_eng(df):
    df['Family_Size'] = df['SibSp'] + df['Parch']
    df['Fare_Per_Person'] = df['Fare']/(df['Family_Size']+1)
    df['Age_Class'] = df['Age']*df['passenger_class']
    df['Age_Fare'] = df['Age']*df['Fare']
    df['Family_Size_x_class'] = df['Family_Size']*df['passenger_class']
    return(df)

In [504]:
# Remove the features that we are not take into account

def discriminate_features(df):
    features = list(df.columns.values)  #names of columns in dataframe
    features.remove('passenger_survived')  # the target variable (to predict)
    features.remove('PassengerId')         # Remove all the not useful features
    features.remove('Name')
    features.remove('Cabin')
    features.remove('Ticket')
    #features.remove('Age')
    return(features)

In [505]:
# Clean the data

# Handle nulls in feature's values, according to what feature it is
df = input
df = handle_nulls(df)

# Handle categorical values and assign a numeric value for each category
df = handle_categorical(df)

# Create some extra features to make the model more accurate
df = feature_eng(df)

# Isolate the features that are going to be included for the model calculation
features = discriminate_features(df)
#features

In [506]:
# Prepare the data that will be used to train, validate and test

# Separate input data into train and test sets, 80% for train and 20% for test
#X_train, X_test, Y_train, Y_test = train_test_split(df[features], df.passenger_survived, test_size=0.2)
X_train, X_test, Y_train, Y_test = train_test_split(df[features], input.iloc[:, 11:12].values, test_size=0.2)


# Separate train data into train and validation sets, 80% and 20% again, out of the 80% calculated above
X_train, X_validate, Y_train, Y_validate = train_test_split(X_train, Y_train, test_size=0.2)

In [507]:
# This function was taken from https://www.youtube.com/watch?v=eWswOZbSoCA
def save_experiment(df, file):
    if not os.path.isfile(file):
        # create file if it does not exist
        df.to_csv(file)
    else: 
        # add dataframe information
        df.to_csv(file, mode='a', header=False)

In [408]:
X_train[features]

Unnamed: 0,Age,SibSp,Parch,Fare,Embarked,passenger_class,passenger_sex,Age_rng,Family_Size,Fare_Per_Person,Age_Class,Age_Fare,Family_Size_x_class
82,29.70,0,0,7.7875,1,0,0,5,0,7.787500,0.0,231.288750,0
824,2.00,4,1,39.6875,2,0,1,1,5,6.614583,0.0,79.375000,0
94,59.00,0,0,7.2500,2,0,1,6,0,7.250000,0.0,427.750000,0
650,29.70,0,0,7.8958,2,0,1,5,0,7.895800,0.0,234.505260,0
367,29.70,0,0,7.2292,0,0,0,5,0,7.229200,0.0,214.707240,0
616,34.00,1,1,14.4000,2,0,1,5,2,4.800000,0.0,489.600000,0
141,22.00,0,0,7.7500,2,0,0,4,0,7.750000,0.0,170.500000,0
864,24.00,0,0,13.0000,2,1,1,4,0,13.000000,24.0,312.000000,0
174,56.00,0,0,30.6958,0,2,1,6,0,30.695800,112.0,1718.964800,0
238,19.00,0,0,10.5000,2,1,1,4,0,10.500000,19.0,199.500000,0


In [410]:
# Results for the target variable
Y_train

array([[1],
       [0],
       [0],
       [0],
       [1],
       [0],
       [1],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [1],
       [1],
       [0],
       [0],
       [1],
       [0],
       [0],
       [0],
       [1],
       [0],
       [0],
       [0],
       [0],
       [1],
       [0],
       [1],
       [1],
       [0],
       [0],
       [0],
       [0],
       [0],
       [1],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [1],
       [1],
       [1],
       [0],
       [0],
       [0],
       [1],
       [1],
       [0],
       [1],
       [0],
       [1],
       [0],
       [0],
       [1],
       [0],
       [1],
       [0],
       [0],
       [0],
       [0],
       [1],
       [1],
       [0],
       [1],
       [0],
       [1],
       [0],
       [0],
       [0],
       [1],
       [0],
       [1],
       [0],
       [1],
       [0],
       [1],
       [0],
       [0],
    

# SVM Algorithm

In [307]:
# Definition of the Support Vector Machine Training function

from sklearn.svm import SVC

def train_SVM(X, Y, x_validate, y_validate, C=1, Kernel='linear', Gamma=10):
    start_time = time.time()
    clf = SVC(C=C, kernel=Kernel, gamma=Gamma)
    #clf.gamma='scale'
    clf.fit(X,Y)
    
    # predict y based on validation data
    y_pred = clf.predict(x_validate)
    
     # Get some KPIs for the model
    score = clf.score(x_validate, y_validate)
    accuracy = metrics.accuracy_score(y_validate, y_pred)    
    recall = metrics.recall_score(y_validate, y_pred)
    precision = metrics.precision_score(y_validate, y_pred)
    
    
    output = {
                'kernel':[Kernel],
                'c':[C],
                'gamma':[Gamma],
                'score':[score],
                'accuracy':[accuracy],
                'recall':[recall],
                'precision':[precision]
            }
    
    df_output = pd.DataFrame(output)
    save_experiment(df_output, 'train_SVM.csv')
    joblib.dump(clf, 'python_obj_output\clf_SVM.pkl')
    print("--- Model processed in: %s seconds ---" % (time.time() - start_time))
    print(clf)
    return(y_pred, clf)
    
    
    #return(clf)

In [308]:


y_pred, SVM_model = train_SVM(X_train, Y_train, X_validate, Y_validate, C=1, Kernel='linear', Gamma=12)

  y = column_or_1d(y, warn=True)


--- Model processed in: 87.13598370552063 seconds ---
SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=12, kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)


In [312]:
print("Score: ", SVM_model.score(X_validate, Y_validate))

Score:  0.8531468531468531


In [313]:
print("RESULTS:\n\n")
print("Score: ", DecisionTree_Model['score'])
print("Accuracy:", DecisionTree_Model['accuracy'])
print("Recall:", DecisionTree_Model['recall'])
print("Precision:", DecisionTree_Model['precision'])
print("\nAlgorithm prediction: ", list(y_pred))

RESULTS:


Score:  0    0.762238
Name: score, dtype: float64
Accuracy: 0    0.762238
Name: accuracy, dtype: float64
Recall: 0    0.711864
Name: recall, dtype: float64
Precision: 0    0.711864
Name: precision, dtype: float64

Algorithm prediction:  [0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1]


# Decision Tree Model

In [320]:
#from sklearn.datasets import load_iris # datos de iris
from sklearn.tree import DecisionTreeClassifier # árbol de decisión para clasificación

def train_DecisionTree(X, Y, x_validate, y_validate):
    start_time = time.time()
    clf = DecisionTreeClassifier() 
    clf.fit(X_train, Y_train) # entrenamiento del árbol
    #print(tree)
    
    # predict y based on validation data
    y_pred = clf.predict(x_validate)
    
     # Get some KPIs for the model
    score = clf.score(x_validate, y_validate)
    accuracy = metrics.accuracy_score(y_validate, y_pred)    
    recall = metrics.recall_score(y_validate, y_pred)
    precision = metrics.precision_score(y_validate, y_pred)
    
    
    output = {
                'score':[score],
                'accuracy':[accuracy],
                'recall':[recall],
                'precision':[precision]
            }
    
    df_output = pd.DataFrame(output)
    save_experiment(df_output, 'train_DecisionTree.csv')
    joblib.dump(clf, 'python_obj_output\clf_decisionTree.pkl')
    print("--- Model processed in: %s seconds ---" % (time.time() - start_time))
    print(clf)
    return(y_pred, df_output) 
    

In [321]:
y_pred, DecisionTree_Model = train_DecisionTree(X_train, Y_train, X_validate, Y_validate)

--- Model processed in: 0.012000560760498047 seconds ---
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')


In [323]:
#DecisionTree_Model.score(X_validate, Y_validate)
print("RESULTS:\n\n")
print("Score: ", DecisionTree_Model['score'])
print("Accuracy:", DecisionTree_Model['accuracy'])
print("Recall:", DecisionTree_Model['recall'])
print("Precision:", DecisionTree_Model['precision'])
print("\nAlgorithm prediction: ", list(y_pred))

RESULTS:


Score:  0    0.825175
Name: score, dtype: float64
Accuracy: 0    0.825175
Name: accuracy, dtype: float64
Recall: 0    0.79661
Name: recall, dtype: float64
Precision: 0    0.783333
Name: precision, dtype: float64

Algorithm prediction:  [0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1]


In [440]:
def train_naive_bayes(x_train, y_train, y_name):
    y_total_rows = y_train.shape[0]
    dif = y_train.unique().tolist()
    for c in dif:
        p = y_train[y_train == c].count() / y_total_rows
        x = x.append({'col': y_name, 'x': c, 'prob': p}, ignore_index=True)    
    xr = x_train.shape[0]
    x_colnames = x_train.cols
    for col in x_colnames:
        col_distinct_values = x_train[col].unique().tolist()
        for c in col_distinct_values:
            cs = x_train[col]
            p = cs[cs == c].count() / xr     
    x_colnames = x_train.cols
    ret = pd.DataFrame(cols=['col', 'x', 'prob'])
    for col in x_cols:
        cv = x_train[col].unique().tolist()
        for c in col_distinct_values:
            prob = x[(x['y']==y_value) & (x['col']==col)             
            p2 = x[(x['c']==col) & (x['x']==c)]            
            n = prob['prob'].iloc[0] * y_p
            d = p2['prob'].iloc[0]
            ret = ret.append({'col': col, 'x': c, 'prob': n/d)
    
    return ret

# Logistic Regression

In [423]:
def train_Logistic(x_train, y_train, epochs, lr=0.0001, printEach=100, batch_size_=32):
    batch_size = batch_size_   #32#x_train.shape[0]# batch_size_
    feature_count = x_train.shape[1]
    label_count = 1

    training_epochs = epochs
    learning_rate = lr
    hidden_layers = feature_count - 1
    cost_history = np.empty(shape=[1],dtype=float)

    X = tf.placeholder(tf.float32,[batch_size,feature_count])
    Y = tf.placeholder(tf.float32,[batch_size,label_count])
    
    b = tf.Variable(tf.zeros([batch_size,label_count]))
    W = tf.Variable(tf.ones([feature_count,1])) 
    
    #init = tf.contrib.layers.xavier_initializer()
    h0 = tf.layers.dense(X, hidden_layers, activation=tf.nn.relu, kernel_initializer=initializer)
    h1 = tf.layers.dense(h0, label_count, activation=None)
         
    Y_hat = tf.nn.sigmoid(tf.add(tf.matmul(X, W), b)) 
    init = tf.global_variables_initializer()
    
    #learning_rate2 = tf.compat.v1.train.exponential_decay(learning_rate,global_step,1000, 0.90, staircase=True)
    
    cross_entropy = tf.reduce_mean(-(Y*tf.log(Y_hat)+(1-Y)*tf.log(1-Y_hat)))
    cross_entropy = tf.nn.sigmoid_cross_entropy_with_logits(labels=Y, logits=h1)
    #cross_entropy = tf.nn.sigmoid_cross_entropy_with_logits(labels=Y, logits=Y_hat)
    cost = tf.reduce_mean(cross_entropy)
    optimizer = tf.train.GradientDescentOptimizer(learning_rate = learning_rate).minimize(cost) 
    #optimizer =tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost)

    predicted = tf.nn.sigmoid(h1)
    correct_pred = tf.equal(tf.round(predicted), Y)
    accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
    
    #batch_size=32
    n_items=X_train.shape[0]

    with tf.Session() as sess:
        sess.run(init)
    
        for step in range(training_epochs + 1):
            for i in range(0, n_items, batch_size):
                #print(i,i+batch_size)
                if (i+batch_size>n_items):
                    i=n_items-batch_size
                    
                Y_send=y_train[i:i+batch_size]
                X_send=x_train[i:i+batch_size]
                #sess.run(optimizer, feed_dict={X: X_send, Y: Y_send})
                loss, _, acc = sess.run([cost, optimizer, accuracy], feed_dict={
                                         X: X_send, Y: Y_send})

            if step % printEach  == 0:
                print("Epoch: {:5}\tLoss: {:.3f}\tAcc: {:.2%}".format(step, loss, acc))
                
            cost_history = np.append(cost_history, acc)     
    return sess

In [424]:
x=train_Logistic(X_train, Y_train, 1000, 0.00001, 50)

Epoch:     0	Loss: 4.279	Acc: 78.12%
Epoch:    50	Loss: 3.538	Acc: 78.12%
Epoch:   100	Loss: 3.070	Acc: 71.88%
Epoch:   150	Loss: 3.145	Acc: 75.00%
Epoch:   200	Loss: 2.943	Acc: 75.00%
Epoch:   250	Loss: 2.124	Acc: 78.12%
Epoch:   300	Loss: 3.611	Acc: 56.25%
Epoch:   350	Loss: 3.549	Acc: 43.75%
Epoch:   400	Loss: 3.513	Acc: 37.50%
Epoch:   450	Loss: 3.356	Acc: 40.62%
Epoch:   500	Loss: 3.396	Acc: 37.50%
Epoch:   550	Loss: 3.505	Acc: 37.50%
Epoch:   600	Loss: 3.686	Acc: 34.38%
Epoch:   650	Loss: 3.650	Acc: 37.50%
Epoch:   700	Loss: 3.503	Acc: 37.50%
Epoch:   750	Loss: 4.375	Acc: 34.38%
Epoch:   800	Loss: 2.350	Acc: 68.75%
Epoch:   850	Loss: 2.142	Acc: 75.00%
Epoch:   900	Loss: 2.660	Acc: 62.50%
Epoch:   950	Loss: 4.429	Acc: 40.62%
Epoch:  1000	Loss: 2.133	Acc: 71.88%


# Ejecución de los 4 algoritmos para obtener las predicciones

In [324]:
y_pred_SVM, SVM_model = train_SVM(X_train, Y_train, X_validate, Y_validate, C=1, Kernel='linear', Gamma=12)

  y = column_or_1d(y, warn=True)


--- Model processed in: 89.86314010620117 seconds ---
SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=12, kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)


In [329]:
y_pred_DecisionTree, DecisionTree_Model = train_DecisionTree(X_train, Y_train, X_validate, Y_validate)

--- Model processed in: 0.017000913619995117 seconds ---
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')


In [330]:
y_pred_SVM

array([0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0,
       0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0,
       1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0,
       1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1,
       1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0,
       1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1], dtype=int8)

In [331]:
y_pred_DecisionTree

array([1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0,
       1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0,
       1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0,
       1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0,
       1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0,
       1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1], dtype=int8)

In [432]:
temp = y_pred_SVM + y_pred_DecisionTree

In [454]:
y_pred_DecisionTree

array([1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0,
       1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0,
       1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0,
       1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0,
       1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0,
       1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1], dtype=int8)

In [441]:
def runAlgorithms(X, Y):
    y_pred_DecisionTree, DecisionTree_Model = train_DecisionTree(X, Y, X_validate, Y_validate)
    y_pred_SVM, SVM_model = train_SVM(X_train, Y_train, X_validate, Y_validate, C=1, Kernel='linear', Gamma=12)
    res = y_pred_SVM + y_pred_DecisionTree 
    return (res)

In [563]:
res=runAlgorithms(X_test, Y_test)

--- Model processed in: 0.012000799179077148 seconds ---
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')


  y = column_or_1d(y, warn=True)


--- Model processed in: 62.94960045814514 seconds ---
SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=12, kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)


In [564]:
# Esta es el resultado final, es necesario sacar la moda
res

array([0, 1, 0, 0, 0, 2, 0, 2, 2, 2, 2, 0, 2, 2, 0, 1, 1, 1, 0, 0, 0, 0,
       1, 2, 1, 0, 2, 0, 2, 0, 0, 1, 1, 2, 2, 0, 2, 2, 0, 0, 2, 2, 0, 2,
       0, 2, 1, 0, 2, 0, 1, 0, 0, 2, 0, 0, 2, 2, 0, 0, 2, 1, 1, 2, 1, 0,
       0, 0, 2, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 2, 0, 1, 0, 1, 0, 2, 0, 0,
       0, 2, 0, 2, 0, 0, 2, 1, 0, 2, 0, 2, 2, 0, 0, 2, 0, 0, 0, 1, 2, 2,
       0, 2, 1, 2, 2, 2, 1, 1, 2, 1, 0, 0, 1, 0, 2, 2, 1, 0, 1, 1, 0, 2,
       1, 2, 2, 1, 1, 2, 1, 0, 2, 0, 0], dtype=int8)

In [567]:
# dependiendo de "la moda" ese será el resultado final de la combinación de modelos

res[res==1]=0
res[res==2]=1
res[res==3]=1
res[res==4]=1
#res

In [569]:
# Este es el resultado final de la predicción
# para los valores de prueba
res

array([0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1,
       0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1,
       0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0], dtype=int8)

# K Fold Cross Validation

Esta es una técnica utilizada cuando se cuenta con un dataset muy pequeño y que por ende tiene pocas observaciones, consiste en utilizar varias veces (k veces) la misma data, pero cada una de estas veces se realiza un split distinto de las observaciones, por lo que en cada una de las K veces que el dataset sea utilizado, tendrá diferentes observaciones para Training y Testing.

Para este proyecto podría haber sido útil dado que no se cuenta con un número tan grande de observaciones, se pudo haber realizado varios experimentos con la misma data, pero dividida de forma diferente en cada una de las veces, por lo que hubieramos podido obtener K modelos.  Finalmente hubiésemos tenido que decidir cuál de los K modelos era el que mejor describia al dataset.

# Muestreo Bootstrap
Este es útil cuando se necesita generar muchas sub muestras de la muestra principal, es decir, si se tiene una muestra total (población) de 1000 observaciones, al realizar bootstrap se indica cuántas sub muestras se quieren generar a partir de la muestra principal.  Estas muestras pueden ser con o sin reemplazo, es decir, puede haber observaciones repetidas dentro de la misma sub muestra, sin embargo, esto no afecta el resultado.  
En este proyecto no fue necesario utilizarlo, sin embargo, puede ser útil en caso se quiera generar varios modelos a partir de una misma muestra, y al final elegir el mejor de todos, resultado de hacer fit sobre cada una de las sub muestras.

# Conclusiones

* Con este proyecto se pudo poner en práctica diferentes técnicas que se vieron durante el curso, tales como Ensemble Learning y de qué forma se vuelven modelos colaborativos entre ellos para brindar un solo resultado final.
* A diferencia de las tareas de clase, en donde ya se nos daba la data lista para trabajar, durante este proyecto tuvimos que poner en práctica tareas de limpieza de data, ya que el dataset original contenía información incompleta y algunas features poco útiles para nuestro propósito, por lo que tuvimos que llenar los datos vacios y dejar atrás las featues innecesarias.
* Algo que debo mencionar aunque no esté del todo ligado al proyecto es que, este y los demás proyectos de Econometría, sumado a la carga laboral, definitivamente nos hicieron dar todo lo posible y tratar de administrar de mejor forma nuestro tiempo.  Dejo esto como algo aprendido durante este trimestre.
* Aprendi de qué forma puede guardarse persistentemente un modelo que fue generado anteriormente, sin necesidad de tener que entrenarlo nuevamente.


# Dificultades:

* Debo mencionar que al igual que durante la tarea de regresión logística, volví a tener dificultades para armar la función de entrenamiento en tensorflow para ésta, realicé varias pruebas y finalmente dejé el mejor resultado que obtuve durante los días que estuve realizando el proyecto.
* Para la realización del algoritmo de Naive Bayes, estuve un poco confundido, ya que en el video de ésta clase se realiza un ejemplo co GaussianNB de Sklearn, sin embargo, en el proyecto habia que utilizar un enfoque distinto al visto en clase, lo cual causó un poco de confusión.


# Lecciones aprendidas:

* Aprendi que debo organizar de mejor forma mi tiempo, ya que por querer abarcar un poco en varias cosas al final hacía nada en todas.  Esta definitivamente es una lección aprendida, al final logré organizar un poco mejor mi tiempo, sin embargo, si lo hubiera hecho desde el inicio no hubiera tenido mayores inconvenientes.
* Pase lo que pase, definitivamente me voy a inscribir en algún curso de Tensorflow, si bien logré defenderme en las tareas, sé que no lo domino y que por ser la base de todo, tengo que aprenderlo para no depender de Google en su aplicación.
