In [34]:
import pandas as pd
from sklearn.metrics import accuracy_score
import numpy as np
from collections import Counter
from sklearn.datasets import load_iris
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.model_selection import cross_validate
from sklearn.tree import DecisionTreeClassifier, plot_tree

def entropia(y):
    entropia = 0
    classes = np.unique(y)
    
    for classe in classes:
        proporcao = np.mean(y == classe)                
        entropia -= proporcao * np.log2(proporcao)
    
    return entropia

def ganho(x, y):

    media_valores = np.mean(x)               
    maiores = x > media_valores     
    total = sum(maiores) + sum(~maiores)    
    prop_maiores = sum(maiores)/total
    prop_menores = sum(~maiores)/total
    '''print('entropia geral: ',entropia(y))
    print('prop_maiores: ', prop_maiores) 
    print('prop_menores: ', prop_menores)
    print('entropia(y[maiores]): ', entropia(y[maiores]))
    print('entropia(y[~maiores]): ', entropia(y[~maiores]))    '''
    return entropia(y) - prop_maiores*entropia(y[maiores]) - prop_menores*entropia(y[~maiores])

def caracteristica_maior_ganho(X, y):
    maior_ganho = 0
    caracteristica = 0
    for i in range(X.shape[1]):
        gain = ganho(X[:,i], y)        
        
        if gain > maior_ganho:
            maior_ganho = gain
            caracteristica = i
    
    return caracteristica

def maisFrequente(y):
    return Counter(y.flat).most_common(1)[0][0]

class MinhaArvore(BaseEstimator, ClassifierMixin):    
    def fit(self, X, y):        
        self.caracteristica = caracteristica_maior_ganho(X, y)        
        self.valor = np.mean(X[:,self.caracteristica])        
        maiores = X[:,self.caracteristica] > self.valor        
        
        if sum(maiores)>0 and sum(~maiores)>0:
            self.maiores = MinhaArvore()
            self.maiores.fit(X[maiores,:],y[maiores])            
            self.menores = MinhaArvore()
            self.menores.fit(X[~maiores,:],y[~maiores])            
        else:
            self.resposta = maisFrequente(y)
    
    def predict(self, X):
        y = np.empty((X.shape[0]))
        if hasattr(self, 'resposta'):
            y[:] = self.resposta            
        else:            
            maiores = X[:,self.caracteristica] > self.valor
            y[maiores] = self.maiores.predict(X[maiores,:])
            y[~maiores] = self.menores.predict(X[~maiores,:])
        return y

In [35]:
import pandas as pd
data = pd.read_csv('aula5_titanic/train.csv')
data.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

In [36]:
y = data['Survived']
y

0      0
1      1
2      1
3      1
4      0
      ..
886    0
887    1
888    0
889    1
890    0
Name: Survived, Length: 891, dtype: int64

In [37]:
X = data.drop('Survived',axis=1)
X.columns

Index(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [38]:
for column in X.columns:
    print(f"{column:>12}: {len(set(X[column])):4} {X[column].dtype}")

 PassengerId:  891 int64
      Pclass:    3 int64
        Name:  891 object
         Sex:    2 object
         Age:  265 float64
       SibSp:    7 int64
       Parch:    7 int64
      Ticket:  681 object
        Fare:  248 float64
       Cabin:  148 object
    Embarked:    4 object


In [39]:
indesejadas = ['PassengerId', 'Name', 'Ticket', 'Cabin']
Xdrop = X.drop(indesejadas,axis=1)
Xdrop.columns

Index(['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked'], dtype='object')

In [40]:
Xnum = Xdrop.select_dtypes('number')
Xnum.columns

Index(['Pclass', 'Age', 'SibSp', 'Parch', 'Fare'], dtype='object')

In [41]:
for column in Xnum.columns:
    print(f"{column:>12}: {sum(Xnum[column].isnull())}")

      Pclass: 0
         Age: 177
       SibSp: 0
       Parch: 0
        Fare: 0


In [42]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy='median')
XnumLimpo = imputer.fit_transform(Xnum)
XnumLimpo

array([[ 3.    , 22.    ,  1.    ,  0.    ,  7.25  ],
       [ 1.    , 38.    ,  1.    ,  0.    , 71.2833],
       [ 3.    , 26.    ,  0.    ,  0.    ,  7.925 ],
       ...,
       [ 3.    , 28.    ,  1.    ,  2.    , 23.45  ],
       [ 1.    , 26.    ,  0.    ,  0.    , 30.    ],
       [ 3.    , 32.    ,  0.    ,  0.    ,  7.75  ]])

In [43]:
Xcat = Xdrop.select_dtypes('object')
Xcat.columns

Index(['Sex', 'Embarked'], dtype='object')

In [44]:
for column in Xcat.columns:
    print(f"{column:>12}: {sum(Xcat[column].isnull())}")

         Sex: 0
    Embarked: 2


In [45]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy='most_frequent')
print(Xcat.shape)
XcatLimpo = imputer.fit_transform(Xcat)
XcatLimpo.shape

(891, 2)


(891, 2)

In [46]:
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder()
XcatHot = encoder.fit_transform(XcatLimpo)
XcatHot.shape

(891, 5)

In [47]:
import numpy as np
Xtratado = np.c_[XnumLimpo,XcatHot.toarray()]
Xtratado.shape

(891, 10)

In [48]:
import pandas as pd
train = pd.read_csv('aula5_titanic/train.csv')
test = pd.read_csv('aula5_titanic/test.csv')
train.columns.isin(test.columns)

array([ True, False,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True])

In [49]:
train.columns[~train.columns.isin(test.columns)]

Index(['Survived'], dtype='object')

In [50]:
from sklearn.base import BaseEstimator, TransformerMixin
class AtributosDesejados(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        self.colunasIndesejadas = ['PassengerId', 'Name', 'Ticket', 'Cabin']
        return self
    def transform(self, X, y=None):
        return X.drop(self.colunasIndesejadas,axis=1)

atributosDesejados = AtributosDesejados()
Xdrop = atributosDesejados.fit_transform(X)
Xdrop.columns

Index(['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked'], dtype='object')

In [51]:
from sklearn.base import BaseEstimator, TransformerMixin
class AtributosNumericos(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        self.colunasNumericas = X.select_dtypes(include='number').columns
        return self
    def transform(self, X, y=None):
        return X[self.colunasNumericas]

atributosNumericos = AtributosNumericos()
Xnum = atributosNumericos.fit_transform(Xdrop)
Xnum.columns

Index(['Pclass', 'Age', 'SibSp', 'Parch', 'Fare'], dtype='object')

In [52]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

pipenum = Pipeline([
    ('atributos_numericos', AtributosNumericos()),
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

XnumLimpo = pipenum.fit_transform(Xnum)
XnumLimpo

array([[ 0.82737724, -0.56573646,  0.43279337, -0.47367361, -0.50244517],
       [-1.56610693,  0.66386103,  0.43279337, -0.47367361,  0.78684529],
       [ 0.82737724, -0.25833709, -0.4745452 , -0.47367361, -0.48885426],
       ...,
       [ 0.82737724, -0.1046374 ,  0.43279337,  2.00893337, -0.17626324],
       [-1.56610693, -0.25833709, -0.4745452 , -0.47367361, -0.04438104],
       [ 0.82737724,  0.20276197, -0.4745452 , -0.47367361, -0.49237783]])

In [53]:
from sklearn.base import BaseEstimator, TransformerMixin
class AtributosCategoricos(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        self.colunasCategoricas = X.select_dtypes(include='object').columns
        return self
    def transform(self, X, y=None):
        return X[self.colunasCategoricas]

atributosCategoricos = AtributosCategoricos()
Xcat = atributosCategoricos.fit_transform(Xdrop)
Xcat.columns

Index(['Sex', 'Embarked'], dtype='object')

In [54]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

pipecat = Pipeline([
    ('atributos_categoricos', AtributosCategoricos()),
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder())
])

XcatLimpo = pipecat.fit_transform(Xdrop)
XcatLimpo

<891x5 sparse matrix of type '<class 'numpy.float64'>'
	with 1782 stored elements in Compressed Sparse Row format>

In [55]:
from sklearn.pipeline import FeatureUnion
unecaracteristicas = FeatureUnion([
    ('pipenum', pipenum),
    ('pipecat', pipecat)
])
Xtratado = unecaracteristicas.fit_transform(Xdrop)
Xtratado.shape

(891, 10)

In [56]:
from sklearn.pipeline import Pipeline

preproc = Pipeline([
    ('atributos_desejados', AtributosDesejados()),
    ('unecaracteristicas', unecaracteristicas)
])

Xtratado = preproc.fit_transform(X)
Xtratado.shape

(891, 10)

In [57]:
from sklearn.tree import DecisionTreeClassifier

pipetotal = Pipeline([
    ('preproc', preproc),
    ('arvore', DecisionTreeClassifier())
])

In [58]:
from sklearn.metrics import accuracy_score

print(X.shape)
pipetotal.fit(X,y)
ypred = pipetotal.predict(X)
accuracy_score(y, ypred)

(891, 11)


0.9797979797979798

In [59]:
from sklearn.model_selection import cross_validate
import numpy as np

print(X.shape)
scores = cross_validate(pipetotal, X, y)
scores, np.mean(scores['test_score'])

(891, 11)


({'fit_time': array([0.04497051, 0.02598405, 0.02898097, 0.02498627, 0.02498317]),
  'score_time': array([0.00699711, 0.00799799, 0.01199269, 0.00699544, 0.00799561]),
  'test_score': array([0.74860335, 0.79213483, 0.79775281, 0.73033708, 0.80337079])},
 0.7744397715146569)

In [60]:
from sklearn.model_selection import GridSearchCV

parametros = {
    'arvore__max_depth': [None] + list(range(1,20,2)),
    'arvore__criterion': ['gini', 'entropy']
}
modelo = GridSearchCV(pipetotal, param_grid=parametros)

scores = cross_validate(modelo, X, y)
scores, np.mean(scores['test_score'])

({'fit_time': array([3.52982044, 3.36392546, 3.35093045, 3.33893919, 3.3519311 ]),
  'score_time': array([0.00699592, 0.00799632, 0.01199532, 0.00799537, 0.00699639]),
  'test_score': array([0.82122905, 0.79775281, 0.81460674, 0.78089888, 0.83146067])},
 0.809189630280585)

In [61]:
from sklearn.ensemble import RandomForestClassifier

clf = Pipeline([
    ('preproc', preproc),
    ('arvore', RandomForestClassifier())
])

print('shapes: ', X.shape, test.shape)
clf.fit(X,y)
y_pred = clf.predict(test) #aqui esta a resposta (y_pred)
print('shapes: ', y.shape, ypred.shape)
print(accuracy_score(y, ypred))

print(X.shape)

shapes:  (891, 11) (418, 11)
shapes:  (891,) (891,)
0.9797979797979798
(891, 11)


In [62]:
result = test[['PassengerId']]
result['Survived'] = y_pred
print(result)
#result.to_csv('videoaula_.csv',index=False)

     PassengerId  Survived
0            892         0
1            893         0
2            894         0
3            895         1
4            896         1
..           ...       ...
413         1305         0
414         1306         1
415         1307         0
416         1308         0
417         1309         1

[418 rows x 2 columns]


In [69]:
#Minha árvore

X_train = preproc.fit_transform(X)
X_train = X_train.toarray()
print(X_train.shape)

X_test = pd.read_csv('aula5_titanic/test.csv')
X_test = preproc.fit_transform(X_test)
X_test = X_test.toarray()
print(X_test.shape)

'''minha_arvore = MinhaArvore()
score_cv = cross_validate(minha_arvore, X_train, y)
print('MinhaArvore -> Acurácia com validação cruzada: ', np.mean(score_cv['test_score']))'''


modelo = RandomForestClassifier()
modelo.fit(X_train, y.to_numpy())
#score_cv = cross_validate(modelo, X_train, y.to_numpy())
ypred = modelo.predict(X_test)


result = test[['PassengerId']]
result['Survived'] = ypred.astype(int)
print(result)
result.to_csv('submit_titanic.csv',index=False)

(891, 10)
(418, 10)
     PassengerId  Survived
0            892         0
1            893         0
2            894         0
3            895         1
4            896         1
..           ...       ...
413         1305         0
414         1306         1
415         1307         0
416         1308         0
417         1309         1

[418 rows x 2 columns]
