# Decision Tree (CART)  
dataset description: https://archive.ics.uci.edu/ml/datasets/abalone

In [42]:
from sklearn.tree import DecisionTreeClassifier
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn import neighbors
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix

# load csv file
df = pd.read_csv('./dataset/abalone.csv', header=None)
df[0] = pd.Categorical(df[0]).codes
df[8] = df[8].apply(lambda x: 0 if x> 8 else 1)


X = df[[0, 1, 2, 3, 4, 5, 6,7]]
y = df[8]


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

scaler = preprocessing.StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)

model = DecisionTreeClassifier()
model.fit(X_train, y_train)

X_test = scaler.transform(X_test)
y_pred = model.predict(X_test)


accuracy = accuracy_score(y_test, y_pred)
num_correct_samples = accuracy_score(y_test, y_pred, normalize=False)
con_matrix = confusion_matrix(y_test, y_pred)

print('number of correct sample: {}'.format(num_correct_samples))
print('accuracy: {}'.format(accuracy))
print('con_matrix: {}'.format(con_matrix))



number of correct sample: 645
accuracy: 0.7715311004784688
con_matrix: [[443  91]
 [100 202]]


# Naive Bayes
dataset description: https://www.kaggle.com/c/titanic/data


In [36]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB

# Importing dataset
df = pd.read_csv("./dataset/titanic/train.csv")

df['Sex'] = pd.Categorical(df['Sex']).codes
df['Embarked'] = pd.Categorical(df['Embarked']).codes


df=df[["Survived", "Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]].dropna(axis=0, how='any')

X_train, X_test = train_test_split(df, test_size=0.5)

gnb = GaussianNB()
used_features =[
    "Pclass",
    "Sex",
    "Age",
    "SibSp",
    "Parch",
    "Fare",
    "Embarked"
]

# Train classifier
gnb.fit(
    X_train[used_features].values,
    X_train["Survived"]
)
y_pred = gnb.predict(X_test[used_features])

# Print results
print("Number of mislabeled points out of a total {} points : {}, performance {:05.2f}%"
      .format(
          X_test.shape[0],
          (X_test["Survived"] != y_pred).sum(),
          100*(1-(X_test["Survived"] != y_pred).sum()/X_test.shape[0])
))

accuracy = accuracy_score(X_test["Survived"], y_pred)
num_correct_samples = accuracy_score(X_test["Survived"], y_pred, normalize=False)

print('number of correct sample: {}'.format(num_correct_samples))
print('accuracy: {}'.format(accuracy))



Number of mislabeled points out of a total 357 points : 81, performance 77.31%
number of correct sample: 276
accuracy: 0.773109243697479
