In [16]:
from sklearn.preprocessing import LabelEncoder
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis, LinearDiscriminantAnalysis
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import SGDClassifier

## 1 데이터 전처리

In [17]:
def preprocess(df):
    df = df.loc[:,:'embarked']
    
    le = LabelEncoder()
    sex0 = le.fit_transform(df.sex)
    embarked0 = le.fit_transform(df.embarked)
    df['sex']=sex0
    df['embarked']=embarked0
    
    df = df.fillna(df.median())
    
    X = df.loc[:,'pclass':]
    y = df.survived
    return X, y

## 2 QDA, LDA

In [28]:
def qda_titanic(X, y):
    qda = QuadraticDiscriminantAnalysis(store_covariances=True).fit(X, y)
    y_pred = qda.predict(X)
    labels = ['dead', 'alive']

    return classification_report(y, y_pred, target_names=labels), confusion_matrix(y, y_pred)

In [19]:
def lda_titanic(X, y):
    lda = LinearDiscriminantAnalysis(n_components=2, store_covariance=True).fit(X,y)
    y_pred = lda.predict(X)
    labels = ['dead','alive']
    
    return classification_report(y, y_pred, target_names=labels), confusion_matrix(y, y_pred)    

## 3 Naive bayes

In [37]:
def gaussian_nb(X,y):
    clf = GaussianNB().fit(X,y)
    y_pred = clf.predict(X)
    labels = ['dead','alive']
    
    return classification_report(y, y_pred, target_names=labels), confusion_matrix(y, y_pred)        

In [21]:
def multinomial_nb(X,y):
    clf = MultinomialNB().fit(X,y)
    y_pred = clf.predict(X)
    labels = ['dead','alive']
    
    return classification_report(y, y_pred, target_names=labels), confusion_matrix(y, y_pred)        

In [22]:
def bernoulli_nb(X,y):
    clf = BernoulliNB().fit(X,y)
    y_pred = clf.predict(X)
    labels = ['dead','alive']
    
    return classification_report(y, y_pred, target_names=labels), confusion_matrix(y, y_pred)        

## 4 Decision tree

In [23]:
def tree_titanic(X,y,depth=5):
    tree = DecisionTreeClassifier(criterion='entropy', max_depth=depth, random_state=0).fit(X,y)
    y_pred = tree.predict(X)
    labels = ['dead','alive']
    
    return classification_report(y, y_pred, target_names=labels), confusion_matrix(y, y_pred)        

## 5 퍼셉트론 & 서포트벡터머신

In [25]:
df = pd.read_csv('https://raw.githubusercontent.com/datascienceschool/docker_rpython/master/data/titanic.csv')
X,y = preprocess(df)

In [29]:
a, b = qda_titanic(X,y)

In [30]:
print a

             precision    recall  f1-score   support

       dead       0.83      0.86      0.85       549
      alive       0.77      0.73      0.75       342

avg / total       0.81      0.81      0.81       891



In [31]:
a, b = gaussian_nb(X,y)

In [32]:
print a

             precision    recall  f1-score   support

       dead       0.83      0.83      0.83       549
      alive       0.73      0.73      0.73       342

avg / total       0.79      0.79      0.79       891



In [33]:
a, b = multinomial_nb(X,y)

In [34]:
print a

             precision    recall  f1-score   support

       dead       0.71      0.83      0.76       549
      alive       0.62      0.46      0.53       342

avg / total       0.68      0.69      0.68       891



In [35]:
a, b = bernoulli_nb(X,y)

In [36]:
print a

             precision    recall  f1-score   support

       dead       0.81      0.85      0.83       549
      alive       0.74      0.68      0.71       342

avg / total       0.78      0.79      0.78       891

