In [33]:
from sklearn.preprocessing import LabelEncoder
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis, LinearDiscriminantAnalysis
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import SGDClassifier

## Preprocess

In [56]:
# pandas is not imported by any visible cell; import it here so this cell
# works on a fresh kernel (re-importing is a no-op if it is already loaded).
import pandas as pd

# Load the raw Titanic splits. Only `train` is processed below; `test` is
# loaded but not transformed by this cell.
train = pd.read_csv('../files/titanic_train.csv')
test = pd.read_csv('../files/titanic_test.csv')

# Drop Name, Ticket and Cabin before one-hot encoding (presumably
# high-cardinality text columns -- confirm against the CSV).
train = train.drop(['Name', 'Ticket', 'Cabin'], axis=1)

# One-hot encode the remaining non-numeric columns.
train = pd.get_dummies(train)

# Replace missing values with the mean of their column.
train = train.fillna(train.mean())

# Y: survival labels as an array; X: every column except the label and the id.
Y = train['Survived'].values
X = train.drop(['Survived', 'PassengerId'], axis=1)

## 2 QDA, LDA

In [26]:
def qda_titanic(X, y):
    """Fit quadratic discriminant analysis on (X, y) and score it in-sample.

    Parameters
    ----------
    X : feature matrix accepted by scikit-learn estimators.
    y : class labels, one per row of X. The report names the classes
        'dead' and 'alive' (assumes 0 = died, 1 = survived -- confirm).

    Returns
    -------
    tuple
        (classification report text, confusion matrix), both computed from
        predictions on the same X the model was fitted on.
    """
    # The estimator is built with defaults: the former store_covariances=True
    # keyword is rejected by current scikit-learn, and the stored covariances
    # were never read here, so predictions are unaffected by dropping it.
    qda = QuadraticDiscriminantAnalysis().fit(X, y)
    y_pred = qda.predict(X)
    labels = ['dead', 'alive']

    return classification_report(y, y_pred, target_names=labels), confusion_matrix(y, y_pred)

In [27]:
def lda_titanic(X, y):
    """Fit linear discriminant analysis on (X, y) and score it in-sample.

    Parameters
    ----------
    X : feature matrix accepted by scikit-learn estimators.
    y : class labels, one per row of X. The report names the classes
        'dead' and 'alive' (assumes 0 = died, 1 = survived -- confirm).

    Returns
    -------
    tuple
        (classification report text, confusion matrix), both computed from
        predictions on the same X the model was fitted on.
    """
    # n_components is left at its default: it only matters for transform(),
    # which is never called here, and a value of 2 is out of range for a
    # two-class target (the limit is n_classes - 1), which newer
    # scikit-learn versions reject with a ValueError.
    lda = LinearDiscriminantAnalysis(store_covariance=True).fit(X, y)
    y_pred = lda.predict(X)
    labels = ['dead', 'alive']

    return classification_report(y, y_pred, target_names=labels), confusion_matrix(y, y_pred)

## 3 Naive Bayes

In [28]:
def gaussian_nb(X, y):
    """Train a Gaussian naive Bayes classifier on (X, y) and evaluate it in-sample.

    Returns a 2-tuple: the text classification report (classes displayed as
    'dead' and 'alive') and the confusion matrix, both computed from
    predictions made on the same X used for fitting.
    """
    class_names = ['dead', 'alive']

    # fit() hands back the fitted estimator, which is then used to predict.
    model = GaussianNB().fit(X, y)
    predictions = model.predict(X)

    report = classification_report(y, predictions, target_names=class_names)
    matrix = confusion_matrix(y, predictions)
    return report, matrix

In [29]:
def multinomial_nb(X, y):
    """Train a multinomial naive Bayes classifier on (X, y) and evaluate it in-sample.

    Returns a 2-tuple: the text classification report (classes displayed as
    'dead' and 'alive') and the confusion matrix, both computed from
    predictions made on the same X used for fitting.
    """
    class_names = ['dead', 'alive']

    # fit() hands back the fitted estimator, which is then used to predict.
    model = MultinomialNB().fit(X, y)
    predictions = model.predict(X)

    report = classification_report(y, predictions, target_names=class_names)
    matrix = confusion_matrix(y, predictions)
    return report, matrix

In [30]:
def bernoulli_nb(X, y):
    """Train a Bernoulli naive Bayes classifier on (X, y) and evaluate it in-sample.

    Returns a 2-tuple: the text classification report (classes displayed as
    'dead' and 'alive') and the confusion matrix, both computed from
    predictions made on the same X used for fitting.
    """
    class_names = ['dead', 'alive']

    # fit() hands back the fitted estimator, which is then used to predict.
    model = BernoulliNB().fit(X, y)
    predictions = model.predict(X)

    report = classification_report(y, predictions, target_names=class_names)
    matrix = confusion_matrix(y, predictions)
    return report, matrix

## 4 Decision tree

In [43]:
def tree_titanic(X, y, depth=5):
    """Train an entropy-criterion decision tree on (X, y) and evaluate it in-sample.

    Parameters
    ----------
    X : feature matrix accepted by scikit-learn estimators.
    y : class labels, one per row of X.
    depth : maximum depth passed to the tree (default 5).

    Returns a 2-tuple: the text classification report (classes displayed as
    'dead' and 'alive') and the confusion matrix, both computed from
    predictions made on the same X used for fitting.
    """
    class_names = ['dead', 'alive']

    # random_state is pinned so repeated runs build the same tree.
    classifier = DecisionTreeClassifier(
        criterion='entropy',
        max_depth=depth,
        random_state=0,
    ).fit(X, y)
    predictions = classifier.predict(X)

    report = classification_report(y, predictions, target_names=class_names)
    matrix = confusion_matrix(y, predictions)
    return report, matrix

## 5 퍼셉트론 & 서포트벡터머신

In [62]:
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import KFold, train_test_split
from sklearn.metrics import make_scorer, mean_squared_error
from sklearn.ensemble import RandomForestClassifier, GradientBoostingRegressor, ExtraTreesRegressor
from sklearn.linear_model import Ridge, RidgeCV, ElasticNet, LassoCV, LassoLarsCV, Lasso
from sklearn.svm import SVR

In [65]:
def random_forest(X_train, y_train):
    """Grid-search a random forest classifier and print the best result.

    Runs a 4-fold cross-validated grid search, scored by accuracy, over
    n_estimators, max_features and max_depth, then prints the best score
    and the parameter combination that achieved it. Returns None.

    Parameters
    ----------
    X_train : training feature matrix.
    y_train : training class labels.
    """
    forest = RandomForestClassifier(random_state=0)
    param_grid = {
        'n_estimators': [500, 600, 700],
        'max_features': [2, 4, 6, 8],
        'max_depth': [3, 5, 7, 9],
    }
    model = GridSearchCV(estimator=forest, param_grid=param_grid, n_jobs=1, cv=4, scoring='accuracy')
    model.fit(X_train, y_train)

    # Single-argument print(...) emits the same text under Python 2's print
    # statement and Python 3's print function; print('') is the blank line.
    print('best score:')
    print(model.best_score_)
    print('')
    print('best params:')
    print(model.best_params_)
    
def gradient_boosting(X_train, X_test):
    """Grid-search a gradient boosting regressor and print the best result.

    Runs a 10-fold cross-validated grid search over max_features, max_depth
    and learning_rate (n_estimators and subsample are fixed), then prints
    the best score and the best parameter combination. Returns None.

    Parameters
    ----------
    X_train : training feature matrix.
    X_test : training targets. The parameter name is kept for backward
        compatibility, but this is the value the model is fitted against.
    """
    # NOTE(review): RMSE is not defined in any visible cell; presumably a
    # scorer built with make_scorer/mean_squared_error -- confirm it exists
    # before calling this function.
    booster = GradientBoostingRegressor(random_state=0)
    param_grid = {
        'n_estimators': [500],
        'max_features': [10, 15],
        'max_depth': [6, 8, 10],
        'learning_rate': [0.05, 0.1, 0.15],
        'subsample': [0.8],
    }
    model = GridSearchCV(estimator=booster, param_grid=param_grid, n_jobs=1, cv=10, scoring=RMSE)
    # Fit against the function's own second argument instead of a free
    # `y_train` name, which no visible cell defines.
    model.fit(X_train, X_test)

    # Single-argument print(...) behaves identically on Python 2 and 3.
    print('best score:')
    print(model.best_score_)
    print('')
    print('best params:')
    print(model.best_params_)

def extra_trees(X_train, y_train):
    """Grid-search an extra-trees regressor and print the best result.

    Runs a 4-fold cross-validated grid search over n_estimators and
    max_features using two parallel jobs, then prints the best score and
    the best parameter combination. Returns None.

    Parameters
    ----------
    X_train : training feature matrix.
    y_train : training targets.
    """
    # NOTE(review): RMSE is not defined in any visible cell; presumably a
    # scorer built with make_scorer/mean_squared_error -- confirm it exists
    # before calling this function.
    trees = ExtraTreesRegressor(random_state=0)
    param_grid = {'n_estimators': [500, 600, 700], 'max_features': [10, 15, 20]}
    model = GridSearchCV(estimator=trees, param_grid=param_grid, n_jobs=2, cv=4, scoring=RMSE)
    model.fit(X_train, y_train)

    # Single-argument print(...) behaves identically on Python 2 and 3.
    print('best score:')
    print(model.best_score_)
    print('')
    print('best params:')
    print(model.best_params_)

def elastic_net(X_train, X_test):
    """Grid-search an elastic-net regressor and print the best result.

    Runs a 5-fold cross-validated grid search over l1_ratio and alpha, then
    prints the best score followed by the best parameters. Returns None.

    Parameters
    ----------
    X_train : training feature matrix.
    X_test : training targets. The parameter name is kept for backward
        compatibility, but this is the value the model is fitted against.
    """
    # NOTE(review): RMSE is not defined in any visible cell; presumably a
    # scorer built with make_scorer/mean_squared_error -- confirm it exists
    # before calling this function.
    net = ElasticNet(random_state=0)
    param_grid = {
        'l1_ratio': [0.3, 0.4, 0.5, 0.7],
        'alpha': [0.0005, 0.001, 0.01, 0.1, 0.5, 1],
    }
    model = GridSearchCV(estimator=net, param_grid=param_grid, cv=5, scoring=RMSE)
    # Fit against the function's own second argument instead of a free
    # `y_train` name, which no visible cell defines.
    model.fit(X_train, X_test)

    # Single-argument print(...) behaves identically on Python 2 and 3.
    print(model.best_score_)
    print(model.best_params_)

def svr(X_train, X_test):
    """Grid-search a linear-kernel support vector regressor and print the best result.

    Runs a 5-fold cross-validated grid search over C, then prints the best
    score followed by the best parameters. Returns None.

    Parameters
    ----------
    X_train : training feature matrix.
    X_test : training targets. The parameter name is kept for backward
        compatibility, but this is the value the model is fitted against.
    """
    # NOTE(review): RMSE is not defined in any visible cell; presumably a
    # scorer built with make_scorer/mean_squared_error -- confirm it exists
    # before calling this function.
    # The kernel is passed by keyword (newer scikit-learn rejects positional
    # constructor arguments), and the local name no longer shadows this
    # function's own name.
    regressor = SVR(kernel='linear')
    param_grid = {'C': [0.1, 0.5, 1, 2, 5]}
    model = GridSearchCV(estimator=regressor, param_grid=param_grid, cv=5, n_jobs=1, scoring=RMSE)
    # Fit against the function's own second argument instead of a free
    # `y_train` name, which no visible cell defines.
    model.fit(X_train, X_test)

    # Single-argument print(...) behaves identically on Python 2 and 3.
    print(model.best_score_)
    print(model.best_params_)
    
def lasso(X_train, X_test):
    """Grid-search a lasso regressor and print the best result.

    Runs a 5-fold cross-validated grid search over alpha, then prints the
    best score followed by the best parameters. Returns None.

    Parameters
    ----------
    X_train : training feature matrix.
    X_test : training targets. Despite its name, this is the value the
        model is fitted against.
    """
    # NOTE(review): RMSE is not defined in any visible cell; presumably a
    # scorer built with make_scorer/mean_squared_error -- confirm it exists
    # before calling this function.
    regressor = Lasso(random_state=0)
    param_grid = {'alpha': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05]}
    model = GridSearchCV(estimator=regressor, param_grid=param_grid, cv=5, scoring=RMSE)
    # Previously spelled `moel.fit(...)`, which raised NameError.
    model.fit(X_train, X_test)

    # Single-argument print(...) behaves identically on Python 2 and 3.
    print(model.best_score_)
    print(model.best_params_)

In [66]:
# Run the random forest grid search on the preprocessed training data; the
# function prints the best cross-validated accuracy and parameters.
random_forest(X,Y)

best score:
0.838383838384

best params:
{'max_features': 8, 'n_estimators': 500, 'max_depth': 9}


In [35]:
# NOTE(review): `a` is not assigned by any visible earlier cell (this cell is
# In [35], the first visible assignment is In [37]); presumably the report
# returned by qda_titanic or lda_titanic -- confirm and add that call so the
# notebook survives Restart & Run All.
# print(a) with a single argument prints the same text on Python 2 and 3.
print(a)

             precision    recall  f1-score   support

       dead       0.63      1.00      0.77       549
      alive       0.88      0.04      0.08       342

avg / total       0.72      0.63      0.51       891



In [37]:
# Fit Gaussian naive Bayes on the training data; `a` receives the
# classification report text and `b` the confusion matrix.
a, b = gaussian_nb(X,Y)

In [38]:
# Show the classification report held in `a`; single-argument print(...)
# prints the same text on Python 2 and 3.
print(a)

             precision    recall  f1-score   support

       dead       0.83      0.83      0.83       549
      alive       0.72      0.73      0.73       342

avg / total       0.79      0.79      0.79       891



In [39]:
# Fit multinomial naive Bayes on the training data; `a` receives the
# classification report text and `b` the confusion matrix.
a, b = multinomial_nb(X,Y)

In [40]:
# Show the classification report held in `a`; single-argument print(...)
# prints the same text on Python 2 and 3.
print(a)

             precision    recall  f1-score   support

       dead       0.71      0.80      0.75       549
      alive       0.59      0.47      0.53       342

avg / total       0.66      0.67      0.66       891



In [41]:
# Fit Bernoulli naive Bayes on the training data; `a` receives the
# classification report text and `b` the confusion matrix.
a, b = bernoulli_nb(X,Y)

In [42]:
# Show the classification report held in `a`; single-argument print(...)
# prints the same text on Python 2 and 3.
print(a)

             precision    recall  f1-score   support

       dead       0.81      0.85      0.83       549
      alive       0.74      0.68      0.71       342

avg / total       0.78      0.79      0.78       891



In [44]:
# Fit the decision tree with its default depth of 5 and show the in-sample
# classification report; `b` holds the confusion matrix.
# Single-argument print(...) prints the same text on Python 2 and 3.
a, b = tree_titanic(X,Y)
print(a)

             precision    recall  f1-score   support

       dead       0.84      0.91      0.88       549
      alive       0.84      0.73      0.78       342

avg / total       0.84      0.84      0.84       891

