In [1]:
import pandas as pd 
import numpy as np

In [2]:
from sklearn.model_selection import train_test_split

In [3]:
from sklearn.preprocessing import StandardScaler

In [4]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import accuracy_score,classification_report, precision_recall_curve, confusion_matrix

In [5]:
from sklearn.neighbors import KNeighborsClassifier

In [6]:
train = pd.read_csv("Processed_train.csv")
test = pd.read_csv("Processed_test.csv")

In [7]:
X = train.drop(["Survived"],axis=1)
y = train["Survived"]

In [8]:
# 划分测试集
x_train,x_test,y_train,y_test = train_test_split(X,y,test_size=.33,random_state = 0)

In [9]:
sc = StandardScaler()

In [10]:
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)
test = sc.transform(test)

In [18]:
# Logistic Regression
logreg = LogisticRegression()
logreg.fit(x_train,y_train)
y_pred = logreg.predict(x_test)

In [67]:
accurcies = cross_val_score(estimator=logreg,X = x_train,y = y_train,cv=10)
logreg_accy = accurcies.mean()
print(logreg_accy)
round(logreg_accy,3)

0.815637677133


0.81599999999999995

In [12]:
from sklearn.model_selection import GridSearchCV

In [54]:
C_vals = [0.099,0.1,0.15,0.2,0.25,0.5,12,13,14,15,16,16.5,17,17.5,18]
penalties = ['l1','l2']
param = {
    "penalty":penalties,
    "C":C_vals
}
grid_search = GridSearchCV(estimator=logreg,param_grid=param,scoring="accuracy",cv=10)

In [55]:
grid_search = grid_search.fit(x_train,y_train)

In [56]:
grid_search.best_params_

{'C': 0.2, 'penalty': 'l1'}

In [57]:
grid_search.best_score_

0.81711409395973156

In [58]:
logreg_grid = grid_search.best_estimator_

In [60]:
logreg_accy = logreg_grid.score(x_test,y_test)
logreg_accy

0.80338983050847457

In [64]:
print(classification_report(y_test,y_pred,labels=logreg_grid.classes_))

             precision    recall  f1-score   support

          0       0.85      0.83      0.84       184
          1       0.73      0.77      0.75       111

avg / total       0.81      0.80      0.80       295



In [65]:
print(confusion_matrix(y_pred,y_test))

[[152  26]
 [ 32  85]]


In [68]:
# knn
nn_scores = []
best_prediction = [-1,-1]
for i in range(1,100):
    knn = KNeighborsClassifier(n_neighbors=i,weights="distance")
    knn.fit(x_train,y_train)
    score = accuracy_score(y_test,knn.predict(x_test))
    
    if score>best_prediction[1]:
        best_prediction = [i,score]
    nn_scores.append(score)
print(best_prediction)

[5, 0.83389830508474572]


In [13]:
# 网格
n_neighbors=[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22]
weights=['uniform','distance']
leaf_size = range(1,50,5)
param = {'n_neighbors':n_neighbors, 
         'weights':weights,
         "leaf_size":leaf_size
        }
grid2 = GridSearchCV(KNeighborsClassifier(), 
                     param,
                     verbose=True, 
                     cv=StratifiedKFold(n_splits=5, random_state=15, shuffle=True),
                     scoring = "roc_auc"
                    )
grid2.fit(x_train, y_train)

Fitting 5 folds for each of 440 candidates, totalling 2200 fits


[Parallel(n_jobs=1)]: Done 2200 out of 2200 | elapsed:   41.5s finished


GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=15, shuffle=True),
       error_score='raise',
       estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22], 'weights': ['uniform', 'distance'], 'leaf_size': range(1, 50, 5)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=True)

In [14]:
grid2.best_params_

{'leaf_size': 16, 'n_neighbors': 10, 'weights': 'uniform'}

In [15]:
grid2.best_score_

0.84922364166739739

In [16]:
knn_grid = grid2.best_estimator_

In [17]:
knn_accy = knn_grid.score(x_test,y_test)
knn_accy

0.83728813559322035

In [74]:
# 贝叶斯
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

gaussian = GaussianNB()
gaussian.fit(x_train, y_train)
y_pred = gaussian.predict(x_test)
gaussian_accy = round(accuracy_score(y_pred, y_test), 3)
print(gaussian_accy)

0.783


In [75]:
from sklearn.svm import SVC

svc = SVC(kernel = 'rbf', probability=True, random_state = 1, C = 3)
svc.fit(x_train, y_train)
y_pred = svc.predict(x_test)
svc_accy = round(accuracy_score(y_pred, y_test), 3)
print(svc_accy)

0.831


In [76]:
from sklearn.tree import DecisionTreeClassifier

dectree = DecisionTreeClassifier()
dectree.fit(x_train, y_train)
y_pred = dectree.predict(x_test)
dectree_accy = round(accuracy_score(y_pred, y_test), 3)
print(dectree_accy)

0.803


In [81]:
max_depth = list(range(1,30))
max_feature = ['auto']
criterion=["entropy", "gini"]

param = {'max_depth':max_depth, 
         'max_features':max_feature, 
         'criterion': criterion}
decisiontree_grid = GridSearchCV(dectree, 
                                param_grid = param, 
                                 verbose=False, 
                                 cv=StratifiedKFold(n_splits=20, random_state=15, shuffle=True))
decisiontree_grid.fit(x_train, y_train) 

GridSearchCV(cv=StratifiedKFold(n_splits=20, random_state=15, shuffle=True),
       error_score='raise',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'criterion': ['entropy', 'gini'], 'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29], 'max_features': ['auto']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=False)

In [82]:
print( decisiontree_grid.best_params_)
print (decisiontree_grid.best_score_)

{'criterion': 'entropy', 'max_depth': 5, 'max_features': 'auto'}
0.81711409396


In [83]:
decisiontree_grid = decisiontree_grid.best_estimator_

In [84]:
decisiontree_grid.score(x_test, y_test)

0.79661016949152541

In [85]:
from sklearn.ensemble import BaggingClassifier
BaggingClassifier = BaggingClassifier()
BaggingClassifier.fit(x_train, y_train)
y_pred = BaggingClassifier.predict(x_test)
bagging_accy = round(accuracy_score(y_pred, y_test), 3)
print(bagging_accy)

0.817


In [86]:
from sklearn.ensemble import RandomForestClassifier
randomforest = RandomForestClassifier(n_estimators=100,max_depth=9,min_samples_split=6, min_samples_leaf=4)
#randomforest = RandomForestClassifier(class_weight='balanced', n_jobs=-1)
randomforest.fit(x_train, y_train)
y_pred = randomforest.predict(x_test)
random_accy = round(accuracy_score(y_pred, y_test), 3)
print (random_accy)

0.814


In [95]:
from sklearn.ensemble import GradientBoostingClassifier

gradient = GradientBoostingClassifier()
gradient.fit(x_train, y_train)
y_pred = gradient.predict(x_test)
gradient_accy = round(accuracy_score(y_pred, y_test), 3)
print(gradient_accy)

0.814


In [96]:
from xgboost import XGBClassifier
XGBClassifier = XGBClassifier()
XGBClassifier.fit(x_train, y_train)
y_pred = XGBClassifier.predict(x_test)
XGBClassifier_accy = round(accuracy_score(y_pred, y_test), 3)
print(XGBClassifier_accy)

0.82


In [97]:
from sklearn.ensemble import AdaBoostClassifier
adaboost = AdaBoostClassifier()
adaboost.fit(x_train, y_train)
y_pred = adaboost.predict(x_test)
adaboost_accy = round(accuracy_score(y_pred, y_test), 3)
print(adaboost_accy)

0.814


In [98]:
from sklearn.ensemble import ExtraTreesClassifier
ExtraTreesClassifier = ExtraTreesClassifier()
ExtraTreesClassifier.fit(x_train, y_train)
y_pred = ExtraTreesClassifier.predict(x_test)
extraTree_accy = round(accuracy_score(y_pred, y_test), 3)
print(extraTree_accy)

0.8


In [99]:
from sklearn.gaussian_process import GaussianProcessClassifier
GaussianProcessClassifier = GaussianProcessClassifier()
GaussianProcessClassifier.fit(x_train, y_train)
y_pred = GaussianProcessClassifier.predict(x_test)
gau_pro_accy = round(accuracy_score(y_pred, y_test), 3)
print(gau_pro_accy)

0.82


In [100]:
from sklearn.ensemble import VotingClassifier

voting_classifier = VotingClassifier(estimators=[
    ('logreg_grid', logreg_grid),
    ('logreg',logreg), 
    ('svc', svc),
    ('random_forest', randomforest),
    ('gradient_boosting', gradient),
    ('decision_tree',dectree), 
    ('decision_tree_grid',decisiontree_grid), 
    ('knn',knn),
    ('knn_grid', knn_grid),
    ('XGB Classifier', XGBClassifier),
    ('BaggingClassifier', BaggingClassifier),
    ('ExtraTreesClassifier', ExtraTreesClassifier),
    ('gaussian',gaussian),
    ('gaussian process classifier', GaussianProcessClassifier)], voting='soft')

voting_classifier = voting_classifier.fit(x_train,y_train)

In [101]:
y_pred = voting_classifier.predict(x_test)
voting_accy = round(accuracy_score(y_pred, y_test), 3)
print(voting_accy)

0.841


In [102]:
models = pd.DataFrame({
    'Model': ['Support Vector Machines', 'KNN', 'Logistic Regression', 
              'Random Forest', 'Naive Bayes', 
              'Decision Tree', 'Gradient Boosting Classifier', 'Voting Classifier', 'XGB Classifier','ExtraTrees Classifier','Bagging Classifier'],
    'Score': [svc_accy, knn_accy, logreg_accy, 
              random_accy, gaussian_accy, dectree_accy,
               gradient_accy, voting_accy, XGBClassifier_accy, extraTree_accy, bagging_accy]})
models.sort_values(by='Score', ascending=False)

Unnamed: 0,Model,Score
7,Voting Classifier,0.841
1,KNN,0.833898
0,Support Vector Machines,0.831
8,XGB Classifier,0.82
10,Bagging Classifier,0.817
2,Logistic Regression,0.815638
3,Random Forest,0.814
6,Gradient Boosting Classifier,0.814
5,Decision Tree,0.803
9,ExtraTrees Classifier,0.8


In [103]:

submission = pd.DataFrame({
        "PassengerId": passengerid,
        "Survived": test_prediction
    })

submission.PassengerId = submission.PassengerId.astype(int)
submission.Survived = submission.Survived.astype(int)

submission.to_csv("titanic1_submission.csv", index=False)

NameError: name 'passengerid' is not defined

In [106]:
test

array([[ 0.8470924 ,  0.72408333,  0.7893356 , ..., -0.54649926,
        -1.24378798, -0.70033994],
       [ 0.8470924 , -1.38105651,  0.7893356 , ...,  0.04463243,
        -1.24378798,  1.27787146],
       [-0.34363182,  0.72408333,  2.00182018, ..., -0.54649926,
         0.80399555, -0.70033994],
       ..., 
       [ 0.8470924 ,  0.72408333,  0.7893356 , ..., -0.54649926,
         0.80399555, -0.70033994],
       [ 0.8470924 ,  0.72408333, -0.42314898, ..., -0.54649926,
        -1.24378798, -0.70033994],
       [ 0.8470924 ,  0.72408333, -0.42314898, ...,  0.63576411,
        -1.24378798,  2.26697716]])

In [107]:
su = pd.read_csv("gender_submission.csv")

In [109]:
test_prediction = voting_classifier.predict(test)
su["Survived"] = test_prediction

In [110]:
su.to_csv("titanic1_submission.csv", index=False)

In [20]:
test_prediction1 = knn_grid.predict(test)

In [21]:
test_prediction1

array([0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1,
       0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0,
       1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0,
       1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1,
       1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0,
       1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0,
       0, 0,

In [23]:
su = pd.read_csv("gender_submission.csv")
su["Survived"] = test_prediction1
su.to_csv("titanic1_submission1.csv", index=False)