In [1]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import cross_val_score
from sklearn import metrics
from tpot import TPOTClassifier

In [2]:
# Load the training and test sets from the CSV files in the working directory.
train_data, test_data = (
    pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv')
)

In [3]:
# Fill missing ages with the mean age of the respective data set.
# The filled Series is assigned back to the column instead of calling
# `fillna(..., inplace=True)` on `df['Age']`: that chained in-place form works
# on an intermediate Series, raises a FutureWarning on recent pandas 2.x and
# no longer updates the DataFrame once Copy-on-Write is active.
train_data['Age'] = train_data['Age'].fillna(train_data['Age'].mean())
test_data['Age'] = test_data['Age'].fillna(test_data['Age'].mean())

In [4]:
# Fill missing fares with the mean fare of the respective data set.
# The filled Series is assigned back to the column instead of calling
# `fillna(..., inplace=True)` on `df['Fare']`: that chained in-place form works
# on an intermediate Series, raises a FutureWarning on recent pandas 2.x and
# no longer updates the DataFrame once Copy-on-Write is active.
train_data['Fare'] = train_data['Fare'].fillna(train_data['Fare'].mean())
test_data['Fare'] = test_data['Fare'].fillna(test_data['Fare'].mean())

In [5]:
print(train_data['Embarked'].value_counts())
# Fill missing embarkation ports with 'S', the most frequent port in the counts
# printed above, then encode the ports as integers (S=1, C=2, Q=3).
# The column is rebuilt and assigned back rather than edited through chained
# `fillna`/`replace(..., inplace=True)` calls, which act on an intermediate
# Series and stop updating the DataFrame once Copy-on-Write is active.
embarked_codes = {'S': 1, 'C': 2, 'Q': 3}
for frame in (train_data, test_data):
    frame['Embarked'] = (
        frame['Embarked']
        .fillna('S')
        # Values without a code (including already-encoded integers when the
        # cell is re-run) pass through unchanged, as they did with `replace`.
        .map(lambda port: embarked_codes.get(port, port))
    )

S    644
C    168
Q     77
Name: Embarked, dtype: int64


In [6]:
# Feature selection
features = ['Pclass','Sex','Age','SibSp','Parch','Fare','Embarked']
# Encode sex as an integer (female=1, male=0) in both data sets.
# The column is rebuilt and assigned back rather than edited through chained
# `replace(..., inplace=True)` calls, which act on an intermediate Series and
# stop updating the DataFrame once Copy-on-Write is active.
sex_codes = {'female': 1, 'male': 0}
for frame in (train_data, test_data):
    # Values without a code (including already-encoded integers when the cell
    # is re-run) pass through unchanged, as they did with `replace`.
    frame['Sex'] = frame['Sex'].map(lambda sex: sex_codes.get(sex, sex))
train_features = train_data[features]
train_labels = train_data['Survived']
test_features = test_data[features]
print('特征值')
print(train_features)

特征值
     Pclass  Sex        Age  SibSp  Parch     Fare  Embarked
0         3    0  22.000000      1      0   7.2500         1
1         1    1  38.000000      1      0  71.2833         2
2         3    1  26.000000      0      0   7.9250         1
3         1    1  35.000000      1      0  53.1000         1
4         3    0  35.000000      0      0   8.0500         1
..      ...  ...        ...    ...    ...      ...       ...
886       2    0  27.000000      0      0  13.0000         1
887       1    1  19.000000      0      0  30.0000         1
888       3    1  29.699118      1      2  23.4500         1
889       1    0  26.000000      0      0  30.0000         2
890       3    0  32.000000      0      0   7.7500         3

[891 rows x 7 columns]


In [7]:
# Run TPOT's pipeline search on the training data, then predict survival for
# the test passengers with the fitted optimizer.
# random_state pins the otherwise random search so a fresh
# "Restart & Run All" gives the same pipeline and predictions.
tpot = TPOTClassifier(generations=5,population_size=20,verbosity=2,random_state=42)
tpot.fit(train_features.astype(np.float64),train_labels.astype(np.float64))
# Cast the test features exactly like the training features so that fit and
# predict receive the same dtype.
predict_y = tpot.predict(test_features.astype(np.float64))
print(predict_y)

Optimization Progress:  33%|███▎      | 40/120 [00:13<00:41,  1.92pipeline/s]Generation 1 - Current best internal CV score: 0.824938798568828
Optimization Progress:  49%|████▉     | 59/120 [00:20<00:42,  1.44pipeline/s]Generation 2 - Current best internal CV score: 0.835044881049526
Optimization Progress:  67%|██████▋   | 80/120 [00:32<00:23,  1.69pipeline/s]Generation 3 - Current best internal CV score: 0.835044881049526
Optimization Progress:  83%|████████▎ | 100/120 [00:46<00:10,  1.86pipeline/s]Generation 4 - Current best internal CV score: 0.835044881049526
Optimization Progress: 100%|██████████| 120/120 [00:59<00:00,  1.58pipeline/s]Generation 5 - Current best internal CV score: 0.835044881049526
Optimization Progress: 100%|██████████| 120/120 [00:59<00:00,  1.58pipeline/s]
Best pipeline:GradientBoostingClassifier(input_matrix, learning_rate=0.1, max_depth=10, max_features=0.5, min_samples_leaf=12, min_samples_split=5, n_estimators=100, subsample=0.7500000000000001)
[0. 0. 0. 0. 

In [36]:
# Attach the predictions to the test rows as a new 'Survived' column.
# The previous `test_data.append(...)` stacked the predictions underneath the
# test rows as extra rows instead of adding a column, so no prediction sat on
# the row of its passenger; `DataFrame.append` was also removed in pandas 2.0.
# The model was fit on float labels and prints float predictions, so they are
# cast back to integers before being stored.
result = test_data.assign(Survived=np.asarray(predict_y).astype(int))
# index=False keeps the positional row index out of the CSV.
result.to_csv('result.csv', index=False)