In [None]:
"""
需求:泰坦尼克号生存预测
流程分析:
    1. 获取数据
    2. 数据处理
        缺失值处理
        特征值-->字典类型
        特征值    目标值
    3. 数据划分
    4. 特征工程 --> 字典特征抽取
    5. 决策树预估流程
    6. 模型评估
"""

In [2]:
import pandas as pd 

In [3]:
# 1. 获取数据
train = pd.read_csv('./titanic/train.csv')
test = pd.read_csv('./titanic/test.csv')

In [19]:
# 合并数据集, 一起进行清洗,之后在重新划分
data = pd.concat([train, test], axis=0, ignore_index=True)

In [21]:
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1.0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0.0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [22]:
# 对数据进行描述性统计分析
data.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,1309.0,891.0,1309.0,1046.0,1309.0,1309.0,1308.0
mean,655.0,0.383838,2.294882,29.881138,0.498854,0.385027,33.295479
std,378.020061,0.486592,0.837836,14.413493,1.041658,0.86556,51.758668
min,1.0,0.0,1.0,0.17,0.0,0.0,0.0
25%,328.0,0.0,2.0,21.0,0.0,0.0,7.8958
50%,655.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,982.0,1.0,3.0,39.0,1.0,0.0,31.275
max,1309.0,1.0,3.0,80.0,8.0,9.0,512.3292


In [23]:
# 查看每一列的数据类型和数据总数
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  1309 non-null   int64  
 1   Survived     891 non-null    float64
 2   Pclass       1309 non-null   int64  
 3   Name         1309 non-null   object 
 4   Sex          1309 non-null   object 
 5   Age          1046 non-null   float64
 6   SibSp        1309 non-null   int64  
 7   Parch        1309 non-null   int64  
 8   Ticket       1309 non-null   object 
 9   Fare         1308 non-null   float64
 10  Cabin        295 non-null    object 
 11  Embarked     1307 non-null   object 
dtypes: float64(3), int64(4), object(5)
memory usage: 122.8+ KB


In [144]:
# 选取特征值和目标值
x = data[['Pclass','Sex', 'Age']]
data['Survived'].fillna(0, inplace=True)
y = data['Survived']

In [145]:
x

Unnamed: 0,Pclass,Sex,Age
0,3,male,22.0
1,1,female,38.0
2,3,female,26.0
3,1,female,35.0
4,3,male,35.0
...,...,...,...
1304,3,male,
1305,1,female,39.0
1306,3,male,38.5
1307,3,male,


In [146]:
# 补足缺失值, 数值类型的话用平均值补足
x['Age'].fillna(x['Age'].mean(), inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x['Age'].fillna(x['Age'].mean(), inplace=True)


In [147]:
# 转换成字典
def to_age(age) :
    if(age <= 6):               # 童年
        return '0' 
    elif(age > 6 and age <= 17):  # 少年
        return '1'
    elif(age > 17 and age <= 40): # 青年
        return '2'
    elif(age > 40 and age <= 65): # 中年
        return '3'
    else :                      # 老年
        return '4'


x['Age'] = x['Age'].apply(lambda t: to_age(int(t)))
x['Pclass'] = x['Pclass'].apply(lambda t: str(t)) # 字典特征必须是字符串
x = x.to_dict(orient='records')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x['Age'] = x['Age'].apply(lambda t: to_age(int(t)))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x['Pclass'] = x['Pclass'].apply(lambda t: str(t)) # 字典特征必须是字符串


In [148]:
x

[{'Pclass': '3', 'Sex': 'male', 'Age': '2'},
 {'Pclass': '1', 'Sex': 'female', 'Age': '2'},
 {'Pclass': '3', 'Sex': 'female', 'Age': '2'},
 {'Pclass': '1', 'Sex': 'female', 'Age': '2'},
 {'Pclass': '3', 'Sex': 'male', 'Age': '2'},
 {'Pclass': '3', 'Sex': 'male', 'Age': '2'},
 {'Pclass': '1', 'Sex': 'male', 'Age': '3'},
 {'Pclass': '3', 'Sex': 'male', 'Age': '0'},
 {'Pclass': '3', 'Sex': 'female', 'Age': '2'},
 {'Pclass': '2', 'Sex': 'female', 'Age': '1'},
 {'Pclass': '3', 'Sex': 'female', 'Age': '0'},
 {'Pclass': '1', 'Sex': 'female', 'Age': '3'},
 {'Pclass': '3', 'Sex': 'male', 'Age': '2'},
 {'Pclass': '3', 'Sex': 'male', 'Age': '2'},
 {'Pclass': '3', 'Sex': 'female', 'Age': '1'},
 {'Pclass': '2', 'Sex': 'female', 'Age': '3'},
 {'Pclass': '3', 'Sex': 'male', 'Age': '0'},
 {'Pclass': '2', 'Sex': 'male', 'Age': '2'},
 {'Pclass': '3', 'Sex': 'female', 'Age': '2'},
 {'Pclass': '3', 'Sex': 'female', 'Age': '2'},
 {'Pclass': '2', 'Sex': 'male', 'Age': '2'},
 {'Pclass': '2', 'Sex': 'male', '

In [149]:
# 数据集划分
from sklearn.model_selection import train_test_split

In [150]:
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=23)

In [151]:
from sklearn.feature_extraction import DictVectorizer

In [152]:
# 特征工程, 字典特征抽取
transfer = DictVectorizer(sparse=False)
x_train = transfer.fit_transform(x_train)
x_test = transfer.transform(x_test)

In [153]:
transfer.get_feature_names_out()

array(['Age=0', 'Age=1', 'Age=2', 'Age=3', 'Age=4', 'Pclass=1',
       'Pclass=2', 'Pclass=3', 'Sex=female', 'Sex=male'], dtype=object)

In [154]:
from sklearn.tree import DecisionTreeClassifier, export_graphviz

In [155]:
# 决策树算法预估
estimator = DecisionTreeClassifier(criterion='entropy' ,max_depth=8)
estimator.fit(x_train, y_train)

In [156]:
# 4) 模型评估
# 方法一: 直接对比真实值和预测值
y_predict = estimator.predict(x_test)
print('y_predict: \n', y_predict)
print('直接对比真实值和预测值: \n', y_test == y_predict)

# 方法二: 计算准确率
score = estimator.score(x_test, y_test)
print('准确率: \n', score)

# 决策树的可视化

export_graphviz(estimator, out_file='./titanic-tree.dot',
                feature_names=transfer.get_feature_names_out())


y_predict: 
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 0. 0. 0. 0. 1. 0. 0. 1. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 1. 0. 0. 1. 1.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 1. 0.
 0. 1. 0. 0. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 1. 0. 0. 0. 0. 0.
 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 1. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 0. 0. 0.
 0. 1. 0. 1. 0. 0. 0. 0. 0. 0. 1. 0. 1. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 1. 0.
 0. 0. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0

In [157]:
# 随机森林决策树
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

In [158]:
estimator = RandomForestClassifier()
# 网格搜索和交叉验证 , 模型优化调整
param_grid = {
    'n_estimators': [120,200,300,500,800,1200],
    'max_depth':[5, 8, 15, 25, 30]
}
estimator = GridSearchCV(estimator, param_grid=param_grid, cv=3)
estimator.fit(x_train, y_train)

# 5) 模型评估
# 方法一: 直接对比真实值和预测值
y_predict = estimator.predict(x_test)
print('y_predict: \n', y_predict)
print('直接对比真实值和预测值: \n', y_test == y_predict)

# 方法二: 计算准确率
score = estimator.score(x_test, y_test)
print('准确率: \n', score)

print('最佳参数:\n', estimator.best_params_)
print('最佳结果:\n', estimator.best_score_)
print('最佳估计器:\n', estimator.best_estimator_)
print('交叉验证结果:\n', estimator.cv_results_)


y_predict: 
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 0. 0. 0. 0. 1. 0. 0. 1. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 1. 0. 0. 1. 1.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 1. 0.
 0. 1. 0. 0. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 1. 0. 0. 0. 0. 0.
 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 1. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 0. 0. 0.
 0. 1. 0. 1. 0. 0. 0. 0. 0. 0. 1. 0. 1. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 1. 0.
 0. 0. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0