In [2]:
import pandas as pd

In [3]:
# 获取数据
path = './titanic.csv'
titanic = pd.read_csv(path)
titanic.head()

Unnamed: 0,row.names,pclass,survived,name,age,embarked,home.dest,room,ticket,boat,sex
0,1,1st,1,"Allen, Miss Elisabeth Walton",29.0,Southampton,"St Louis, MO",B-5,24160 L221,2,female
1,2,1st,0,"Allison, Miss Helen Loraine",2.0,Southampton,"Montreal, PQ / Chesterville, ON",C26,,,female
2,3,1st,0,"Allison, Mr Hudson Joshua Creighton",30.0,Southampton,"Montreal, PQ / Chesterville, ON",C26,,(135),male
3,4,1st,0,"Allison, Mrs Hudson J.C. (Bessie Waldo Daniels)",25.0,Southampton,"Montreal, PQ / Chesterville, ON",C26,,,female
4,5,1st,1,"Allison, Master Hudson Trevor",0.9167,Southampton,"Montreal, PQ / Chesterville, ON",C22,,11,male


In [4]:
# 筛选特征值和目标值
x = titanic[['pclass', 'age', 'sex' ]]
y = titanic['survived']

In [5]:
print('x.head(): \n', x.head())
print('y.head(): \n', y.head())

x.head(): 
   pclass      age     sex
0    1st  29.0000  female
1    1st   2.0000  female
2    1st  30.0000    male
3    1st  25.0000  female
4    1st   0.9167    male
y.head(): 
 0    1
1    0
2    0
3    0
4    1
Name: survived, dtype: int64


In [6]:
# 数据处理：缺失值处理
x['age'].fillna(x['age'].mean(), inplace=True)
x

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  x['age'].fillna(x['age'].mean(), inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x['age'].fillna(x['age'].mean(), inplace=True)


Unnamed: 0,pclass,age,sex
0,1st,29.000000,female
1,1st,2.000000,female
2,1st,30.000000,male
3,1st,25.000000,female
4,1st,0.916700,male
...,...,...,...
1308,3rd,31.194181,male
1309,3rd,31.194181,male
1310,3rd,31.194181,male
1311,3rd,31.194181,female


In [7]:
# 转换成字典
x = x.to_dict(orient='records')
x

[{'pclass': '1st', 'age': 29.0, 'sex': 'female'},
 {'pclass': '1st', 'age': 2.0, 'sex': 'female'},
 {'pclass': '1st', 'age': 30.0, 'sex': 'male'},
 {'pclass': '1st', 'age': 25.0, 'sex': 'female'},
 {'pclass': '1st', 'age': 0.9167, 'sex': 'male'},
 {'pclass': '1st', 'age': 47.0, 'sex': 'male'},
 {'pclass': '1st', 'age': 63.0, 'sex': 'female'},
 {'pclass': '1st', 'age': 39.0, 'sex': 'male'},
 {'pclass': '1st', 'age': 58.0, 'sex': 'female'},
 {'pclass': '1st', 'age': 71.0, 'sex': 'male'},
 {'pclass': '1st', 'age': 47.0, 'sex': 'male'},
 {'pclass': '1st', 'age': 19.0, 'sex': 'female'},
 {'pclass': '1st', 'age': 31.19418104265403, 'sex': 'female'},
 {'pclass': '1st', 'age': 31.19418104265403, 'sex': 'male'},
 {'pclass': '1st', 'age': 31.19418104265403, 'sex': 'male'},
 {'pclass': '1st', 'age': 50.0, 'sex': 'female'},
 {'pclass': '1st', 'age': 24.0, 'sex': 'male'},
 {'pclass': '1st', 'age': 36.0, 'sex': 'male'},
 {'pclass': '1st', 'age': 37.0, 'sex': 'male'},
 {'pclass': '1st', 'age': 47.0, 

In [8]:
# 数据集划分
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.tree import DecisionTreeClassifier, export_graphviz
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=22)

In [9]:
# 字典特征抽取
transfer = DictVectorizer()
x_train = transfer.fit_transform(x_train)
x_test = transfer.transform(x_test)

In [10]:
# 决策树预估器
estimator = DecisionTreeClassifier(criterion='entropy', max_depth=8)
estimator.fit(x_train, y_train)
# 模型评估
y_predict = estimator.predict(x_test)
print("Predicted labels:\n", y_predict) 
print("直接比对真实值和预测值\n", y_test == y_predict)
score = estimator.score(x_test, y_test)
print("Accuracy:\n", score)
# 可视化决策树
export_graphviz(estimator, out_file='titanic_tree.dot', feature_names=transfer.get_feature_names_out())

Predicted labels:
 [0 0 0 0 1 1 1 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 1 0 1 0 1 0 0 0
 0 0 0 0 0 0 0 0 0 1 1 0 1 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0
 0 1 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 1 0 1 0 0 0 0 0 0 0 0 0 0 1
 0 1 0 0 0 0 0 0 0 0 0 1 0 0 1 1 0 1 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 1 1
 1 0 0 0 0 0 0 0 1 0 0 0 0 0 1 1 0 1 0 0 0 0 1 0 0 0 0 0 1 1 1 1 0 0 0 0 0
 0 0 0 1 1 1 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 1 0 0 0 0 0 1 0
 0 0 1 0 0 0 0 0 0 1 0 0 0 0 1 0 1 0 0 0 1 0 0 1 0 0 0 0 1 0 1 0 0 0 0 1 0
 0 1 1 1 0 0 1 1 1 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 1
 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 1 0 1 0 0 0 0 1]
直接比对真实值和预测值
 831      True
261      True
1210     True
1155     True
255      True
        ...  
1146     True
1125    False
386      True
1025    False
337      True
Name: survived, Length: 329, dtype: bool
Accuracy:
 0.7811550151975684


In [12]:
# 随机深林
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

In [13]:
estimator = RandomForestClassifier()
# 加入网格搜索
# Define the parameter grid for GridSearchCV
param_grid = {
    "n_estimators": [120, 200, 300, 500, 800, 1200],
    "max_depth": [5, 8, 15, 25, 30],
}
# Create a GridSearchCV object
estimator = GridSearchCV(estimator=estimator, param_grid=param_grid, cv=3)
# Fit the model to the training data
estimator.fit(x_train, y_train)
# Predict the labels of the test set
y_predict = estimator.predict(x_test)
print("Predicted labels:\n", y_predict)
print("直接比对真实值和预测值\n", y_test == y_predict)
# Calculate the accuracy of the model
score = estimator.score(x_test, y_test)
print("Accuracy:\n", score)
# Best parameters, score, estimator, and cv_results
print("Best parameters:\n", estimator.best_params_)
print("Best score:\n", estimator.best_score_)
print("Best estimator:\n", estimator.best_estimator_)
print("CV results:\n", estimator.cv_results_)

Predicted labels:
 [0 0 0 0 1 1 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 1 0 1 0 1 0 0 0
 0 0 0 0 0 0 0 0 0 1 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0
 0 1 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 1 0 1 0 0 0 0 0 0 0 0 0 0 1
 0 1 0 0 0 0 0 0 0 0 0 1 0 0 1 1 0 1 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 1 1
 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 1 0 0 0 0 1 0 0 0 0 0 1 1 1 1 0 0 0 0 0
 0 0 0 1 1 1 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 1 0 0 0 0 0 1 0
 0 0 1 0 0 0 0 0 0 1 0 0 0 0 1 0 1 0 0 0 1 0 0 1 0 0 0 0 1 0 1 0 0 0 0 1 0
 0 1 1 1 0 0 1 1 1 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 1
 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 1 0 1 0 0 0 0 1]
直接比对真实值和预测值
 831      True
261      True
1210     True
1155     True
255      True
        ...  
1146     True
1125    False
386      True
1025    False
337      True
Name: survived, Length: 329, dtype: bool
Accuracy:
 0.7872340425531915
Best parameters:
 {'max_depth': 5, 'n_estimators': 200}
Best score:
 0.8363