In [5]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.feature_extraction import DictVectorizer

import pandas as pd

In [6]:
titan = pd.read_csv("./data/titanic3.csv")

# feature
x = titan[["pclass","sex","age"]]

# target
y = titan["survived"]

# impute the NA in x.age
x["age"].fillna(x["age"].mean(), inplace =True)

x["pclass"] = x["pclass"].astype(str)


# 实例化一个转换器
transfer = DictVectorizer(sparse=False)

x = transfer.fit_transform(x.to_dict(orient="records"))

print(transfer.get_feature_names_out())

x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.3)


['age' 'pclass=1' 'pclass=2' 'pclass=3' 'sex=female' 'sex=male']


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  x["age"].fillna(x["age"].mean(), inplace =True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x["age"].fillna(x["age"].mean(), inplace =True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x["pclass"] = x["pclass"].astype(str)


In [8]:
rf = RandomForestClassifier()

param = {"n_estimators": [100, 200, 300, 500,800,1200], "max_depth": [5, 8, 15, 25, 30]}

gc = GridSearchCV(rf, param_grid=param, cv=2)

gc.fit(x_train, y_train)

print("随机森林预测的准确率为：", gc.score(x_test, y_test))

随机森林预测的准确率为： 0.7964376590330788


In [10]:
print("最佳的模型为：", gc.best_estimator_)

最佳的模型为： RandomForestClassifier(max_depth=5)


In [11]:
print("交叉验证的结果：", gc.cv_results_)

交叉验证的结果： {'mean_fit_time': array([0.04659545, 0.07672071, 0.11415911, 0.18189836, 0.29061699,
       0.43555105, 0.03829551, 0.07626152, 0.11501062, 0.19127095,
       0.30455387, 0.45834255, 0.03961504, 0.07830906, 0.11752295,
       0.19628406, 0.3120271 , 0.46926391, 0.03913248, 0.07777715,
       0.11730182, 0.19433129, 0.31142986, 0.4686445 , 0.03934765,
       0.07823348, 0.11747932, 0.19553351, 0.3117038 , 0.4664824 ]), 'std_fit_time': array([7.64048100e-03, 1.83343887e-03, 4.90689278e-03, 2.42471695e-04,
       1.28698349e-03, 4.30345535e-05, 2.90632248e-04, 1.26600266e-04,
       5.16295433e-04, 1.37007236e-03, 1.20699406e-03, 3.79657745e-03,
       5.01990318e-04, 5.92947006e-04, 7.67946243e-04, 8.52823257e-04,
       2.15494633e-03, 4.26590443e-03, 4.91499901e-04, 3.07083130e-04,
       9.65952873e-04, 1.38747692e-03, 2.04002857e-03, 1.91557407e-03,
       3.14474106e-04, 2.89440155e-04, 8.94546509e-04, 1.62148476e-03,
       3.13603878e-03, 4.14633751e-03]), 'mean_score_tim