In [1]:
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

In [2]:
titanic=pd.read_csv(r'http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic.txt')

In [3]:
X=titanic.drop(columns=['row.names','name','survived'])
y=titanic['survived']

In [4]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1313 entries, 0 to 1312
Data columns (total 8 columns):
pclass       1313 non-null object
age          633 non-null float64
embarked     821 non-null object
home.dest    754 non-null object
room         77 non-null object
ticket       69 non-null object
boat         347 non-null object
sex          1313 non-null object
dtypes: float64(1), object(7)
memory usage: 82.1+ KB


In [5]:
X['age'].fillna(X['age'].mean(),inplace=True)

In [6]:
X.fillna('UNKNOWN',inplace=True)

In [7]:
from sklearn.model_selection import train_test_split

In [8]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.25,random_state=33)

In [9]:
from sklearn.feature_extraction import DictVectorizer

In [10]:
vec=DictVectorizer()

In [11]:
X_train

Unnamed: 0,pclass,age,embarked,home.dest,room,ticket,boat,sex
1086,3rd,31.194181,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,male
12,1st,31.194181,Cherbourg,"Paris, France",B-35,17477 L69 6s,9,female
1036,3rd,31.194181,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,male
833,3rd,32.000000,Southampton,"Foresvik, Norway Portland, ND",UNKNOWN,UNKNOWN,UNKNOWN,male
1108,3rd,31.194181,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,male
562,2nd,41.000000,Cherbourg,"New York, NY",UNKNOWN,UNKNOWN,UNKNOWN,male
437,2nd,48.000000,Southampton,"Somerset / Bernardsville, NJ",UNKNOWN,UNKNOWN,9,female
663,3rd,26.000000,Southampton,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,male
669,3rd,19.000000,Southampton,England,UNKNOWN,UNKNOWN,UNKNOWN,male
507,2nd,31.194181,Southampton,"Petworth, Sussex",UNKNOWN,UNKNOWN,UNKNOWN,male


In [12]:
X_train=vec.fit_transform(X_train.to_dict(orient='record'))
X_test=vec.transform(X_test.to_dict(orient='record'))

In [13]:
len(vec.feature_names_)

474

In [14]:
# 使用决策树模型依靠所有特征进行预测，并作性能评估
from sklearn.tree import DecisionTreeClassifier
dt=DecisionTreeClassifier(criterion='entropy')
print(dt.fit(X=X_train,y=y_train))
print(dt.score(X_test,y_test))

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')
0.8328267477203647


In [15]:
# 筛选前20%的特征，使用相同配置的决策树模型进行预测，并且评估模型
from sklearn.feature_selection import chi2,SelectPercentile
fs=SelectPercentile(score_func=chi2,percentile=20)
X_train_fs=fs.fit_transform(X=X_train,y=y_train)
X_test_fs=fs.transform(X=X_test)
dt.fit(X=X_train_fs,y=y_train)
print(dt.score(X=X_test_fs,y=y_test))

0.8267477203647416


In [16]:
from sklearn.model_selection import cross_val_score
import numpy as np

In [17]:
percentiles=np.arange(1,100,2)
results=[]
print(percentiles.shape)

(50,)


In [18]:
for i in percentiles:
    fs=SelectPercentile(score_func=chi2,percentile=i)
    X_train_fs=fs.fit_transform(X_train,y_train)
    scores=cross_val_score(estimator=dt,X=X_train_fs,y=y_train,cv=6)
    results=np.append(results,scores.mean())
print(results)

[0.85162277 0.85873674 0.87397471 0.88519105 0.86786453 0.86889942
 0.86888702 0.86583809 0.86585056 0.86891797 0.87193588 0.87400566
 0.8699467  0.86383036 0.8709568  0.86588774 0.87397471 0.87197929
 0.87095064 0.86791418 0.86888086 0.86992191 0.86992815 0.86686066
 0.86482822 0.87500967 0.87195458 0.86987865 0.87805229 0.86380565
 0.87399326 0.87090099 0.87093209 0.86890573 0.87093832 0.86989096
 0.8770298  0.8698848  0.86989096 0.86890573 0.86990952 0.86990328
 0.86994062 0.87197321 0.86585687 0.87094448 0.86893044 0.86687305
 0.86586295 0.86686066]


In [19]:
# 找到体现最佳性能的特征筛选的百分比
# opt:(array([3], dtype=int64),)
opt=np.where(results==results.max())
# type(percentiles[opt]:<class 'numpy.ndarray'>
print("Optimal number of features "+str(percentiles[opt][0]))

Optimal number of features 7


In [20]:
import matplotlib.pyplot as plt
plt.plot(percentiles,results)
plt.xlabel='Percentiles of features'
plt.ylabel='accuracy'
plt.show()

<Figure size 640x480 with 1 Axes>

In [21]:
# 最后使用最佳筛选后的前7%的特征，利用相同setting的模型在test set上进行性能评估
from sklearn.feature_selection import chi2,SelectPercentile
fs=SelectPercentile(score_func=chi2,percentile=7)
X_train_fs=fs.fit_transform(X_train,y_train)
X_test_fs=fs.transform(X_test)

In [22]:
dt.fit(X_train_fs,y_train)
print(dt.score(X=X_test_fs,y=y_test))

0.8571428571428571
