#### Pandas怎样找出最影响结果的那些特征？
应用场景：
+ 机器学习的特征选择，去除无用的特征，可以提升模型效果，降低训练时间等等
+ 数据分析领域，找出收入波动的最大因素！！

#### 实例演示：titanic事件中，最影响生死的因素有哪些？

#### 1. 导入相关的包

In [30]:
import pandas as pd
import numpy as np

# Selects the K features that most influence the result
from sklearn.feature_selection import SelectKBest

# 卡方检验，作为SelectKBest的参数
from sklearn.feature_selection import chi2

#### 2.导入titanic数据

In [31]:
# Load the Titanic passenger records from the local CSV file.
df = pd.read_csv(filepath_or_buffer='./files/titanic.csv')
# Preview the first five rows.
df.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,0,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,1,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,0,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,0,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,1,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [32]:
# Keep only the columns used in the rest of the analysis; .copy() gives an
# independent frame rather than a selection of the one read from disk.
df = df.loc[
    :,
    ['PassengerId', 'Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch',
     'Fare', 'Embarked'],
].copy()
df.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,892,0,3,male,34.5,0,0,7.8292,Q
1,893,1,3,female,47.0,1,0,7.0,S
2,894,0,2,male,62.0,0,0,9.6875,Q
3,895,0,3,male,27.0,0,0,8.6625,S
4,896,1,3,female,22.0,1,1,12.2875,S


#### 3.数据清理和转换

##### 3.1 查看是否有空值列

In [33]:
# Print each column's dtype and non-null count to spot columns with missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Survived     418 non-null    int64  
 2   Pclass       418 non-null    int64  
 3   Sex          418 non-null    object 
 4   Age          332 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Fare         417 non-null    float64
 8   Embarked     418 non-null    object 
dtypes: float64(2), int64(5), object(2)
memory usage: 29.5+ KB


##### 3.2 给Age,Fare列的空值填充中位数

In [34]:
# Replace missing Age and Fare values with the median of the respective column.
for col in ('Age', 'Fare'):
    df[col] = df[col].fillna(df[col].median())
df.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,892,0,3,male,34.5,0,0,7.8292,Q
1,893,1,3,female,47.0,1,0,7.0,S
2,894,0,2,male,62.0,0,0,9.6875,Q
3,895,0,3,male,27.0,0,0,8.6625,S
4,896,1,3,female,22.0,1,1,12.2875,S


##### 3.3 将性别变成数字

In [35]:
# List the distinct labels present in the Sex column.
df['Sex'].unique()

array(['male', 'female'], dtype=object)

In [36]:
# Encode Sex as integers (male -> 0, female -> 1) in a single pass.
# Mapping the whole column and casting to int64 yields a true integer column
# instead of an object column holding Python ints. Any value missing from the
# mapping becomes NaN, so the cast raises instead of silently leaving stray
# labels in the data.
df['Sex'] = df['Sex'].map({'male': 0, 'female': 1}).astype('int64')

In [37]:
# Preview the first five rows to check the encoded Sex column.
df.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,892,0,3,0,34.5,0,0,7.8292,Q
1,893,1,3,1,47.0,1,0,7.0,S
2,894,0,2,0,62.0,0,0,9.6875,Q
3,895,0,3,0,27.0,0,0,8.6625,S
4,896,1,3,1,22.0,1,1,12.2875,S


##### 3.4 给Embarked列填充空值，字符串转化为数字

In [38]:
# List the distinct labels present in the Embarked column.
df['Embarked'].unique()

array(['Q', 'S', 'C'], dtype=object)

In [39]:
# Encode Embarked as integers: missing -> 0, S -> 1, C -> 2, Q -> 3.
# Missing values are filled with 0 first, then the whole column is translated
# through one mapping and cast to int64, giving a true integer column instead
# of an object column holding Python ints. Any value missing from the mapping
# becomes NaN, so the cast raises instead of silently leaving stray labels in
# the data.
df['Embarked'] = (
    df['Embarked']
    .fillna(0)
    .map({0: 0, 'S': 1, 'C': 2, 'Q': 3})
    .astype('int64')
)

In [40]:
# Preview the first five rows to check the encoded Embarked column.
df.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,892,0,3,0,34.5,0,0,7.8292,3
1,893,1,3,1,47.0,1,0,7.0,1
2,894,0,2,0,62.0,0,0,9.6875,3
3,895,0,3,0,27.0,0,0,8.6625,1
4,896,1,3,1,22.0,1,1,12.2875,1


#### 4. 将特征列和结果列拆分开

In [41]:
# X and df refer to the same DataFrame; pop() removes the Survived column
# from it in place and returns that column as the target Series y.
X = df
y = X.pop('Survived')

In [42]:
# Preview the first five rows of the feature matrix.
X.head(n=5)

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,892,3,0,34.5,0,0,7.8292,3
1,893,3,1,47.0,1,0,7.0,1
2,894,2,0,62.0,0,0,9.6875,3
3,895,3,0,27.0,0,0,8.6625,1
4,896,3,1,22.0,1,1,12.2875,1


In [43]:
# Preview the first five values of the target column.
y.head(n=5)

0    0
1    1
2    0
3    0
4    1
Name: Survived, dtype: int64

#### 5. 使用卡方检验选择topK的特征

In [44]:
# Keep every feature (k equals the number of columns in X) so that a score is
# available for ranking all of them.
# NOTE(review): chi2 is documented for non-negative features such as booleans
# or counts; the scores for continuous columns (Age, Fare), the PassengerId
# identifier and the arbitrary Embarked codes may mostly reflect their scale.
# Confirm that this is the intended scoring function.
bestfeatures = SelectKBest(chi2, k=X.shape[1])
fit = bestfeatures.fit(X, y)

#### 6. 按照重要性顺序打印特征列表

In [45]:
# Wrap the per-feature scores from the fitted selector in a one-column frame.
df_scores = pd.DataFrame(data=fit.scores_)
df_scores

Unnamed: 0,0
0,2.988266
1,1.538844
2,266.0
3,0.146771
4,7.487395
5,25.920078
6,1348.297051
7,2.151261


In [46]:
# Put the feature names in a one-column frame, in the same order as X's columns.
df_columns = pd.DataFrame(data=X.columns)
df_columns

Unnamed: 0,0
0,PassengerId
1,Pclass
2,Sex
3,Age
4,SibSp
5,Parch
6,Fare
7,Embarked


In [49]:
# Place the feature names and their scores side by side, then label the two
# resulting columns.
df_feature_scores = pd.concat(
    [df_columns, df_scores], axis='columns'
).set_axis(['feature_name', 'Score'], axis=1)
# Display the combined table.
df_feature_scores

Unnamed: 0,feature_name,Score
0,PassengerId,2.988266
1,Pclass,1.538844
2,Sex,266.0
3,Age,0.146771
4,SibSp,7.487395
5,Parch,25.920078
6,Fare,1348.297051
7,Embarked,2.151261


In [50]:
# Show the features ordered from the highest score to the lowest.
df_feature_scores.sort_values(by=['Score'], ascending=[False])

Unnamed: 0,feature_name,Score
6,Fare,1348.297051
2,Sex,266.0
5,Parch,25.920078
4,SibSp,7.487395
0,PassengerId,2.988266
7,Embarked,2.151261
1,Pclass,1.538844
3,Age,0.146771
