In [7]:
import matplotlib

matplotlib.use('TkAgg')
from matplotlib import pyplot as plt
import seaborn as sns

from collections import Counter

from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, \
    ExtraTreesClassifier, VotingClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold, learning_curve
import numpy as np
import urllib
import pandas as pd
import os

* ## 读取数据

In [10]:
train = pd.read_csv("/Users/zac/algrithm/python/SklearnOnKaggle/train.csv", sep=",")
test = pd.read_csv("/Users/zac/algrithm/python/SklearnOnKaggle/test.csv", sep=",")
IDtest = test["PassengerId"]
train,test

(     PassengerId  Survived  Pclass  \
 0              1         0       3   
 1              2         1       1   
 2              3         1       3   
 3              4         1       1   
 4              5         0       3   
 5              6         0       3   
 6              7         0       1   
 7              8         0       3   
 8              9         1       3   
 9             10         1       2   
 10            11         1       3   
 11            12         1       1   
 12            13         0       3   
 13            14         0       3   
 14            15         0       3   
 15            16         1       2   
 16            17         0       3   
 17            18         1       2   
 18            19         0       3   
 19            20         1       3   
 20            21         0       2   
 21            22         1       2   
 22            23         1       3   
 23            24         1       1   
 24            25        

In [12]:
def detect_outliers(df, n, features):
    outlier_indices = []
    # 遍历features
    for col in features:
        # 取第一个四分位,即百分位25(nanpercentile可以去掉nan计算百分位,percentile会把nan也算进来)
        Q1 = np.nanpercentile(df[col], 25)
        # 取第三个四分位
        Q3 = np.nanpercentile(df[col], 75)
        # interquartile range ,统计四分位差
        IQR = Q3 - Q1
        # outlier step,设置间距为四分位差的1.5
        outlier_step = 1.5 * IQR
        # 找出落在(Q1-outlier_step,Q3+outlier_step)之外的,即为outlierPoint,记录下这条样本的index
        # (-6.6875,64.8125)
        outlier_list_col = df[(df[col] < Q1 - outlier_step) | (df[col] > Q3 + outlier_step)].index
        # 把每个feature的outlier样本的index保存起来,例如对于年龄(Age)特征来说,outlier的样本记录有11个,index为33, 54, 96, 116, 280, 456, 493, 630, 672, 745, 851
        # df.loc[[33, 54, 96, 116, 280, 456, 493, 630, 672, 745, 851],:]
        # df.loc[[33, 54, 96, 116, 280, 456, 493, 630, 672, 745, 851],["Age"]]
        outlier_indices.extend(outlier_list_col)

    # select observations containing more than 2 outliers
    # 选择样本记录中,包含有超过两(n)个特征都是outlier的,如27行有3个特征是outlied,所以27行是outlier
    outlier_indices = Counter(outlier_indices)
    multiple_outliers = list(k for k, v in outlier_indices.items() if v > n)
    return multiple_outliers


## 去掉包含两个特征异常的记录

In [15]:
# detect outliers from Age, SbiSp, Parch and Fare
Outliers_to_drop = detect_outliers(train, 2, ["Age", "SibSp", "Parch", "Fare"])
# show outlier rows
outlier_row = train.loc[Outliers_to_drop]
# Drop outliers
train = train.drop(Outliers_to_drop, axis=0).reset_index(drop=True)

# 混合train和test,获得整体数据集
dataset = pd.concat(objs=[train, test], axis=0).reset_index(drop=True)

## Fare特征的倾斜太大，用log处理一下

In [19]:
dataset["Fare"] = dataset["Fare"].map(lambda x: np.log(x) if x > 0 else 0)

## Embarked为null的有俩，用出现最多的"S"填充这两个

In [28]:
dataset["Embarked"].isnull().sum()

In [None]:
dataset["Embarked"] = dataset["Embarked"].fillna("S")

## 把性别转成0、1

In [44]:
dataset["Sex"] = dataset["Sex"].map({"male": 0, "female": 1})

## 填充年龄数据

In [29]:
index_NaN_age = list(dataset["Age"][dataset["Age"].isnull()].index)

In [30]:
for i in index_NaN_age:
    # 取所有年龄的均值
    age_med = dataset["Age"].median()
    # 取i列里的SibSp,Parch,Pclass,找数据集中其他和这三个"全都(&)"相等的,找出那些记录的年龄然后取均值
    age_pred = dataset["Age"][((dataset['SibSp'] == dataset.iloc[i]["SibSp"]) &
                               (dataset['Parch'] == dataset.iloc[i]["Parch"]) &
                               (dataset['Pclass'] == dataset.iloc[i]["Pclass"]))].median()
    # 如果age_pred不是Nan就直接用它,否则用全部年龄的均值
    if not np.isnan(age_pred):
        dataset['Age'].iloc[i] = age_pred
    else:
        dataset['Age'].iloc[i] = age_med

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


## 把名字按空格分割，取前半部分（即形式），然后创建新的一列Title

In [31]:
dataset_title = [i.split(",")[1].split(".")[0].strip() for i in dataset["Name"]]

In [36]:
dataset["Title"] = pd.Series(dataset_title)

0           Mr
1          Mrs
2         Miss
3          Mrs
4           Mr
5           Mr
6           Mr
7       Master
8          Mrs
9          Mrs
10        Miss
11        Miss
12          Mr
13          Mr
14        Miss
15         Mrs
16      Master
17          Mr
18         Mrs
19         Mrs
20          Mr
21          Mr
22        Miss
23          Mr
24        Miss
25         Mrs
26          Mr
27        Miss
28          Mr
29         Don
         ...  
1268        Mr
1269    Master
1270        Mr
1271       Mrs
1272    Master
1273        Mr
1274        Mr
1275       Mrs
1276        Mr
1277       Mrs
1278        Mr
1279        Mr
1280      Miss
1281        Mr
1282      Miss
1283        Mr
1284        Mr
1285        Mr
1286        Mr
1287        Mr
1288      Miss
1289      Miss
1290      Miss
1291       Mrs
1292      Miss
1293        Mr
1294      Dona
1295        Mr
1296        Mr
1297    Master
Name: Title, Length: 1298, dtype: object

#### 这里还需要把Title继续、统一

In [37]:
dataset["Title"] = dataset["Title"].replace(['Lady', 'the Countess', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')
dataset["Title"] = dataset["Title"].map(
    {"Master": 0, "Miss": 1, "Ms": 1, "Mme": 1, "Mlle": 1, "Mrs": 1, "Mr": 2, "Rare": 3})
dataset["Title"] = dataset["Title"].astype(int)
dataset.drop(labels=["Name"], axis=1, inplace=True)

## 依据Parch和SibSp创建一个新的特征（家庭大小）Fsize

In [None]:
dataset["Fsize"] = dataset["SibSp"] + dataset["Parch"] + 1

In [39]:
dataset['Single'] = dataset['Fsize'].map(lambda s: 1 if s == 1 else 0)
dataset['SmallF'] = dataset['Fsize'].map(lambda s: 1 if s == 2  else 0)
dataset['MedF'] = dataset['Fsize'].map(lambda s: 1 if 3 <= s <= 4 else 0)
dataset['LargeF'] = dataset['Fsize'].map(lambda s: 1 if s >= 5 else 0)

## 类似one-hot编码

In [None]:
dataset = pd.get_dummies(dataset, columns=["Title"])
dataset = pd.get_dummies(dataset, columns=["Embarked"], prefix="Em")

## Cabin,统计发现有很多人的舱位是null,实际情况应该是把票里没有舱位的人也置为null,而不单纯是数据丢失

In [None]:
dataset["Cabin"].isnull().sum()
dataset["Cabin"].describe()

## 把没舱位的人都设为"X",其他人就取舱位号的首字母,因为ABCDEFG舱可能表示在不同的位置

In [41]:
dataset["Cabin"] = pd.Series(['X' if pd.isnull(i) else i[0] for i in dataset['Cabin']])

In [42]:
dataset = pd.get_dummies(dataset, columns=["Cabin"], prefix="Cabin")

In [45]:
dataset

Unnamed: 0,Age,Embarked,Fare,Name,Parch,PassengerId,Pclass,Sex,SibSp,Survived,...,Title,Cabin_A,Cabin_B,Cabin_C,Cabin_D,Cabin_E,Cabin_F,Cabin_G,Cabin_T,Cabin_X
0,22.0,S,-0.380379,"Braund, Mr. Owen Harris",0,1,3,0,1,0.0,...,2,0,0,0,0,0,0,0,0,1
1,38.0,C,0.372137,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,2,1,1,1,1.0,...,1,0,0,1,0,0,0,0,0,0
2,26.0,S,-0.318060,"Heikkinen, Miss. Laina",0,3,3,1,0,1.0,...,1,0,0,0,0,0,0,0,0,1
3,35.0,S,0.321586,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,4,1,1,1,1.0,...,1,0,0,1,0,0,0,0,0,0
4,35.0,S,-0.307761,"Allen, Mr. William Henry",0,5,3,0,0,0.0,...,2,0,0,0,0,0,0,0,0,1
5,25.0,Q,-0.276365,"Moran, Mr. James",0,6,3,0,0,0.0,...,2,0,0,0,0,0,0,0,0,1
6,54.0,S,0.317260,"McCarthy, Mr. Timothy J",0,7,1,0,0,0.0,...,2,0,0,0,0,1,0,0,0,0
7,2.0,S,0.108419,"Palsson, Master. Gosta Leonard",1,8,3,0,3,0.0,...,0,0,0,0,0,0,0,0,0,1
8,27.0,S,-0.128286,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",2,9,3,1,0,1.0,...,1,0,0,0,0,0,0,0,0,1
9,14.0,C,0.202794,"Nasser, Mrs. Nicholas (Adele Achem)",0,10,2,1,1,1.0,...,1,0,0,0,0,0,0,0,0,1
