## 数据预处理和必须的库

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python

#load packages
import sys #access to system parameters https://docs.python.org/3/library/sys.html
print("Python version: {}". format(sys.version))

import pandas as pd #collection of functions for data processing and analysis modeled after R dataframes with SQL like features
print("pandas version: {}". format(pd.__version__))

import matplotlib #collection of functions for scientific and publication-ready visualization
print("matplotlib version: {}". format(matplotlib.__version__))

import numpy as np #foundational package for scientific computing
print("NumPy version: {}". format(np.__version__))

import scipy as sp #collection of functions for scientific computing and advance mathematics
print("SciPy version: {}". format(sp.__version__)) 

import IPython
from IPython import display #pretty printing of dataframes in Jupyter notebook
print("IPython version: {}". format(IPython.__version__)) 

import sklearn #collection of machine learning algorithms
print("scikit-learn version: {}". format(sklearn.__version__))

#misc libraries
import random
from time import time
import datetime
import string

#ignore warnings
import warnings
#warnings.filterwarnings('ignore')
warnings.simplefilter("ignore")
print('-'*25)


# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) 
# will list the files in the input directory

from subprocess import check_output
# unix系统中每一个进程会返回一个状态码 默认0为进程执行正常,其他为错误码
print(check_output(["ls", "./data"]).decode("utf8"))

# Any results you write to the current directory are saved as output.

Python version: 3.7.3 (default, Mar 27 2019, 22:11:17) 
[GCC 7.3.0]
pandas version: 0.24.2
matplotlib version: 3.1.0
NumPy version: 1.16.4
SciPy version: 1.3.0
IPython version: 7.6.1
scikit-learn version: 0.21.2
-------------------------
gender_submission.csv
test.csv
train.csv



## 导入模型训练的库

In [2]:
#Common Model Algorithms
#from sklearn import svm, tree, linear_model, neighbors, naive_bayes, ensemble, discriminant_analysis,gaussian_process
from xgboost import XGBClassifier
#from sklearn.tree import DecisionTreeClassifier
#from sklearn.ensemble import RandomForestRegressor

import xgboost as xgb
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import plot_importance

#Common Model Helpers
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.preprocessing import StandardScaler,QuantileTransformer
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score,cross_validate
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import learning_curve
from sklearn.model_selection import ShuffleSplit

from sklearn import feature_selection
from sklearn import model_selection
from sklearn import metrics
from sklearn.metrics import accuracy_score

#Visualization
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import seaborn as sns
from pandas.plotting import scatter_matrix

#Configure Visualization Defaults
#%matplotlib inline = show plots in Jupyter Notebook browser
%matplotlib inline
mpl.style.use('ggplot')
sns.set_style('white')
pylab.rcParams['figure.figsize'] = 12,8
# 这个加在这里就不会出现那么多烦人的warning了
warnings.simplefilter("ignore")

## 预览数据

In [3]:
#import data from file: https://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_csv.html
#data_raw = pd.read_csv('./data/train.csv')
#a dataset should be broken into 3 splits: train, test, and (final) validation
#the test file provided is the validation file for competition submission
#we will split the train set into train and test data in future sections
#X_val  = pd.read_csv('./data/test.csv')
# to play with our data we'll create a copy
# remember python assignment or equal passes by reference vs values, 
# so we use the copy function: https://stackoverflow.com/questions/46327494/python-pandas-dataframe-copydeep-false-vs-copydeep-true-vs
#X_train = data_raw.copy(deep = True)

#however passing by reference is convenient, because we can clean both datasets at once
# 训练集和验证集合并在一起
#data_cleaner = [X_train, X_val]

#preview data
#print(data_raw.info()) #https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.info.html
#data_raw.head() #https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.head.html
#data_raw.tail() #https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.tail.html
#data_raw.sample(10) #https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.sample.html

## 生成描述统计趋势的描述性统计数据，

In [4]:
#print('Train columns with null values:\n', X_train.isnull().sum())
#print("-"*10)

#print('Test/Validation columns with null values:\n', X_val.isnull().sum())
#print("-"*10)

#data_raw.describe(include = 'all')

## 清洗数据

### 完善缺失值

In [5]:
def load_data():
    data_raw = pd.read_csv('./data/train.csv')
    X_val  = pd.read_csv('./data/test.csv')
    X_train = data_raw.copy(deep = True)
    data_cleaner = [X_train, X_val]
    return data_cleaner

def show_data_value_counts_column(column):
    data = load_data()
    X_train= data[0]
    X_val = data[1]
    print('Test/Validation value counts:\n', X_train[column].value_counts())
    print("-"*10)
    print('Test/Validation value counts:\n', X_val[column].value_counts())
    print("-"*10)
    
def concat_df(X_train, X_val):
    # Returns a concatenated df of training and test set on axis 0
    return pd.concat([X_train, X_val], sort=True).reset_index(drop=True)

def divide_df(X):
    # Returns divided dfs of training and test set
    X_train = X.loc[:890]
    X_val = X.loc[891:].drop(['Survived'],axis=1)
    return X_train,X_val

def load_all_data():
    X_train, X_val = load_data()
    return concat_df(X_train,X_val)

def show_data_na():
    data = load_data()
    X_train= data[0]
    X_val = data[1]
    print('Train columns with null values:\n', X_train.isnull().sum())
    print("-"*10)

    print('Test/Validation columns with null values:\n', X_val.isnull().sum())
    print("-"*10)
    
def show_data_info():
    data = load_data()
    X_train= data[0]
    X_val = data[1]
    print('Train columns info:\n', X_train.info())
    print("-"*10)
    print('Test/Validation info:\n', X_val.info())
    print("-"*10)   
    
def show_data_describe_column(column):
    data = load_data()
    X_train= data[0]
    X_val = data[1]
    print('Train columns describe:\n', X_train[column].describe())
    print('Test/Validation value counts:\n', X_train[column].value_counts())
    print("-"*10)
    print('Test/Validation describe:\n', X_val[column].describe())
    print('Test/Validation value counts:\n', X_val[column].value_counts())
    print("-"*10)    

In [6]:
load_all_data().info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 12 columns):
Age            1046 non-null float64
Cabin          295 non-null object
Embarked       1307 non-null object
Fare           1308 non-null float64
Name           1309 non-null object
Parch          1309 non-null int64
PassengerId    1309 non-null int64
Pclass         1309 non-null int64
Sex            1309 non-null object
SibSp          1309 non-null int64
Survived       891 non-null float64
Ticket         1309 non-null object
dtypes: float64(3), int64(4), object(5)
memory usage: 122.8+ KB


In [7]:
# 通过分析相关系数,用groupby通过Sex和Pclass分级对Age求平均值进行缺失值填充.
def completing_age_na(dt):
    for X in dt:
        X['Age'] = X.groupby(['Sex', 'Pclass'])['Age'].apply(lambda x: x.fillna(x.median()))

In [8]:
# 用最多上岸港口进行填充
def completing_embarked_na(dt):
    for X in dt:
        X['Embarked'].fillna('S',inplace=True)

In [9]:
# Fare 只缺失一个值,通过相关系数分析,Fare和Parch,SibSp,Pclass 相关程度最高.
def completing_fare_na(dt):
    for X in dt:
        med_fare = X.groupby(['Pclass', 'Parch', 'SibSp']).Fare.median()[3][0][0]
        X['Fare'].fillna(med_fare, inplace = True)

In [10]:
def completing_cabin_na(dt,shrink):
    for X in dt:
        X['Deck'] = X['Cabin'].apply(lambda s: s[0] if pd.notnull(s) else 'M')
        # T只有一个乘客,归为A类
        idx = X[X['Deck'] == 'T'].index
        X.loc[idx, 'Deck'] = 'A'
        # 缩减特征维度
        if shrink :
            X['Deck'] = X['Deck'].replace(['A', 'B', 'C'], 'ABC')
            X['Deck'] = X['Deck'].replace(['D', 'E'], 'DE')
            X['Deck'] = X['Deck'].replace(['F', 'G'], 'FG')

In [11]:
def drop_useless(dt,drop_column):
    for X in dt:
        X.drop(columns=drop_column, axis=1, inplace = True)
        X.reset_index()

## 创建新的特征

In [12]:
# 家庭大小.
def create_familysize(dt,shrink):
    for X in dt:
        X['FamilySize'] = X ['SibSp'] + X['Parch'] + 1
        # 高维特征
        if shrink:
            family_map = {1: 'Alone', 2: 'Small', 3: 'Small', 4: 'Small', 5: 'Medium', 6: 'Medium', 7: 'Large', 8: 'Large', 11: 'Large'}
            X['FamilySizeGrouped'] = X['FamilySize'].map(family_map)

In [13]:
# 许多乘客与团体一起旅行。这些团体由朋友，保姆，女佣等组成。他们不算家庭，但他们使用相同的票。
def create_ticket_freq(dt):
    for X in dt:
        X['TicketFrequency'] = X.groupby('Ticket')['Ticket'].transform('count')

In [14]:
# 票价分布 用频率划分
def create_FareBin(dt,bins = 13):
    for X in dt:
        X['FareBin'] = pd.qcut(X['Fare'], q = bins)

In [15]:
# 年龄 同样用频率划分
def create_AgeBin(dt,bins = 10):
    for X in dt:
        X['AgeBin'] = pd.qcut(X['Age'].astype(int), q = bins)

In [16]:
# 头衔
def create_title(dt,shrink):
    for X in dt:
        X['Title'] = X['Name'].str.split(', ', expand=True)[1].str.split('.', expand=True)[0]
        if shrink :
            X['IsMarried'] = 0
            X['IsMarried'].loc[X['Title'] == 'Mrs'] = 1
            X['Title'] = X['Title'].replace(['Miss', 'Mrs','Ms', 'Mlle', 'Lady', 'Mme', 'the Countess', 'Dona'], 'Miss/Mrs/Ms')
            X['Title'] = X['Title'].replace(['Dr', 'Col', 'Major', 'Jonkheer', 'Capt', 'Sir', 'Don'], 'Dr/Military/Noble/Clergy')
            # Leave Master and Rev

In [17]:
# 获取每个乘客的姓氏,
def create_familyonboard(dt):
    def extract_surname(X):    
        families = []
        for i in range(len(X)):        
            name = X.iloc[i]
            if '(' in name:
                name_no_bracket = name.split('(')[0] 
            else:
                name_no_bracket = name
            family = name_no_bracket.split(',')[0]
            title = name_no_bracket.split(',')[1].strip().split(' ')[0]
            for c in string.punctuation:
                family = family.replace(c, '').strip()
            families.append(family)
        return families
    for X in dt:
        X['FamilyOnBoard'] = extract_surname(X['Name'])
    #X['FamilyOnBoardSize'] = X.groupby('FamilyOnBoard')['FamilyOnBoard'].transform('count')

In [18]:
# 魔法特征.
def create_survival_rate(dt, shrink=True):
    X_train,X_val = dt[0],dt[1]

    # 创建票号或者家庭名称同时出现在验证集和训练集中的列表.
    non_unique_families = [x for x in X_train['FamilyOnBoard'].unique() \
                               if x in X_val['FamilyOnBoard'].unique()]
    non_unique_tickets = [x for x in X_train['Ticket'].unique() \
                              if x in X_val['Ticket'].unique()]

    # 多用于可视化
    df_family_survival_rate = X_train.groupby('FamilyOnBoard')['Survived', 'FamilyOnBoard','FamilySize'].median()
    df_ticket_survival_rate = X_train.groupby('Ticket')['Survived', 'Ticket','TicketFrequency'].median()

    family_rates = {}
    ticket_rates = {}

    # 搜集同时在训练集和验证集中家庭数目 > 1 的生存率,index是family的名称.1是FamilySize,0是生存率
    for i in range(len(df_family_survival_rate)):
        if df_family_survival_rate.index[i] in non_unique_families and \
            df_family_survival_rate.iloc[i, 1] > 1:
            family_rates[df_family_survival_rate.index[i]] = df_family_survival_rate.iloc[i, 0]

    # 搜集同时在训练集和验证集中票出现频率 > 1 的生存率,index是Ticket的名称.1是TicketFrequency,0是生存率
    for i in range(len(df_ticket_survival_rate)):
        if df_ticket_survival_rate.index[i] in non_unique_tickets and \
            df_ticket_survival_rate.iloc[i, 1] > 1:
            ticket_rates[df_ticket_survival_rate.index[i]] = df_ticket_survival_rate.iloc[i, 0]


    # 平均生存率
    mean_survival_rate = np.mean(X_train['Survived'])

    # 训练集和验证集的家庭所对应的生存率
    train_family_survival_rate = np.empty(X_train.shape[0])
    val_family_survival_rate = np.empty(X_val.shape[0])
    # 辅助特征*_family_survival_rate_NA,0或1分别代表同时没出现/出现在测试集和验证集中,没出现的赋予平均生存率
    train_family_survival_rate_NA = np.empty(X_train.shape[0],dtype='int32')
    val_family_survival_rate_NA = np.empty(X_val.shape[0],dtype='int32')

    # 训练集
    for i in range(len(X_train)):
        familyOnBoard = X_train['FamilyOnBoard'].iloc[i]
        if familyOnBoard in family_rates:
            train_family_survival_rate[i] = family_rates[familyOnBoard]
            train_family_survival_rate_NA[i] = 1
        else:
            train_family_survival_rate[i] = mean_survival_rate
            train_family_survival_rate_NA[i] = 0

    # 验证集
    for i in range(len(X_val)):
        familyOnBoard = X_val['FamilyOnBoard'].iloc[i]
        if familyOnBoard in family_rates:
            val_family_survival_rate[i] = family_rates[familyOnBoard]
            val_family_survival_rate_NA[i] = 1
        else:
            val_family_survival_rate[i] = mean_survival_rate
            val_family_survival_rate_NA[i] = 0

    X_train['FamilySurvivalRate'] = train_family_survival_rate
    X_train['FamilySurvivalRateNA'] = train_family_survival_rate_NA
    X_val['FamilySurvivalRate'] = val_family_survival_rate
    X_val['FamilySurvivalRateNA'] = val_family_survival_rate_NA

    # 训练集和验证集的票号所对应的生存率
    train_ticket_survival_rate = np.empty(X_train.shape[0])
    val_ticket_survival_rate = np.empty(X_val.shape[0])
    # 辅助特征*_ticket_survival_rate_NA,0或1分别代表同时没出现/出现在测试集和验证集中,没出现的赋予平均生存率
    train_ticket_survival_rate_NA = np.empty(X_train.shape[0],dtype='int32')
    val_ticket_survival_rate_NA = np.empty(X_val.shape[0],dtype='int32')

    for i in range(len(X_train)):
        ticket = X_train['Ticket'].iloc[i]
        if ticket in ticket_rates:
            train_ticket_survival_rate[i] = ticket_rates[ticket]
            train_ticket_survival_rate_NA[i] = 1
        else:
            train_ticket_survival_rate[i] = mean_survival_rate
            train_ticket_survival_rate_NA[i] = 0

    for i in range(len(X_val)):
        ticket = X_val['Ticket'].iloc[i]
        if ticket in ticket_rates:
            val_ticket_survival_rate[i] = ticket_rates[ticket]
            val_ticket_survival_rate_NA[i] = 1
        else:
            val_ticket_survival_rate[i] = mean_survival_rate
            val_ticket_survival_rate_NA[i] = 0

    X_train['TicketSurvivalRate'] = train_ticket_survival_rate
    X_train['TicketSurvivalRateNA'] = train_ticket_survival_rate_NA
    X_val['TicketSurvivalRate'] = val_ticket_survival_rate
    X_val['TicketSurvivalRateNA'] = val_ticket_survival_rate_NA

    if shrink:
        for X_sub in [X_train, X_val]:
            X_sub['SurvivalRate'] = (X_sub['TicketSurvivalRate'] + X_sub['FamilySurvivalRate']) / 2
            X_sub['SurvivalRateNA'] = (X_sub['TicketSurvivalRateNA'] + X_sub['FamilySurvivalRateNA']) / 2  
    
    dt[0]=X_train
    dt[1]=X_val
    return dt

## 数据的转换,离散化

In [19]:
def convert_X(dt):
    for X in dt:
        X['AgeBin'] = LabelEncoder().fit_transform(X['AgeBin'])
        X['FareBin'] = LabelEncoder().fit_transform(X['FareBin'])
        X['Deck'] = LabelEncoder().fit_transform(X['Deck'])
        X['Title'] = LabelEncoder().fit_transform(X['Title'])
        X['Sex'] = LabelEncoder().fit_transform(X['Sex'])
        X['Embarked'] = LabelEncoder().fit_transform(X['Embarked'])
        X['FamilySizeGrouped'] = LabelEncoder().fit_transform(X['FamilySizeGrouped'])

In [20]:
def standarized(dt,columns):
    for X in dt:
        for c in columns:
            X_c = X[c]
            X_c = StandardScaler().fit_transform(np.array(X_c).reshape(-1,1))
            X[c] = X_c.ravel()

In [21]:
def get_clean_data():
    dt = load_data() # 综合在一起处理
    # ------------------ 缺失值处理 ------------------
    # 处理年龄缺失值
    completing_age_na(dt)
    # 处理cabin缺失值
    completing_cabin_na(dt,True)
    # 处理embarked缺失值
    completing_embarked_na(dt)
    # 处理fare缺失值
    completing_fare_na(dt)
    # ------------------ 创建新的特征 --------------------
    # 家庭成员多少
    create_familysize(dt,True)
    # 票频率
    create_ticket_freq(dt)
    # 年龄分段
    create_AgeBin(dt)
    # 票价分段
    create_FareBin(dt)
    # 头衔
    create_title(dt,True)
    # 家族名称
    create_familyonboard(dt)
    # 生存率
    dt = create_survival_rate(dt, True)
    # ------------------ 数据离散化 归一化 --------------------
    # 离散化数据
    convert_X(dt)
    
    # 丢弃无用的列
    drop_useless(
        dt 
        ,[
          'PassengerId'
        , 'Cabin'
        , 'Ticket'
        , 'Name'
        , 'Fare'
        #, 'Sex' 
        , 'FamilySize'
        , 'FamilyOnBoard'
        , 'Age'
        #, 'Title'
        #, 'Embarked'
        , 'SibSp'
        , 'Parch'
        , 'SurvivalRate'
        , 'SurvivalRateNA'
        , 'FamilySurvivalRate'
        , 'FamilySurvivalRateNA'
        #, 'TicketSurvivalRate'
        #, 'TicketSurvivalRateNA'
    ])
    
    # 升维
    dummy_columns = [
        'Embarked'
        ,'Sex'
        ,'Title'
        ,'Pclass'
        ,'Deck'
        #,'FamilySize'
        #,'TicketFrequency'
        ,'FamilySizeGrouped'
        #,'AgeBin'
        #,'FareBin'
    ]
    
    
    #standarized(X,columns =['AgeBin','FareBin','TicketFrequency'])
    #X = pd.DataFrame(X)
    X_train,X_val=dt[0],dt[1]
    X_train = pd.get_dummies(X_train,columns=dummy_columns)
    X_val = pd.get_dummies(X_val,columns=dummy_columns)
    y_train = X_train['Survived']
    X_train.drop(columns=['Survived'], axis = 1, inplace = True)
    
    #X_train = StandardScaler().fit_transform(X_train)
    #X_val = StandardScaler().fit_transform(X_val)
    
    
    return X_train , y_train , X_val

In [22]:
X_train ,y_train, X_val = get_clean_data()
X_train.sample(5)

Unnamed: 0,TicketFrequency,AgeBin,FareBin,IsMarried,TicketSurvivalRate,TicketSurvivalRateNA,Embarked_0,Embarked_1,Embarked_2,Sex_0,...,Pclass_2,Pclass_3,Deck_0,Deck_1,Deck_2,Deck_3,FamilySizeGrouped_0,FamilySizeGrouped_1,FamilySizeGrouped_2,FamilySizeGrouped_3
185,1,7,10,0,0.383838,0,0,0,1,0,...,0,0,1,0,0,0,1,0,0,0
828,1,3,1,0,0.383838,0,0,1,0,0,...,0,1,0,0,0,1,1,0,0,0
395,1,2,2,0,0.383838,0,0,0,1,0,...,0,1,0,0,0,1,1,0,0,0
88,4,3,12,0,0.5,1,0,0,1,1,...,0,0,1,0,0,0,0,0,1,0
567,4,5,7,1,0.0,1,0,0,1,1,...,0,1,0,0,0,1,0,0,1,0


In [23]:
X_train.columns

Index(['TicketFrequency', 'AgeBin', 'FareBin', 'IsMarried',
       'TicketSurvivalRate', 'TicketSurvivalRateNA', 'Embarked_0',
       'Embarked_1', 'Embarked_2', 'Sex_0', 'Sex_1', 'Title_0', 'Title_1',
       'Title_2', 'Title_3', 'Title_4', 'Pclass_1', 'Pclass_2', 'Pclass_3',
       'Deck_0', 'Deck_1', 'Deck_2', 'Deck_3', 'FamilySizeGrouped_0',
       'FamilySizeGrouped_1', 'FamilySizeGrouped_2', 'FamilySizeGrouped_3'],
      dtype='object')

In [24]:
X_val.columns

Index(['TicketFrequency', 'AgeBin', 'FareBin', 'IsMarried',
       'TicketSurvivalRate', 'TicketSurvivalRateNA', 'Embarked_0',
       'Embarked_1', 'Embarked_2', 'Sex_0', 'Sex_1', 'Title_0', 'Title_1',
       'Title_2', 'Title_3', 'Title_4', 'Pclass_1', 'Pclass_2', 'Pclass_3',
       'Deck_0', 'Deck_1', 'Deck_2', 'Deck_3', 'FamilySizeGrouped_0',
       'FamilySizeGrouped_1', 'FamilySizeGrouped_2', 'FamilySizeGrouped_3'],
      dtype='object')

In [25]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 27 columns):
TicketFrequency         891 non-null int64
AgeBin                  891 non-null int64
FareBin                 891 non-null int64
IsMarried               891 non-null int64
TicketSurvivalRate      891 non-null float64
TicketSurvivalRateNA    891 non-null int32
Embarked_0              891 non-null uint8
Embarked_1              891 non-null uint8
Embarked_2              891 non-null uint8
Sex_0                   891 non-null uint8
Sex_1                   891 non-null uint8
Title_0                 891 non-null uint8
Title_1                 891 non-null uint8
Title_2                 891 non-null uint8
Title_3                 891 non-null uint8
Title_4                 891 non-null uint8
Pclass_1                891 non-null uint8
Pclass_2                891 non-null uint8
Pclass_3                891 non-null uint8
Deck_0                  891 non-null uint8
Deck_1                  891 non-n

In [26]:
def plot_learning_curve_1(
    estimator
    ,title
    , X
    , y
    ,ax=None #选择子图
    ,ylim=None #设置纵坐标的取值范围
    ,cv=None #交叉验证
    ,n_jobs=-1 #设定索要使用的线程
    ):
    
    train_sizes, train_scores, test_scores = learning_curve(
        estimator
        , X
        , y
        ,shuffle=True
        ,cv=cv
        ,n_jobs=n_jobs
    )
    if ax == None:
        ax = plt.gca()
    else:
        ax = plt.figure()
    ax.set_title(title)
    if ylim is not None:
        ax.set_ylim(*ylim)
    ax.set_xlabel("Training examples")
    ax.set_ylabel("Score")
    ax.grid() #绘制网格，不是必须
    ax.plot(train_sizes, np.mean(train_scores, axis=1), 'o-', color="r",label="Training score")
    ax.plot(train_sizes, np.mean(test_scores, axis=1), 'o-'
    , color="g",label="Test score")
    ax.legend(loc="best")
    print('overfiting : ' ,(np.mean(train_scores, axis=1)[-1] - np.mean(test_scores, axis=1)[-1]))
    print(np.mean(test_scores, axis=1)[-1])
    return ax

def plot_learning_curve(clf,X_train,y_train,cv_s=10):
    cv = KFold(n_splits=cv_s, shuffle = True)
    plot_learning_curve_1(
        clf
        ,'clf'
        ,X_train
        ,y_train
        ,cv=cv
    )
    plt.show()

In [27]:
def plot_xgbcv_learning_curve(params,dtrain,T=100,cv=10,metrics=['error']):
    trees = np.arange(0,T,1)
    cv_res = xgb.cv(params=params,dtrain=dtrain,num_boost_round=T,nfold=10,metrics=metrics)
    train_err = cv_res.iloc[:,0]
    test_err = cv_res.iloc[:,2]
    gap = test_err - train_err
    
    #print('min err train arg: train :',np.argmin(train_err) ,' test : ',np.argmin(test_err))
    #print('min err train train :',np.min(train_err) ,' test : ',np.min(test_err))

    print('min err gap :', np.min(np.abs(train_err - test_err)))
    print('last err gap',test_err.iloc[-1]-train_err.iloc[-1])

    argmin = np.argmin(np.abs(train_err - test_err))
    print('min err gap arg:', argmin)
    print('min err gap arg:', train_err[argmin] , ' ||| ' ,test_err[argmin])
    
    print('last err train : ',train_err.iloc[-1] ,' test : ',test_err.iloc[-1])

    plt.clf()
    plt.figure(figsize=(20,5))
    plt.plot(trees,test_err-train_err,label='err gap')
    plt.plot(trees,train_err,label='train err')
    plt.plot(trees,test_err,label='test err')
    plt.legend()
    plt.show()
    return test_err

In [28]:
'''
params={
    'gamma' : 3
    ,'max_depth':20
    #, 'eta': 0.2
    #, 'min_child_weight': 4
    #, 'alpha':5
    #, 'lambda':3
    #, 'colsample_bylevel':0.75
    #,'objective':'binary:logistic'
}
T=20
dtrain = xgb.DMatrix(X_train,y_train)
plot_xgbcv_learning_curve(params,dtrain,T=T)
model = xgb.train(params,dtrain,num_boost_round=T)
'''
'''
xgb.plot_importance(model)
'''
'''
T=10
dtrain = xgb.DMatrix(X_train,y_train)
dtest = xgb.DMatrix(X_val)
model = xgb.train(params,dtrain,num_boost_round=T)
res = np.zeros(len(X_val),dtype='int64')
res[model.predict(dtest)>=0.5]=1

_,X_res = load_data()
X_res['Survived'] = res
submit = X_res[['PassengerId','Survived']]
submit.to_csv("./result/submit_amy.csv", index=False)

print('Validation Data Distribution: \n', X_res['Survived'].value_counts(normalize = True))
submit.sample(10)
'''

'\nT=10\ndtrain = xgb.DMatrix(X_train,y_train)\ndtest = xgb.DMatrix(X_val)\nmodel = xgb.train(params,dtrain,num_boost_round=T)\nres = np.zeros(len(X_val),dtype=\'int64\')\nres[model.predict(dtest)>=0.5]=1\n\n_,X_res = load_data()\nX_res[\'Survived\'] = res\nsubmit = X_res[[\'PassengerId\',\'Survived\']]\nsubmit.to_csv("./result/submit_amy.csv", index=False)\n\nprint(\'Validation Data Distribution: \n\', X_res[\'Survived\'].value_counts(normalize = True))\nsubmit.sample(10)\n'

In [30]:
param_grid={
    'n_estimators':np.arange(100,1000,100),
    'max_depth':np.arange(1,6,1),
    'min_samples_split':np.linspace(0.01,1.,30),
    'min_samples_leaf':np.arange(1,30,1)
}
gcv = GridSearchCV(RandomForestClassifier(),param_grid=param_grid,verbose=True,cv=5,n_jobs=-1).fit(X_train,y_train)

Fitting 5 folds for each of 39150 candidates, totalling 195750 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    3.8s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   15.4s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:   34.1s
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:   60.0s
[Parallel(n_jobs=-1)]: Done 1234 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 1784 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 2434 tasks      | elapsed:  3.0min
[Parallel(n_jobs=-1)]: Done 3184 tasks      | elapsed:  3.9min
[Parallel(n_jobs=-1)]: Done 4034 tasks      | elapsed:  5.0min
[Parallel(n_jobs=-1)]: Done 4984 tasks      | elapsed:  6.2min
[Parallel(n_jobs=-1)]: Done 6034 tasks      | elapsed:  7.5min
[Parallel(n_jobs=-1)]: Done 7184 tasks      | elapsed:  8.9min
[Parallel(n_jobs=-1)]: Done 8434 tasks      | elapsed: 10.4min
[Parallel(n_jobs=-1)]: Done 9784 tasks      | elapsed: 12.1min
[Parallel(n_jobs=-1)]: Done 11234 tasks      |

KeyboardInterrupt: 

In [None]:
rf = RandomForestClassifier(
    n_estimators=100
    ,max_depth=3
    #,min_samples_split=15
    #,min_samples_leaf=7
    #,random_state=5
)
plot_learning_curve(rf,X_train,y_train)

In [None]:
from sklearn.externals import joblib
joblib.dump(rf, "rf_model.m")
plot_learning_curve(rf,X_train,y_train)

In [None]:
def show_estimator(X_train,y_train):
    train_scores = []
    test_scores=[]
    #for n_estimators in np.arange(50, 200, 50):
    n_estimators=400
    for max_depth in np.arange(1,20,1):
        rf = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth,n_jobs=-1)
        cv_score = cross_validate(rf,X_train,y_train,return_train_score=True)
        test_scores.append(cv_score['test_score'].mean())
        train_scores.append(cv_score['train_score'].mean())
    return train_scores,test_scores

train_scores,test_scores = show_estimator(X_train,y_train)

In [None]:
gaps = np.array(train_scores) - np.array(test_scores)
indexes=range(len(train_scores))
tr=train_scores[:10]
te=test_scores[:10]
ga=gaps[:10]
ind=indexes[:10]
plt.plot(ind,tr)
plt.plot(ind,te)
plt.plot(ind,ga)
plt.show()

In [None]:
rf.fit(X_train,y_train)
res = rf.predict(X_val)
_,X_res=  load_data()
X_res['Survived'] = res
submit = X_res[['PassengerId','Survived']]
submit.to_csv("./result/submit_amy.csv", index=False)

In [None]:
tit = pd.read_csv('./CheatingData/titanic.csv')
tit['survived'].mean()