In [2]:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline


In [3]:
data_train = pd.read_csv('train.csv')
data_test = pd.read_csv('test.csv')
data_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


   <ol>
<li>看過電影都知道，老弱婦孺先上逃生艇，Age 和 Sex 肯定是重要變數。</li>
<li>把年齡、票價作離散化切檻。</li>
<li>客艙 Cabin 的第一個字母感覺有意義，獨立拉出來。</li>
<li>把姓名的 Mr. Ms. 稱謂分離出來當特徵。</li>
<li>刪掉 Ticket、Name、Embarked 感覺沒有用的欄位。</li>
    </ol>

In [4]:
def simplify_ages(df):
    df.Age = df.Age.fillna(-0.5)
    bins = (-1, 0, 5, 12, 18, 25, 35, 60, 120)
    group_names = ['Unknown', 'Baby', 'Child', 'Teenager', 'Student', 'Young Adult', 'Adult', 'Senior']
    categories = pd.cut(df.Age, bins, labels=group_names)
    df.Age = categories
    return df

def simplify_cabins(df):
    df.Cabin = df.Cabin.fillna('N')
    df.Cabin = df.Cabin.apply(lambda x: x[0])
    return df

def simplify_fares(df):
    df.Fare = df.Fare.fillna(-0.5)
    bins = (-1, 0, 8, 15, 31, 1000)
    group_names = ['Unknown', '1_quartile', '2_quartile', '3_quartile', '4_quartile']
    categories = pd.cut(df.Fare, bins, labels=group_names)
    df.Fare = categories
    return df

def format_name(df):
    df['Lname'] = df.Name.apply(lambda x: x.split(' ')[0])
    df['NamePrefix'] = df.Name.apply(lambda x: x.split(' ')[1])
    return df    
    
def drop_features(df):
    return df.drop(['Ticket', 'Name', 'Embarked'], axis=1)

def transform_features(df):
    df = simplify_ages(df)
    df = simplify_cabins(df)
    df = simplify_fares(df)
    df = format_name(df)
    df = drop_features(df)
    return df

data_train = transform_features(data_train)
data_test = transform_features(data_test)
data_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Lname,NamePrefix
0,1,0,3,male,Student,1,0,1_quartile,N,"Braund,",Mr.
1,2,1,1,female,Adult,1,0,4_quartile,C,"Cumings,",Mrs.
2,3,1,3,female,Young Adult,0,0,1_quartile,N,"Heikkinen,",Miss.
3,4,1,1,female,Young Adult,1,0,4_quartile,C,"Futrelle,",Mrs.
4,5,0,3,male,Young Adult,0,0,2_quartile,N,"Allen,",Mr.


In [4]:
from sklearn.model_selection import train_test_split
train_data, test_data = train_test_split(data_train.drop(['PassengerId'], axis=1), random_state=100, train_size=0.8)

train_data.head()

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
408,0,3,"Birkeland, Mr. Hans Martin Monsen",male,21.0,0,0,312992,7.775,,S
480,0,3,"Goodwin, Master. Harold Victor",male,9.0,5,2,CA 2144,46.9,,S
510,1,3,"Daly, Mr. Eugene Patrick",male,29.0,0,0,382651,7.75,,Q
609,1,1,"Shutes, Miss. Elizabeth W",female,40.0,0,0,PC 17582,153.4625,C125,S
547,1,2,"Padro y Manent, Mr. Julian",male,,0,0,SC/PARIS 2146,13.8625,,C


In [5]:
from pycaret.classification import *

clf1 = setup(data = train_data, 
             target = 'Survived', 
             categorical_features = ['Pclass','Sex','Age','Fare','Cabin','Lname','NamePrefix'])


ValueError: Column type forced  >> Lname << doesn't exist in the dataset.

In [None]:
compare_models(fold = 5)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
et,Extra Trees Classifier,0.8315,0.8842,0.7297,0.8041,0.763,0.6329,0.6365,0.322
rf,Random Forest Classifier,0.8314,0.8736,0.7243,0.8076,0.7621,0.6321,0.6359,0.162
ridge,Ridge Classifier,0.8313,0.0,0.7568,0.7873,0.7698,0.6369,0.6391,0.018
lr,Logistic Regression,0.8253,0.881,0.7405,0.7849,0.7595,0.6227,0.6258,1.104
knn,K Neighbors Classifier,0.8193,0.8497,0.7189,0.7793,0.7464,0.6066,0.6091,0.742
lightgbm,Light Gradient Boosting Machine,0.8133,0.8788,0.7297,0.7697,0.7421,0.5969,0.6042,0.048
svm,SVM - Linear Kernel,0.8132,0.0,0.7351,0.7741,0.7458,0.5991,0.6071,0.03
dt,Decision Tree Classifier,0.8052,0.7875,0.7189,0.7577,0.7339,0.5807,0.5848,0.018
gbc,Gradient Boosting Classifier,0.7931,0.8623,0.6162,0.7857,0.6876,0.5368,0.5479,0.17
ada,Ada Boost Classifier,0.7831,0.8626,0.6216,0.767,0.6818,0.52,0.5307,0.062


ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                     criterion='gini', max_depth=None, max_features='auto',
                     max_leaf_nodes=None, max_samples=None,
                     min_impurity_decrease=0.0, min_impurity_split=None,
                     min_samples_leaf=1, min_samples_split=2,
                     min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
                     oob_score=False, random_state=4723, verbose=0,
                     warm_start=False)

In [12]:
lr = create_model('lr')

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.8,0.842,0.6667,0.75,0.7059,0.5552,0.5574
1,0.86,0.8733,0.8333,0.7895,0.8108,0.6998,0.7005
2,0.78,0.8993,0.5556,0.7692,0.6452,0.4917,0.5054
3,0.88,0.9423,0.7895,0.8824,0.8333,0.74,0.7428
4,0.84,0.8888,0.7368,0.8235,0.7778,0.6534,0.6558
5,0.76,0.8362,0.8421,0.64,0.7273,0.52,0.5357
6,0.84,0.8956,0.7368,0.8235,0.7778,0.6534,0.6558
7,0.92,0.9525,0.8947,0.8947,0.8947,0.8302,0.8302
8,0.7755,0.8244,0.6667,0.7059,0.6857,0.5113,0.5118
9,0.7959,0.7814,0.7778,0.7,0.7368,0.5709,0.573


In [1]:
ridge = create_model('ridge')
lda = create_model('lda')
gbc = create_model('gbc')

stacker = stack_models(estimator_list = [ridge,lda,gbc], meta_model = lr)
stacker

NameError: name 'create_model' is not defined