In [None]:
# Ignore useless warnings
import warnings
warnings.filterwarnings(action="ignore")

# Essentials
import numpy as np
import pandas as pd

# Plots
import seaborn as sns
import matplotlib.pyplot as plt

# Models
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from keras.models import Sequential
from keras.layers.core import Dense, Activation, Dropout
from keras.wrappers.scikit_learn import KerasClassifier
from pystacknet.pystacknet import StackNetClassifier

# Stats
from scipy.stats import skew, norm
from scipy.special import boxcox1p
from scipy.stats import boxcox_normmax

# Misc
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import scale
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.decomposition import PCA

# missing data imputation
from feature_engine import missing_data_imputers as mdi
from feature_engine import categorical_encoders as ce

# Numerical Variable Transformation
from feature_engine import variable_transformers as vt

In [2]:
# Read in the dataset as a dataframe
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
train.shape, test.shape

((891, 12), (418, 11))

In [3]:
# Split features and labels
train_labels = train['Survived'].reset_index(drop=True)
train_features = train.drop(['Survived'], axis=1)
test_features = test

# Combine train and test features in order to apply the feature transformation pipeline to the entire dataset
all_features = pd.concat([train_features, test_features]).reset_index(drop=True)
all_features.shape

(1309, 11)

In [20]:
all_features.head()

Unnamed: 0,Pclass,Name,Sex,Age,Ticket,Fare,Cabin,Embarked,fam_size
0,3,"Braund, Mr. Owen Harris",male,22.0,A/5 21171,7.25,missing,S,2
1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,PC 17599,71.2833,C,C,2
2,3,"Heikkinen, Miss. Laina",female,26.0,STON/O2. 3101282,7.925,missing,S,1
3,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,113803,53.1,C,S,2
4,3,"Allen, Mr. William Henry",male,35.0,373450,8.05,missing,S,1


In [6]:
all_features.drop('PassengerId', axis=1, inplace=True)

In [65]:
all_features.isnull().sum()

Pclass      0
Sex         0
Age         0
Fare        0
Cabin       0
Embarked    2
fam_size    0
Age_na      0
Title       0
dtype: int64

In [13]:
all_features['Cabin'] = pd.Series([i[0] if not pd.isnull(i) else "missing" for i in all_features['Cabin']])

In [16]:
len(all_features['Cabin'].unique())

9

In [17]:
all_features['fam_size'] = all_features['SibSp'] + all_features['Parch'] + 1

In [19]:
all_features.drop(['SibSp', 'Parch'], axis=1, inplace=True)

In [29]:
imputer = mdi.AddNaNBinaryImputer(variables = ['Age'])

In [30]:
imputer.fit(all_features)

AddNaNBinaryImputer(variables=['Age'])

In [31]:
all_features = imputer.transform(all_features)

In [32]:
imputer = mdi.MeanMedianImputer(imputation_method='mean', variables=['Age', 'Fare'])

In [33]:
imputer.fit(all_features)

MeanMedianImputer(imputation_method='mean', variables=['Age', 'Fare'])

In [34]:
all_features = imputer.transform(all_features)

In [80]:
all_features.head()

Unnamed: 0,Pclass,Sex,Age,Fare,Cabin,Embarked,fam_size,Age_na,Title
0,3,male,22.0,1.825248,missing,S,2,0,Mr
1,1,female,38.0,3.213829,C,C,2,0,Mrs
2,3,female,26.0,1.88333,missing,S,1,0,Miss
3,1,female,35.0,3.052083,C,S,2,0,Mrs
4,3,male,35.0,1.893535,missing,S,1,0,Mr


In [98]:
len(all_features['fam_size'].unique())

4

In [38]:
tmp = all_features.copy(deep=True)

In [39]:
all_features.drop('Ticket', axis=1, inplace=True)

In [43]:
all_features['Title'] = [i.split(',')[1].split('.')[0] for i in all_features['Name']]

In [55]:
all_features['Title'].value_counts()/len(all_features)

 Mr        0.578304
 Miss      0.198625
 Mrs       0.150497
 Master    0.046600
Rare       0.025974
Name: Title, dtype: float64

In [51]:
# rare categories encoder
encoder = ce.RareLabelCategoricalEncoder(tol=0.03, n_categories=5, # minumum number of categories the variable has
                                         variables=['Title'])

In [52]:
encoder.fit(all_features)

RareLabelCategoricalEncoder(n_categories=5, tol=0.03, variables=['Title'])

In [53]:
all_features = encoder.transform(all_features)

In [56]:
all_features.drop('Name', axis=1, inplace=True)

In [70]:
all_features['Embarked'] = all_features['Embarked'].fillna('S')

In [71]:
all_features.isnull().sum()

Pclass      0
Sex         0
Age         0
Fare        0
Cabin       0
Embarked    0
fam_size    0
Age_na      0
Title       0
dtype: int64

In [77]:
tf = vt.YeoJohnsonTransformer(variables = ['Fare'])

In [78]:
tf.fit(all_features)

YeoJohnsonTransformer(variables=['Fare'])

In [79]:
all_features = tf.transform(all_features)

In [119]:
# Recreate training and test sets
X = all_features.iloc[:len(train_labels), :]

X_test = all_features.iloc[len(train_labels):, :]

X.shape, train_labels.shape, X_test.shape

((891, 9), (891,), (418, 9))

In [127]:
encoder = ce.MeanCategoricalEncoder(variables=['Embarked'])

In [97]:
all_features['fam_size'] = all_features['fam_size'].map(lambda x: 0 if x==1 else x)
all_features['fam_size'] = all_features['fam_size'].map(lambda x: 1 if x==2 else x)
all_features['fam_size'] = all_features['fam_size'].map(lambda x: 2 if x==3 or x ==4 else x)
all_features['fam_size'] = all_features['fam_size'].map(lambda x: 3 if x>4 else x)

In [112]:
all_features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    1309 non-null   int64  
 1   Sex       1309 non-null   object 
 2   Age       1309 non-null   float64
 3   Fare      1309 non-null   float64
 4   Cabin     1309 non-null   object 
 5   Embarked  1309 non-null   object 
 6   fam_size  1309 non-null   int64  
 7   Age_na    1309 non-null   int64  
 8   Title     1309 non-null   object 
dtypes: float64(2), int64(3), object(4)
memory usage: 92.2+ KB


In [113]:
for i in ['Pclass', 'fam_size', 'Age_na']:
    all_features[i] = all_features[i].astype(str)


In [128]:
encoder.fit(X, train_labels)

MeanCategoricalEncoder(variables=['Embarked'])

In [129]:
X = encoder.transform(X)
X_test = encoder.transform(X_test)

In [130]:
X.head()

Unnamed: 0,Pclass,Sex,Age,Fare,Cabin,Embarked,fam_size,Age_na,Title
0,0.242363,0.188908,22.0,1.825248,0.299854,0.339009,0.552795,0,0.156673
1,0.62963,0.742038,38.0,3.213829,0.59322,0.553571,0.552795,0,0.792
2,0.242363,0.742038,26.0,1.88333,0.299854,0.339009,0.303538,0,0.697802
3,0.62963,0.742038,35.0,3.052083,0.59322,0.339009,0.552795,0,0.792
4,0.242363,0.188908,35.0,1.893535,0.299854,0.339009,0.303538,0,0.156673


In [132]:
# Models
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import RidgeCV, Ridge
from sklearn.svm import SVR
from mlxtend.regressor import StackingCVRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from keras.models import Sequential
from keras.layers.core import Dense, Activation, Dropout
from keras.wrappers.scikit_learn import KerasRegressor
from pystacknet.pystacknet import StackNetRegressor

In [135]:
# Light Gradient Boosting Regressor
lightgbm = LGBMRegressor(objective='regression', 
                       num_leaves=6,
                       learning_rate=0.01, 
                       n_estimators=7000,
                       max_bin=200, 
                       bagging_fraction=0.8,
                       bagging_freq=4, 
                       bagging_seed=8,
                       feature_fraction=0.2,
                       feature_fraction_seed=8,
                       min_sum_hessian_in_leaf = 11,
                       verbose=-1,
                       random_state=42)

# XGBoost Regressor
xgboost = XGBRegressor(learning_rate=0.01,
                       n_estimators=6000,
                       max_depth=4,
                       min_child_weight=0,
                       gamma=0.6,
                       subsample=0.7,
                       colsample_bytree=0.7,
                       objective='reg:linear',
                       nthread=-1,
                       scale_pos_weight=1,
                       seed=27,
                       reg_alpha=0.00006,
                       random_state=42)

# Ridge Regressor
ridge_alphas = [1e-15, 1e-10, 1e-8, 9e-4, 7e-4, 5e-4, 3e-4, 1e-4, 1e-3, 5e-2, 1e-2, 0.1, 0.3, 1, 3, 5, 10, 15, 18, 20, 30, 50, 75, 100]
ridge = make_pipeline(RobustScaler(), RidgeCV(alphas=ridge_alphas, cv=kf))

# Support Vector Regressor
svr = make_pipeline(RobustScaler(), SVR(C= 20, epsilon= 0.008, gamma=0.0003))

# Gradient Boosting Regressor
gbr = GradientBoostingRegressor(n_estimators=6000,
                                learning_rate=0.01,
                                max_depth=4,
                                max_features='sqrt',
                                min_samples_leaf=15,
                                min_samples_split=10,
                                loss='huber',
                                random_state=42)  

# Random Forest Regressor
rf = RandomForestRegressor(n_estimators=1200,
                          max_depth=15,
                          min_samples_split=5,
                          min_samples_leaf=5,
                          max_features=None,
                          oob_score=True,
                          random_state=42)

# Stack up all the models above, optimized using xgboost
stack_gen = StackingCVRegressor(regressors=(xgboost, lightgbm, svr, ridge, gbr, rf),
                                meta_regressor=xgboost,
                                use_features_in_secondary=True)

In [134]:
# Setup cross validation folds
kf = KFold(n_splits=8, random_state=42, shuffle=True)

In [1]:
FOLDS=4
METRIC="accuracy"
RESTACKING=False
RETRAIN=True
SEED=12345
VERBOSE=1

models=[
    #1ST level #
    
    [lightgbm, xgboost, rf, gbr],
    
        #2ND level # 

        [ridge],    

]

#MODEL STATEMENT
model=StackNetClassifier(models, metric=METRIC, folds=FOLDS,
    restacking=RESTACKING, use_retraining=RETRAIN, 
    random_state=SEED, verbose=VERBOSE, n_jobs=-1, use_proba=True)

NameError: name 'lightgbm' is not defined

In [137]:
model.fit(X.values, train_labels)

Input Dimensionality 9 at Level 0 
5 models included in Level 0 
Level 0, fold 1/4 , model 0 , accuracy===0.820628 
Level 0, fold 1/4 , model 1 , accuracy===0.802691 
Level 0, fold 1/4 , model 2 , accuracy===0.802691 
Level 0, fold 1/4 , model 3 , accuracy===0.798206 
Level 0, fold 1/4 , model 4 , accuracy===0.807175 
Level 0, fold 2/4 , model 0 , accuracy===0.843049 
Level 0, fold 2/4 , model 1 , accuracy===0.838565 
Level 0, fold 2/4 , model 2 , accuracy===0.834081 
Level 0, fold 2/4 , model 3 , accuracy===0.847534 
Level 0, fold 2/4 , model 4 , accuracy===0.838565 
Level 0, fold 3/4 , model 0 , accuracy===0.852018 
Level 0, fold 3/4 , model 1 , accuracy===0.874439 
Level 0, fold 3/4 , model 2 , accuracy===0.847534 
Level 0, fold 3/4 , model 3 , accuracy===0.856502 
Level 0, fold 3/4 , model 4 , accuracy===0.874439 
Level 0, fold 4/4 , model 0 , accuracy===0.851351 
Level 0, fold 4/4 , model 1 , accuracy===0.837838 
Level 0, fold 4/4 , model 2 , accuracy===0.837838 
Level 0, fold 4/4

In [138]:
preds = np.rint(model.predict_proba(X_test.values)[:, 1])

1 estimators included in Level 0 
1 estimators included in Level 1 


In [139]:
preds = [int(i) for i in preds]

In [140]:
# Read in sample_submission dataframe
submission = pd.read_csv("gender_submission.csv")
submission.shape

(418, 2)

In [141]:
# Append predictions from blended models
submission.iloc[:,1] = preds

In [142]:
submission.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1


In [144]:
submission.set_index('PassengerId').to_csv('submission.csv')