In [23]:
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score, cross_validate,train_test_split
from sklearn.compose import make_column_transformer
from sklearn.dummy import DummyClassifier
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from xgboost import XGBClassifier


In [24]:
# For comparing our model scores
modelScores = {}
space_df = pd.read_csv('data/train.csv')
space_df.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [25]:
# Split the dataset into training and testing data
train_df, test_df = train_test_split(space_df, test_size=.2, random_state=123)
X_train, y_train = train_df.drop(columns=['Transported']), train_df[['Transported']]
X_test, y_test = test_df.drop(columns=['Transported']), test_df[['Transported']]

In [26]:
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_validate, train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.compose import make_column_transformer
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
%matplotlib inline

In [27]:
spaceTrain = pd.read_csv('data/train.csv')
spaceTest = pd.read_csv('data/test.csv')

In [28]:
# Let's deal with the slashes in the 'cabin' column first:
def cabinUpdate(df):
    # Split Cabin columns string on the slash and rename its columns
    cabinNew = df.Cabin.str.split('/', expand=True)
    cabinNew.columns = ['Cabin-1', 'Cabin-2', 'Cabin-3']
    # Copy the input dataframe to avoid mutating the original
    df1 = df.copy()
    # Add expanded columns to our copied dataframe:
    for col in cabinNew:
        df1[col] = cabinNew[col]
        print(cabinNew[col].value_counts())
    # Reorder the columns in the new dataframe:
    cols = df1.columns.tolist()
    cols = cols[:3] + cols[-3:] + cols[4:-3]
    df1 = df1[cols]
    return df1
# Update our DF's structure based on the Cabin column
spaceTrain = cabinUpdate(spaceTrain).copy()
# Split data into train and test
train_df, test_df = train_test_split(spaceTrain, test_size=.2, random_state=123)
X_train, y_train = train_df.drop(columns=['Transported']), train_df['Transported']
X_test, y_test = test_df.drop(columns=['Transported']), test_df['Transported']
X_train.head()

F    2794
G    2559
E     876
B     779
C     747
D     478
A     256
T       5
Name: Cabin-1, dtype: int64
82      28
86      22
19      22
56      21
176     21
        ..
1644     1
1515     1
1639     1
1277     1
1894     1
Name: Cabin-2, Length: 1817, dtype: int64
S    4288
P    4206
Name: Cabin-3, dtype: int64


Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin-1,Cabin-2,Cabin-3,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name
7074,7527_01,Earth,False,F,1561,P,TRAPPIST-1e,18.0,False,0.0,732.0,2.0,13.0,47.0,Elle Flowensley
6710,7083_01,Europa,True,C,259,S,TRAPPIST-1e,32.0,False,0.0,0.0,0.0,0.0,0.0,Betenar Pirejus
2569,2755_01,Europa,True,B,85,P,55 Cancri e,36.0,False,0.0,0.0,0.0,0.0,0.0,Beneba Glousspidy
3850,4127_01,Europa,,B,130,P,TRAPPIST-1e,23.0,False,0.0,0.0,0.0,0.0,,Dsch Ainserfle
7450,7972_02,Europa,False,B,260,P,TRAPPIST-1e,35.0,False,,246.0,25.0,42.0,3730.0,Atinon Cattyried


In [8]:
# Define our categories of columns:
cat = ['HomePlanet',  'Cabin-1', 'Cabin-3', 'Destination']
num = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'Cabin-2']
binary = ['CryoSleep', 'VIP']
drop = ['Name']
target = 'Transported'
# Make the column transformer:
preprocessor = make_column_transformer(
        # Apply standard scaling to all our numeric features
        (make_pipeline(SimpleImputer(strategy='mean'), StandardScaler()), num),
        # Make a pipeline for our categorical features
        # 1. First fill NaN's with most frequent of each feature
        # 2. Then use one hot encoding on result
        (make_pipeline(SimpleImputer(strategy='most_frequent'), OneHotEncoder(handle_unknown='ignore')), cat),
        # One-hot encoding of our binary column, 'sex,' so we aren't wasting space
        (make_pipeline(SimpleImputer(strategy='most_frequent'), OneHotEncoder(drop="if_binary", dtype=int,handle_unknown='ignore')), binary),
        ('drop', drop)
)

In [9]:
def mean_std_cross_val_scores(model, X_train, y_train, **kwargs):
    """
    Returns mean and std of cross validation

    Parameters
    ----------
    model :
        scikit-learn model
    X_train : numpy array or pandas DataFrame
        X in the training data
    y_train :
        y in the training data

    Returns
    ----------
        pandas Series with mean scores from cross_validation
    """

    scores = cross_validate(model, X_train, y_train, **kwargs)

    mean_scores = pd.DataFrame(scores).mean()
    std_scores = pd.DataFrame(scores).std()
    out_col = []

    for i in range(len(mean_scores)):
        out_col.append((f"%0.3f (+/- %0.3f)" % (mean_scores[i], std_scores[i])))

    return pd.Series(data=out_col, index=mean_scores.index)

In [12]:
# Now fit our pipeline on the entire training set:
X_train, y_train = spaceTrain.drop(columns=['Transported']), spaceTrain[['Transported']]

xgb = XGBClassifier()
from pprint import pprint
# Look at parameters used by our current forest
print('Parameters currently in use:\n')
pprint(xgb.get_params())

Parameters currently in use:

{'base_score': None,
 'booster': None,
 'colsample_bylevel': None,
 'colsample_bynode': None,
 'colsample_bytree': None,
 'enable_categorical': False,
 'gamma': None,
 'gpu_id': None,
 'importance_type': None,
 'interaction_constraints': None,
 'learning_rate': None,
 'max_delta_step': None,
 'max_depth': None,
 'min_child_weight': None,
 'missing': nan,
 'monotone_constraints': None,
 'n_estimators': 100,
 'n_jobs': None,
 'num_parallel_tree': None,
 'objective': 'binary:logistic',
 'predictor': None,
 'random_state': None,
 'reg_alpha': None,
 'reg_lambda': None,
 'scale_pos_weight': None,
 'subsample': None,
 'tree_method': None,
 'use_label_encoder': True,
 'validate_parameters': None,
 'verbosity': None}


In [13]:
random_grid = {"xgbclassifier__learning_rate"    : [0.05, 0.10, 0.15, 0.20, 0.25, 0.30 ] ,
               "xgbclassifier__max_depth"        : [ 3, 4, 5, 6, 8, 10, 12, 15],
               "xgbclassifier__min_child_weight" : [ 1, 3, 5, 7 ],
               "xgbclassifier__gamma"            : [ 0.0, 0.1, 0.2 , 0.3, 0.4 ],
               "xgbclassifier__colsample_bytree" : [ 0.3, 0.4, 0.5 , 0.7 ] }

In [14]:
temp_pipe = make_pipeline(preprocessor,XGBClassifier())
#Pipeline(steps = ['rf',RandomForestRegressor()])
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
temp_random = RandomizedSearchCV(temp_pipe,param_distributions = random_grid, n_jobs = -1)
# Fit the random search model
temp_random.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


RandomizedSearchCV(estimator=Pipeline(steps=[('columntransformer',
                                              ColumnTransformer(transformers=[('pipeline-1',
                                                                               Pipeline(steps=[('simpleimputer',
                                                                                                SimpleImputer()),
                                                                                               ('standardscaler',
                                                                                                StandardScaler())]),
                                                                               ['Age',
                                                                                'RoomService',
                                                                                'FoodCourt',
                                                                                'ShoppingMall',
           

In [15]:
temp_random.best_params_

{'xgbclassifier__min_child_weight': 3,
 'xgbclassifier__max_depth': 3,
 'xgbclassifier__learning_rate': 0.25,
 'xgbclassifier__gamma': 0.4,
 'xgbclassifier__colsample_bytree': 0.5}

In [19]:
xgb_pipe = make_pipeline(preprocessor, XGBClassifier(min_child_weight=3,max_depth=3,learning_rate=.25,gamma=.4,colsample_bytree=.5))

In [21]:
xgb_pipe.fit(X_train,y_train)
mean_std_cross_val_scores(xgb_pipe,X_train,y_train,return_train_score=True)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


fit_time       0.242 (+/- 0.007)
score_time     0.026 (+/- 0.003)
test_score     0.785 (+/- 0.023)
train_score    0.838 (+/- 0.006)
dtype: object

In [29]:
X_train, y_train = spaceTrain.drop(columns=['Transported']), spaceTrain[['Transported']]
xgb_pipe.fit(X_train, y_train)
# Manual transformation
spaceTest = cabinUpdate(spaceTest).copy()
# Make our predictions:
spaceshipPredictions = xgb_pipe.predict(spaceTest)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


F    1445
G    1222
E     447
B     362
C     355
D     242
A      98
T       6
Name: Cabin-1, dtype: int64
4       21
31      18
197     16
294     16
228     14
        ..
1170     1
904      1
1174     1
356      1
1503     1
Name: Cabin-2, Length: 1505, dtype: int64
S    2093
P    2084
Name: Cabin-3, dtype: int64


In [30]:
result = zip(spaceTest['PassengerId'], pd.Series(spaceshipPredictions))
out = pd.DataFrame(result, columns=['PassengerId', 'Transported']).sort_values('PassengerId')
out.to_csv('./results/xgb_hyper.csv', index=False)
out.describe()

Unnamed: 0,PassengerId,Transported
count,4277,4277
unique,4277,2
top,0013_01,True
freq,1,2261
