In [1]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import FunctionTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils import resample

### Step 1 - Reading the file train.csv into Python
#### (We skip test/train split this time, since kaggle provides us with test data)

In [2]:
df = pd.read_csv('~\\SPICED\\train.csv')
df.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


In [4]:
X = df.drop('Survived', axis=1)
y = df['Survived']

### Step 2 - Feature Engineering

In [5]:
# creating total relatives feature
def total_relatives(data):
    new_data = data['SibSp'] + data['Parch']
    return pd.DataFrame(new_data)

In [6]:
# creating a joint function for "Cabin" - getting a Deck Information
def get_initial(data):
    return pd.DataFrame(data['Cabin'].str[0])

encode_deck = make_pipeline(
     FunctionTransformer(get_initial),
     SimpleImputer(strategy='constant', fill_value='U'), # fills missing velue with 'Unknown'
     OrdinalEncoder()
     )

In [7]:
# creating a joint function for "Embarked"
impute_then_encode = make_pipeline(
    # SimpleImputer(strategy='constant', fill_value='Unknown'), - fills missing velue with 'Unknown'
    SimpleImputer(strategy='most_frequent'),
    OrdinalEncoder()
)

In [8]:
# Using Column Transformer

fe = ColumnTransformer([
    
    # let's pass the "SibSp", "Parch" and "Pclass" through for the moment
    ('do_nothing', 'passthrough', ['Pclass', 'SibSp', 'Parch']), 
    
    # let's fill the age gaps
    ('fill_gaps', SimpleImputer(strategy='median'), ['Age', 'Fare']),
    
    # let's encode the 'Sex'
    ('encode_sex', OrdinalEncoder(), ['Sex']),
    
    # summing relatived seems to be a stupid idea
    ('total_relatives', FunctionTransformer(total_relatives), ['SibSp', 'Parch']),
    
    # let's deal with the cabin ATTENTION! - this probably overfits the model
    #('get deck and encode', encode_deck, ['Cabin']),
    
    # fills the gaps in "Embarked" with most frequent, then convert it to numbers
    ('impute and encode', impute_then_encode, ['Embarked']),
])

In [9]:
# training and applying our Column Transformer
fe.fit(X)
X_trans = fe.transform(X)

In [11]:
X_trans[:3]

array([[ 3.    ,  1.    ,  0.    , 22.    ,  7.25  ,  1.    ,  1.    ,
         2.    ],
       [ 1.    ,  1.    ,  0.    , 38.    , 71.2833,  0.    ,  1.    ,
         0.    ],
       [ 3.    ,  0.    ,  0.    , 26.    ,  7.925 ,  0.    ,  0.    ,
         2.    ]])

### Step 3 - Create and Fit our Random Forest Model

In [12]:
mrf = RandomForestClassifier(max_depth=17, n_estimators=100, random_state=42)
mrf.fit(X_trans, y)

RandomForestClassifier(max_depth=17, random_state=42)

### Step 4 - Model Validation: Bootstrap

In [13]:
round(mrf.score(X_trans, y), 3)  # --> train accuracy

0.98

In [196]:
boots = []
for i in range(1000):
    # Resample the original data to create a "new" dataset
    Xb, yb = resample(X_trans, y)
    
    b = round(0.8 * (Xb.shape[0]))
    
    # Split the data into training and validation set
    Xb_train = Xb[:b]
    yb_train = yb[:b]
    Xb_validation = Xb[b:]
    yb_validation = yb[b:]

    # Fit the model and calculate the validation score
    mrf.fit(Xb_train, yb_train)
    score = mrf.score(Xb_validation, yb_validation)
    boots.append(score)
    #print(i, score)

In [14]:
boots_df = pd.DataFrame(boots)
boots_df.hist()

NameError: name 'boots' is not defined

In [198]:
boots_df.describe()

Unnamed: 0,0
count,1000.0
mean,0.900326
std,0.023079
min,0.820225
25%,0.88764
50%,0.898876
75%,0.91573
max,0.97191


### Step 5 - Evaluating the Model, Getting result (csv) file

In [15]:
# loading the test data
test = pd.read_csv('~\\SPICED\\test.csv')

# transforming the test data
test_trans = fe.transform(test)

In [16]:
test.count()

PassengerId    418
Pclass         418
Name           418
Sex            418
Age            332
SibSp          418
Parch          418
Ticket         418
Fare           417
Cabin           91
Embarked       418
dtype: int64

In [202]:
pd.DataFrame({'Survived':mrf.predict(test_trans)})

Unnamed: 0,Survived
0,0
1,0
2,0
3,1
4,0
...,...
413,0
414,1
415,0
416,0


In [204]:
df_res = pd.DataFrame()
df_res['PassengerId'] = test['PassengerId']
df_res['Survived'] = pd.DataFrame({'Survived':mrf.predict(test_trans)})['Survived']
#df_res.set_index('PassengerId', inplace=True)

In [205]:
df_res

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,1
4,896,0
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [206]:
df_res.to_csv('results_random_forest.csv', index=False)

Kaggle Results for this model: 0.74401, which is not really good.