In [66]:
import ydf
import pandas as pd
import numpy as np


In [67]:
train_data = pd.read_csv("input/train.csv")
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [68]:
test_data = pd.read_csv("input/test.csv")
test_data.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [4]:
from sklearn.ensemble import RandomForestClassifier

y = train_data["Survived"]

features = ["Pclass", "Sex", "SibSp", "Parch"]
X = pd.get_dummies(train_data[features])
X_test = pd.get_dummies(test_data[features])

model = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=1)
model.fit(X, y)
predictions = model.predict(X_test)

output = pd.DataFrame({'PassengerId': test_data.PassengerId, 'Survived': predictions})
output.to_csv('submission.csv', index=False)
print("Your first submission was successfully saved!")

Your first submission was successfully saved!


In [203]:
def preprocess(df):
    df = df.copy()
    
    def normalize_name(x):
        return " ".join([v.strip(",()[].\"'") for v in x.split(" ")])
    
    def ticket_number(x):
        return x.split(" ")[-1]
        
    def ticket_item(x):
        items = x.split(" ")
        if len(items) == 1:
            return "NONE"
        return "_".join(items[0:-1])
    
    # Normalize name
    df["Name"] = df["Name"].apply(normalize_name)

    # Parse ticket number and ticket item
    df["Ticket_number"] = df["Ticket"].apply(ticket_number)
    df["Ticket_item"] = df["Ticket"].apply(ticket_item)

     # Add FamilySize feature
    df['FamilySize'] = df['SibSp'] + df['Parch'] + 1
    
    # Add IsAlone feature
    df['IsAlone'] = (df['FamilySize'] == 1).astype(int)
  
    # Fare binning
    df['FareBin'] = pd.qcut(df['Fare'], q=4, labels=['Low', 'Medium-Low', 'Medium-High', 'High'])
    
    # Cabin processing
    df['HasCabin'] = (~df['Cabin'].isna()).astype(int)
    df['CabinDeck'] = df['Cabin'].fillna('Unknown').str[0]
    
    # Name length
    df['NameLength'] = df['Name'].apply(len)
    
    # Fare per person
    df['FarePerPerson'] = df['Fare'] / df['FamilySize']
    
    # Is child
    df['IsChild'] = (df['Age'] < 12).astype(int)
                         
    return df

preprocessed_train_df = preprocess(train_data)
preprocessed_serving_df = preprocess(test_data)

preprocessed_train_df.head(5)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,...,Ticket_number,Ticket_item,FamilySize,IsAlone,FareBin,HasCabin,CabinDeck,NameLength,FarePerPerson,IsChild
0,1,0,3,Braund Mr Owen Harris,male,22.0,1,0,A/5 21171,7.25,...,21171,A/5,2,0,Low,0,U,21,3.625,0
1,2,1,1,Cumings Mrs John Bradley Florence Briggs Thayer,female,38.0,1,0,PC 17599,71.2833,...,17599,PC,2,0,High,1,C,47,35.64165,0
2,3,1,3,Heikkinen Miss Laina,female,26.0,0,0,STON/O2. 3101282,7.925,...,3101282,STON/O2.,1,1,Medium-Low,0,U,20,7.925,0
3,4,1,1,Futrelle Mrs Jacques Heath Lily May Peel,female,35.0,1,0,113803,53.1,...,113803,NONE,2,0,High,1,C,40,26.55,0
4,5,0,3,Allen Mr William Henry,male,35.0,0,0,373450,8.05,...,373450,NONE,1,1,Medium-Low,0,U,22,8.05,0


In [204]:
preprocessed_serving_df.head(5)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,...,Ticket_number,Ticket_item,FamilySize,IsAlone,FareBin,HasCabin,CabinDeck,NameLength,FarePerPerson,IsChild
0,892,3,Kelly Mr James,male,34.5,0,0,330911,7.8292,,...,330911,NONE,1,1,Low,0,U,14,7.8292,0
1,893,3,Wilkes Mrs James Ellen Needs,female,47.0,1,0,363272,7.0,,...,363272,NONE,2,0,Low,0,U,28,3.5,0
2,894,2,Myles Mr Thomas Francis,male,62.0,0,0,240276,9.6875,,...,240276,NONE,1,1,Medium-Low,0,U,23,9.6875,0
3,895,3,Wirz Mr Albert,male,27.0,0,0,315154,8.6625,,...,315154,NONE,1,1,Medium-Low,0,U,14,8.6625,0
4,896,3,Hirvonen Mrs Alexander Helga E Lindqvist,female,22.0,1,1,3101298,12.2875,,...,3101298,NONE,3,0,Medium-Low,0,U,40,4.095833,0


In [205]:
input_features = list(preprocessed_train_df.columns)
input_features.remove("Ticket")
input_features.remove("PassengerId")
input_features.remove("Survived")
input_features.remove("Ticket_number")

print(f"Input features: {input_features}")


Input features: ['Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Cabin', 'Embarked', 'Ticket_item', 'FamilySize', 'IsAlone', 'FareBin', 'HasCabin', 'CabinDeck', 'NameLength', 'FarePerPerson', 'IsChild']


In [230]:
# Train a Gradient Boosted Trees model
model = ydf.GradientBoostedTreesLearner(
    label="Survived", 
    growing_strategy="BEST_FIRST_GLOBAL",
    include_all_columns=False, 
    features=input_features, 
    categorical_algorithm="RANDOM", 
    split_axis="SPARSE_OBLIQUE",
    sparse_oblique_normalization="MIN_MAX",
    sparse_oblique_num_projections_exponent=2.0,
    num_trees=1000, 
    min_examples=1,
    shrinkage=0.05,
    random_seed=12365556,
    # validation_ratio=0.0,
    num_candidate_attributes_ratio=0.2,
    max_depth=6,
    compute_permutation_variable_importance=True).train(preprocessed_train_df)

self_evaluation = model.evaluate(preprocessed_train_df)
print(f"Accuracy: {self_evaluation.accuracy} Loss:{self_evaluation.loss}")

Train model on 891 examples
Model trained in 0:00:00.412495
Accuracy: 0.9517396184062851 Loss:0.2042815154658758


In [232]:
# Train the model
model = ydf.RandomForestLearner(
    label="Survived", 
    features=input_features,
    winner_take_all=True, 
    num_trees=1000,
    categorical_algorithm="RANDOM",
    split_axis="SPARSE_OBLIQUE",
    sparse_oblique_normalization="MIN_MAX",
    sparse_oblique_num_projections_exponent=2.0,
    compute_oob_performances=True,
    compute_oob_variable_importances=True
).train(preprocessed_train_df)

# Evaluate the model
self_evaluation = model.evaluate(preprocessed_train_df)
print(f"Accuracy: {self_evaluation.accuracy} Loss: {self_evaluation.loss}")

Train model on 891 examples
Model trained in 0:00:01.126532
Accuracy: 0.9517396184062851 Loss: 0.16783222223905894


In [233]:
# Look at a model (input features, training logs, structure, etc.)
model.describe()

In [234]:
# Generate predictions
predictions = model.predict(preprocessed_serving_df)

predictions

array([0.033     , 0.28799984, 0.07799998, 0.14400011, 0.5439966 ,
       0.17500018, 0.52099687, 0.07099997, 0.5659963 , 0.09400001,
       0.015     , 0.09100001, 0.94999135, 0.05199997, 0.99699074,
       0.84099275, 0.06499995, 0.03999999, 0.24000031, 0.31799945,
       0.12800008, 0.7829935 , 0.8909921 , 0.50799704, 0.8519926 ,
       0.08499999, 0.978991  , 0.022     , 0.4859973 , 0.27999994,
       0.032     , 0.04199998, 0.45599768, 0.30199966, 0.49799713,
       0.40999827, 0.18000019, 0.2839999 , 0.04499998, 0.6579951 ,
       0.11900006, 0.60699576, 0.17300017, 0.68699473, 0.9929908 ,
       0.1410001 , 0.37499872, 0.1840002 , 0.9539913 , 0.49899712,
       0.52099687, 0.07299997, 0.8349928 , 0.87299234, 0.21700026,
       0.01      , 0.016     , 0.24000031, 0.019     , 0.977991  ,
       0.005     , 0.12000006, 0.10200003, 0.7149944 , 0.44399783,
       0.80899316, 0.71699435, 0.27300003, 0.4999971 , 0.7839935 ,
       0.820993  , 0.019     , 0.4239981 , 0.5719962 , 0.94999

In [235]:
# Create submission file
output = pd.DataFrame({'PassengerId': preprocessed_serving_df.PassengerId, 'Survived': (predictions > 0.5).astype(int)})
output.to_csv('submission.csv', index=False)
print("Submission file created successfully!")

Submission file created successfully!
