In [1]:
import ydf
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier

In [2]:
train_data = pd.read_csv("input/train.csv")
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
test_data = pd.read_csv("input/test.csv")
test_data.head(5)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [4]:
# Impute missing ages with median
train_data['Age'].fillna(train_data['Age'].median(), inplace=True)
test_data['Age'].fillna(test_data['Age'].median(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_data['Age'].fillna(train_data['Age'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_data['Age'].fillna(test_data['Age'].median(), inplace=True)


In [25]:
def preprocess(df):
    df = df.copy()
    
    def normalize_name(x):
        return " ".join([v.strip(",()[].\"'") for v in x.split(" ")])
    
    def ticket_number(x):
        return x.split(" ")[-1]
        
    def ticket_item(x):
        items = x.split(" ")
        if len(items) == 1:
            return "NONE"
        return "_".join(items[0:-1])
    
    # Normalize name
    df["Name"] = df["Name"].apply(normalize_name)

    # Parse ticket number and ticket item
    df["Ticket_number"] = df["Ticket"].apply(ticket_number)
    # df["Ticket_item"] = df["Ticket"].apply(ticket_item)

    # Add FamilySize feature
    df['FamilySize'] = df['SibSp'] + df['Parch'] + 1
    
    # Add IsAlone feature
    # df['IsAlone'] = (df['FamilySize'] == 1).astype(int)
    
    # Cabin processing
    # Create a 'CabinType' feature
    df['HasCabin'] = (~df['Cabin'].isna()).astype(int)
    df['CabinLetter'] = df['Cabin'].fillna('U').apply(lambda x: x[0])
    df['CabinType'] = df['Cabin'].apply(lambda x: x[0] if isinstance(x, str) else 'Unknown')
    # fill in NaN values in Cabin
    df['Cabin'] = df['Cabin'].fillna('Unknown')
       
    # Name length
    # df['NameLength'] = df['Name'].apply(len)
    
    # Fare per person
    df['FarePerPerson'] = df['Fare'] / df['FamilySize']
    
    # Is child
    df['IsChild'] = (df['Age'] < 12).astype(int)
    
    # Handle NaN values in Embarked
    df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])

    # Handle NaN values in Fare (which affects FarePerPerson)
    df['Fare'] = df['Fare'].fillna(df['Fare'].median())
    
    # Recalculate FarePerPerson after handling NaN in Fare
    df['FarePerPerson'] = df['Fare'] / df['FamilySize']
    
    # Handle any remaining NaN in FarePerPerson (in case of division by zero)
    df['FarePerPerson'] = df['FarePerPerson'].fillna(df['FarePerPerson'].median())
                         
    return df

preprocessed_train_df = preprocess(train_data)
preprocessed_serving_df = preprocess(test_data)

preprocessed_train_df.head(10)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Ticket_number,FamilySize,HasCabin,CabinLetter,CabinType,FarePerPerson,IsChild
0,1,0,3,Braund Mr Owen Harris,male,22.0,1,0,A/5 21171,7.25,Unknown,S,21171,2,0,U,Unknown,3.625,0
1,2,1,1,Cumings Mrs John Bradley Florence Briggs Thayer,female,38.0,1,0,PC 17599,71.2833,C85,C,17599,2,1,C,C,35.64165,0
2,3,1,3,Heikkinen Miss Laina,female,26.0,0,0,STON/O2. 3101282,7.925,Unknown,S,3101282,1,0,U,Unknown,7.925,0
3,4,1,1,Futrelle Mrs Jacques Heath Lily May Peel,female,35.0,1,0,113803,53.1,C123,S,113803,2,1,C,C,26.55,0
4,5,0,3,Allen Mr William Henry,male,35.0,0,0,373450,8.05,Unknown,S,373450,1,0,U,Unknown,8.05,0
5,6,0,3,Moran Mr James,male,28.0,0,0,330877,8.4583,Unknown,Q,330877,1,0,U,Unknown,8.4583,0
6,7,0,1,McCarthy Mr Timothy J,male,54.0,0,0,17463,51.8625,E46,S,17463,1,1,E,E,51.8625,0
7,8,0,3,Palsson Master Gosta Leonard,male,2.0,3,1,349909,21.075,Unknown,S,349909,5,0,U,Unknown,4.215,1
8,9,1,3,Johnson Mrs Oscar W Elisabeth Vilhelmina Berg,female,27.0,0,2,347742,11.1333,Unknown,S,347742,3,0,U,Unknown,3.7111,0
9,10,1,2,Nasser Mrs Nicholas Adele Achem,female,14.0,1,0,237736,30.0708,Unknown,C,237736,2,0,U,Unknown,15.0354,0


In [26]:
preprocessed_serving_df.head(5)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Ticket_number,FamilySize,HasCabin,CabinLetter,CabinType,FarePerPerson,IsChild
0,892,3,Kelly Mr James,male,34.5,0,0,330911,7.8292,Unknown,Q,330911,1,0,U,Unknown,7.8292,0
1,893,3,Wilkes Mrs James Ellen Needs,female,47.0,1,0,363272,7.0,Unknown,S,363272,2,0,U,Unknown,3.5,0
2,894,2,Myles Mr Thomas Francis,male,62.0,0,0,240276,9.6875,Unknown,Q,240276,1,0,U,Unknown,9.6875,0
3,895,3,Wirz Mr Albert,male,27.0,0,0,315154,8.6625,Unknown,S,315154,1,0,U,Unknown,8.6625,0
4,896,3,Hirvonen Mrs Alexander Helga E Lindqvist,female,22.0,1,1,3101298,12.2875,Unknown,S,3101298,3,0,U,Unknown,4.095833,0


In [27]:
input_features = list(preprocessed_train_df.columns)
input_features.remove("Ticket")
input_features.remove("PassengerId")
input_features.remove("Survived")
input_features.remove("Ticket_number")
# input_features.remove("Embarked")
# input_features.remove("FamilySize")
# input_features.remove("FarePerPerson")

print(f"Input features: {input_features}")


Input features: ['Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Cabin', 'Embarked', 'FamilySize', 'HasCabin', 'CabinLetter', 'CabinType', 'FarePerPerson', 'IsChild']


In [28]:
# sklearn models
# encode features
features = input_features

# One-hot encode categorical variables
X = pd.get_dummies(preprocessed_train_df[features], drop_first=True)
X_test = pd.get_dummies(preprocessed_serving_df[features], drop_first=True)

# Ensure X and X_test have the same columns
X, X_test = X.align(X_test, join='outer', axis=1, fill_value=0)

y = preprocessed_train_df['Survived']

In [29]:
def check_nan(df):
    nan_columns = df.columns[df.isna().any()].tolist()
    if nan_columns:
        print("Columns with NaN values:")
        for col in nan_columns:
            nan_count = df[col].isna().sum()
            print(f"{col}: {nan_count} NaN values")
    else:
        print("No NaN values found in the DataFrame")

# Use this function on your preprocessed DataFrames
print("Training data:")
check_nan(preprocessed_train_df)
print("\nTest data:")
check_nan(preprocessed_serving_df)

Training data:
No NaN values found in the DataFrame

Test data:
No NaN values found in the DataFrame


In [35]:
# hyperparameter tuning with sklearn
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 4, 5],
    'learning_rate': [0.01, 0.1, 0.2]
}

gb = GradientBoostingClassifier(random_state=42)
grid_search = GridSearchCV(gb, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X, y)

best_model = grid_search.best_estimator_
predictions = best_model.predict(X_test)

predictions

array([0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1,
       1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [31]:
# analyze feature importance
importances = best_model.feature_importances_
feature_imp = pd.DataFrame({'feature': X.columns, 'importance': importances})
feature_imp = feature_imp.sort_values('importance', ascending=False)
print(feature_imp)

                                        feature  importance
1516                                   Sex_male    0.353668
1515                                     Pclass    0.082975
0                                           Age    0.081675
206                               FarePerPerson    0.054721
204                                  FamilySize    0.024403
...                                         ...         ...
1481         Name_Williams Mr Richard Norris II    0.000000
1482  Name_Williams-Lambert Mr Fletcher Fellows    0.000000
1483               Name_Wilson Miss Helen Alice    0.000000
1484                     Name_Windelov Mr Einar    0.000000
1517                                      SibSp    0.000000

[1518 rows x 2 columns]


In [72]:
# ydf models
# Train a Gradient Boosted Trees model
model = ydf.GradientBoostedTreesLearner(
    label="Survived", 
    growing_strategy="BEST_FIRST_GLOBAL",
    include_all_columns=False, 
    features=input_features, 
    categorical_algorithm="RANDOM", 
    split_axis="SPARSE_OBLIQUE",
    sparse_oblique_normalization="MIN_MAX",
    sparse_oblique_num_projections_exponent=2.0,
    num_trees=1000, 
    min_examples=1,
    shrinkage=0.05,
    random_seed=12365556,
    # validation_ratio=0.0,
    num_candidate_attributes_ratio=0.2,
    max_depth=6,
    compute_permutation_variable_importance=True).train(preprocessed_train_df)

self_evaluation = model.evaluate(preprocessed_train_df)
print(f"Accuracy: {self_evaluation.accuracy} Loss:{self_evaluation.loss}")

Train model on 891 examples
Model trained in 0:00:00.664761
Accuracy: 0.9438832772166106 Loss:0.19411579040365187


In [73]:
# Train the model
model = ydf.RandomForestLearner(
    label="Survived", 
    features=input_features,
    winner_take_all=True, 
    num_trees=1000,
    categorical_algorithm="RANDOM",
    split_axis="SPARSE_OBLIQUE",
    sparse_oblique_normalization="MIN_MAX",
    sparse_oblique_num_projections_exponent=2.0,
    compute_oob_performances=True,
    compute_oob_variable_importances=True
).train(preprocessed_train_df)

# Evaluate the model
self_evaluation = model.evaluate(preprocessed_train_df)
print(f"Accuracy: {self_evaluation.accuracy} Loss: {self_evaluation.loss}")

Train model on 891 examples
Model trained in 0:00:03.841668
Accuracy: 0.9270482603815937 Loss: 0.1878740243116015


In [37]:
# Look at a model (input features, training logs, structure, etc.)
model.describe()

NameError: name 'model' is not defined

In [75]:
# Generate predictions
predictions = model.predict(preprocessed_serving_df)

predictions

array([0.03999999, 0.22800028, 0.14900012, 0.36599883, 0.35699895,
       0.15400013, 0.26900008, 0.05899996, 0.75399387, 0.10400003,
       0.25000033, 0.14200011, 0.9909908 , 0.04899997, 0.97999096,
       0.9149918 , 0.018     , 0.31599948, 0.2829999 , 0.26200017,
       0.3069996 , 0.8609925 , 0.9629912 , 0.5199969 , 0.85799253,
       0.02      , 0.98999083, 0.2600002 , 0.67899483, 0.17600018,
       0.035     , 0.04999997, 0.4459978 , 0.21000025, 0.44499782,
       0.25600025, 0.36999878, 0.12200007, 0.032     , 0.6879947 ,
       0.089     , 0.76199377, 0.06499995, 0.9919908 , 0.975991  ,
       0.03999999, 0.5359967 , 0.2989997 , 0.9399915 , 0.37799868,
       0.4939972 , 0.09300001, 0.8819922 , 0.88699216, 0.089     ,
       0.011     , 0.004     , 0.018     , 0.028     , 0.9849909 ,
       0.05999995, 0.1880002 , 0.027     , 0.8779923 , 0.37399873,
       0.9049919 , 0.56999624, 0.04999997, 0.31999943, 0.79299337,
       0.70999444, 0.004     , 0.26400015, 0.2839999 , 0.98799

In [38]:
# Create submission file
output = pd.DataFrame({'PassengerId': preprocessed_serving_df.PassengerId, 'Survived': (predictions > 0.5).astype(int)})
output.to_csv('submission.csv', index=False)
print("Submission file created successfully!")

Submission file created successfully!


In [39]:
class_distribution = train_data['Survived'].value_counts(normalize=True)
print("Class distribution:\n", class_distribution)

Class distribution:
 Survived
0    0.616162
1    0.383838
Name: proportion, dtype: float64


In [33]:
from imblearn.over_sampling import RandomOverSampler

ros = RandomOverSampler(random_state=42)
X_resampled, y_resampled = ros.fit_resample(X, y)