### Python reproduction of Chris Deotte's R notebook
- Link to the original notebook: https://www.kaggle.com/code/cdeotte/titanic-using-name-only-0-81818
- Test accuracy: 0.80382

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the labelled training split and key it by passenger id.
train_df = pd.read_csv('train.csv').set_index('PassengerId')
train_df.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
# Load the test split the same way, keyed by passenger id.
test_df = pd.read_csv('test.csv').set_index('PassengerId')
test_df.head()

Unnamed: 0_level_0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [4]:
# Stack both splits into one frame so the name/ticket features are built over
# every passenger. The PassengerId index is kept as-is.
df = pd.concat([train_df, test_df])
df.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1.0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0.0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
def transform_dataset(df):
    """Return an encoded copy of the passenger table; ``df`` itself is not modified.

    Transformations applied to the copy:

    * ``Sex``: ``uint8`` flag, 0 for ``'female'`` and 1 for any other value
      (a missing value therefore also becomes 1).
    * ``Embarked``: dropped and replaced by one ``uint8`` indicator column per
      distinct value; a missing value gives 0 in every indicator column.
    * ``Fare``: rounded to the nearest whole number, stored as nullable ``Float32``.
    * ``Age``: rounded down to whole years, stored as nullable ``Int32``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``Sex``, ``Embarked``, ``Fare`` and ``Age``.

    Returns
    -------
    pandas.DataFrame
        The transformed copy, with the same index as ``df``.
    """
    df_copy = df.copy()

    # Binary-encode 'Sex'
    df_copy['Sex'] = df_copy['Sex'].map(lambda x: 0 if x == 'female' else 1).astype(np.uint8)

    # One-hot encode 'Embarked'. The dtype is pinned because pandas >= 2.0
    # returns bool indicators by default, whereas older versions returned uint8.
    embarked_dummies = pd.get_dummies(df_copy['Embarked'], dtype=np.uint8)
    df_copy = df_copy.drop(columns=['Embarked'])

    for col in embarked_dummies.columns:
        df_copy[col] = embarked_dummies[col]

    # Round 'Fare' and store it as a nullable float (missing fares become <NA>)
    df_copy['Fare'] = (np.round(df_copy['Fare'])).astype('Float32')

    # Floor 'Age' and store it as a nullable integer (missing ages become <NA>)
    df_copy['Age'] = np.floor(df_copy['Age']).astype('Int32')

    return df_copy

In [6]:
# Encode the combined frame; `df` is rebound to the transformed copy.
df = df.pipe(transform_dataset)
df.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,C,Q,S
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1,0.0,3,"Braund, Mr. Owen Harris",1,22,1,0,A/5 21171,7.0,,0,0,1
2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38,1,0,PC 17599,71.0,C85,1,0,0
3,1.0,3,"Heikkinen, Miss. Laina",0,26,0,0,STON/O2. 3101282,8.0,,0,0,1
4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35,1,0,113803,53.0,C123,0,0,1
5,0.0,3,"Allen, Mr. William Henry",1,35,0,0,373450,8.0,,0,0,1


In [7]:
# Split names of the form "Surname, Title. Given names" into surname and title.
#  * The surname is everything before the first comma, so surnames containing a
#    hyphen or apostrophe (e.g. "Nicola-Yarred", "O'Brien") are kept whole
#    instead of being cut down to their last word-only fragment.
#  * The title is the text between that comma and the FIRST period, so a later
#    period in the name (e.g. a middle initial) cannot leak into the title.
# `str.extract` returns exactly one row per passenger and NaN where the pattern
# does not match, which makes the missing-value count below a real check.
name_parts = df['Name'].str.extract(r'^([^,]+),\s*([^.]*)\.', expand=True)
name_parts.columns = ['LastName', 'Title']

# Trim stray spaces around both parts
for col in name_parts.columns:
    name_parts[col] = name_parts[col].str.strip(' ')

# Number of names that could not be parsed (expected: 0 for both columns)
name_parts.isna().sum()

LastName    0
Title       0
dtype: int64

In [8]:
# Attach LastName/Title to each passenger by aligning on the shared index.
df = df.join(name_parts, how='left')
df.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,C,Q,S,LastName,Title
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1,0.0,3,"Braund, Mr. Owen Harris",1,22,1,0,A/5 21171,7.0,,0,0,1,Braund,Mr
2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38,1,0,PC 17599,71.0,C85,1,0,0,Cumings,Mrs
3,1.0,3,"Heikkinen, Miss. Laina",0,26,0,0,STON/O2. 3101282,8.0,,0,0,1,Heikkinen,Miss
4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35,1,0,113803,53.0,C123,0,0,1,Futrelle,Mrs
5,0.0,3,"Allen, Mr. William Henry",1,35,0,0,373450,8.0,,0,0,1,Allen,Mr


In [9]:
# Passengers for whom no title was extracted
df.loc[df['Title'].isna()]

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,C,Q,S,LastName,Title
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1


In [10]:
# Collapse the titles into three groups:
#   Master                  => boy
#   Male other than master  => man
#   Female                  => woman
#
# All masks are computed before anything is overwritten, and 'boy' is accepted
# next to 'Master', so re-running this cell leaves the labels unchanged instead
# of turning every 'boy' into 'man'.
is_female = df['Sex'] == 0
is_male = df['Sex'] == 1
is_boy = df['Title'].isin(['Master', 'boy'])

df.loc[is_male & ~is_boy, 'Title'] = 'man'
df.loc[is_boy & ~is_female, 'Title'] = 'boy'
df.loc[is_female, 'Title'] = 'woman'

df['Title'].value_counts()

man      782
woman    466
boy       61
Name: Title, dtype: int64

In [11]:
# Adult men are not part of any group: replace their surname with the 'None'
# placeholder. Boys keep their surname.
df.loc[df['Title'] == 'man', 'LastName'] = 'None'

# Count how many passengers share each surname. Mapping the counts Series
# directly is vectorised and yields NaN for a missing surname, where a per-row
# `.loc` lookup would raise KeyError.
# Note: rows holding the 'None' placeholder get the size of the placeholder group.
family_sizes = df['LastName'].value_counts()
df['FamilySize'] = df['LastName'].map(family_sizes)

# A surname carried by a single passenger is not a group: blank it out as well
df.loc[df['FamilySize'] <= 1, 'LastName'] = 'None'

In [12]:
# Women and boys that have no group surname before the ticket search
index_before = df['Title'].ne('man') & df['LastName'].eq('None')
index_before.sum()

264

In [13]:
# Search through tickets to see if we find a LastName for the women/children that are missing a lastname
# The rows to visit are selected once, before the loop starts: `df[...]` builds a
# new frame, so writing to `df` inside the loop does not change which rows are visited.
for i, row in df[(df.Title != 'man') & (df.LastName == 'None')].iterrows():
    # Copy the LastName of the first row of `df` that has the same ticket. That
    # row can be the passenger themself, or another row still marked 'None'; in
    # both cases the value written back is 'None' again.
    df.loc[i, 'LastName'] = df[df.Ticket == row.Ticket].iloc[0].LastName

In [14]:
# Same mask as before the ticket search; the difference between the two masks
# is the set of passengers that were given a surname through their ticket.
index_after = df['Title'].ne('man') & df['LastName'].eq('None')
print(index_after.sum())
df.loc[index_before & ~index_after]

254


Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,C,Q,S,LastName,Title,FamilySize
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
582,1.0,1,"Thayer, Mrs. John Borland (Marian Longstreth M...",0,39,1,1,17421,111.0,C68,1,0,0,Fleming,woman,1
610,1.0,1,"Shutes, Miss. Elizabeth W",0,40,0,0,PC 17582,153.0,C125,0,0,1,Graham,woman,1
709,1.0,1,"Cleaver, Miss. Alice",0,22,0,0,113781,152.0,,0,0,1,Allison,woman,1
880,1.0,1,"Potter, Mrs. Thomas Jr (Lily Alexenia Wilson)",0,56,0,1,11767,83.0,C50,1,0,0,Hays,woman,1
951,,1,"Chaudanson, Miss. Victorine",0,36,0,0,PC 17608,262.0,B61,1,0,0,Ryerson,woman,1
1033,,1,"Daniels, Miss. Sarah",0,33,0,0,113781,152.0,,0,0,1,Allison,woman,1
1042,,1,"Earnshaw, Mrs. Boulton (Olive Potter)",0,23,0,1,11767,83.0,C54,1,0,0,Hays,woman,1
1259,,3,"Riihivouri, Miss. Susanna Juhantytar Sanni""""",0,22,0,0,3101295,40.0,,0,0,1,Panula,woman,1
1263,,1,"Wilson, Miss. Helen Alice",0,31,0,0,16966,134.0,E39 E41,1,0,0,Spedden,woman,1
1267,,1,"Bowen, Miss. Grace Scott",0,45,0,0,PC 17608,262.0,,1,0,0,Ryerson,woman,1


In [15]:
# Compute 'woman-child-group' survival rates from the rows with a known outcome
labelled = df[df['Survived'].notna()]
survival_rate_by_family = labelled.groupby('LastName')['Survived'].mean()

# Look the rate up for every passenger; surnames absent from the labelled rows get NaN
df['LastNameSurvival'] = df['LastName'].map(survival_rate_by_family)

In [16]:
# Test passengers in test-set order, re-indexed 0..n-1 so masks line up with result_df
test_data = df.loc[test_df.index].reset_index()

# Baseline: every female survives, everyone else does not
is_female = test_data['Sex'].eq(0)
# Exceptions: a boy whose group all survived lives,
# a woman whose group all died does not
boy_group_lived = test_data['Title'].eq('boy') & test_data['LastNameSurvival'].eq(1)
woman_group_died = test_data['Title'].eq('woman') & test_data['LastNameSurvival'].eq(0)

predicted = (is_female | boy_group_lived) & ~woman_group_died

result_df = pd.DataFrame({
    'PassengerId': test_df.index,
    'Survived': predicted.to_numpy().astype(np.int32),
})

In [17]:
# Write the predictions with a header row and without the positional index
result_df.to_csv('test_wcg.csv', index=False)