# Reading data

In [470]:
import pandas as pd


df_train = pd.read_csv("./titanic/train.csv") #, index_col='PassengerId')
df_test = pd.read_csv("./titanic/test.csv")#, index_col='PassengerId')

target = 'Survived'

test_ids = df_test["PassengerId"]


df_train.head()
#df_test.head()


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


# Review the data

In [471]:
df_train.info()
# df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


# Select features

In [472]:
del_features = ["Name", "Ticket", "Cabin","PassengerId"]
df_train = df_train.drop(del_features, axis=1)
df_test = df_test.drop(del_features, axis=1)

df_train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


# Remove instances with missing target

In [473]:
if df_train[target].isnull().values.any():
  print(f"\n {df_train[target].isnull().sum()} instances are missing the target value in train data. Dropping the instances." )
  df_train.dropna(axis=0, subset=[target], how='any', inplace=True)
else:
  print("\n There is no missing target in the train data.")



 There is no missing target in the train data.


# Select numerical and object features

In [474]:
objectFeatures = df_train.select_dtypes(include=['object']).columns.to_list()# train object features 

if target in objectFeatures:  # drop the target columns
      objectFeatures.remove(target)
print("Here are the object features: \n", objectFeatures)
    
numericFeatures = df_train.select_dtypes(exclude=['object']).columns.to_list()
if target in numericFeatures:  # drop the target columns
      numericFeatures.remove(target)
print("\n Here are the numerical features: \n", numericFeatures)

Here are the object features: 
 ['Sex', 'Embarked']

 Here are the numerical features: 
 ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']


# Find out missing values

In [475]:
missing_val_count_by_column = (df_train.isnull().sum())
print("\n train data missing values: ")
print(missing_val_count_by_column[missing_val_count_by_column > 0].sort_values(ascending=False))

missing_val_count_by_column = (df_test.isnull().sum())
print("\n test data missing values: ")
print(missing_val_count_by_column[missing_val_count_by_column > 0].sort_values(ascending=False))


 train data missing values: 
Age         177
Embarked      2
dtype: int64

 test data missing values: 
Age     86
Fare     1
dtype: int64


# Select features with missing values in train dataset

In [476]:
objectFeatures_with_missing_data_train = [col for col in df_train[objectFeatures].columns if df_train[col].isnull().any()] # categorrical features with missing data
if target in objectFeatures_with_missing_data_train:  # target should not be here, but let's check it out. 
  objectFeatures_with_missing_data_train.remove(target)

numericFeatures_with_missing_data_train = [col for col in df_train[numericFeatures].columns if df_train[col].isnull().any()] # numerical features with missing data
if target in numericFeatures_with_missing_data_train:  # target should not be here, but let's check it out. 
  numericFeatures_with_missing_data_train.remove(target)

Features_with_missing_data_train = objectFeatures_with_missing_data_train + numericFeatures_with_missing_data_train

print("\n object features with missing data in train: \n", objectFeatures_with_missing_data_train)
print("\n numerical features with missing data in train: \n", numericFeatures_with_missing_data_train)
print("\n Features with missing data (total {}) in train: \n".format(len(Features_with_missing_data_train)), Features_with_missing_data_train)



 object features with missing data in train: 
 ['Embarked']

 numerical features with missing data in train: 
 ['Age']

 Features with missing data (total 2) in train: 
 ['Embarked', 'Age']


# Select features with missing values in test dataset

In [477]:
objectFeatures_with_missing_data_test = [col for col in df_test[objectFeatures].columns if df_test[col].isnull().any()] # categorrical features with missing data in test
if target in objectFeatures_with_missing_data_test:  # target should not be here, but let's check it out. 
  objectFeatures_with_missing_data_test.remove(target)

numericFeatures_with_missing_data_test = [col for col in df_test[numericFeatures].columns if df_test[col].isnull().any()] # numerical features with missing data in test
if target in numericFeatures_with_missing_data_test:  # target should not be here, but let's check it out. 
  numericFeatures_with_missing_data_test.remove(target)

Features_with_missing_data_test = objectFeatures_with_missing_data_test + numericFeatures_with_missing_data_test

print("\n object features with missing data in test: \n", objectFeatures_with_missing_data_test)
print("\n numerical features with missing data in test: \n", numericFeatures_with_missing_data_test)
print("\n Features with missing data (total {}) in test: \n".format(len(Features_with_missing_data_train)), Features_with_missing_data_test)



 object features with missing data in test: 
 []

 numerical features with missing data in test: 
 ['Age', 'Fare']

 Features with missing data (total 2) in test: 
 ['Age', 'Fare']


# Handle missing values

In [478]:
objectFeatures_with_missing_data = list(set(objectFeatures_with_missing_data_test) | set(objectFeatures_with_missing_data_train)) 
numericFeatures_with_missing_data = list(set(numericFeatures_with_missing_data_test) | set(numericFeatures_with_missing_data_train))
Features_with_missing_data = list(set(Features_with_missing_data_test) | set(Features_with_missing_data_train)) 

print("objectFeatures_with_missing_data: ", objectFeatures_with_missing_data)
print("numericFeatures_with_missing_data: ", numericFeatures_with_missing_data)
print("Features_with_missing_data:",Features_with_missing_data)

missing_val_count_by_column = (df_train.isnull().sum())
print("\n train data missing values: ")
print(missing_val_count_by_column[missing_val_count_by_column > 0].sort_values(ascending=False))


missing_val_count_by_column = (df_test.isnull().sum())
print("\n test data missing values: ")
print(missing_val_count_by_column[missing_val_count_by_column > 0].sort_values(ascending=False))


# add missing values feature to datasets
for col in Features_with_missing_data:
  df_train[col + '_was_missing'] = df_train[col].isnull().astype(int)
  df_test[col + '_was_missing'] = df_test[col].isnull().astype(int)

df_train['Embarked'].fillna('U',inplace=True)


print("train\n",df_train.head())
print("train\n",df_train.columns)
print()
print("\n test\n",df_test.head())
print("test2\n",df_test.columns)

from sklearn.impute import SimpleImputer

imputer_train = SimpleImputer(strategy='mean', copy=False)
imputer_test = SimpleImputer(strategy='mean', copy=False)

df_train[numericFeatures_with_missing_data_train] = pd.DataFrame(imputer_train.fit_transform(df_train[numericFeatures_with_missing_data_train]), columns = numericFeatures_with_missing_data_train)
df_test [numericFeatures_with_missing_data_test] =  pd.DataFrame(imputer_test.fit_transform( df_test[ numericFeatures_with_missing_data_test]),  columns = numericFeatures_with_missing_data_test)

print("\n\n done")
missing_val_count_by_column = (df_train.isnull().sum())
print("\n train data missing values: ")
print(missing_val_count_by_column[missing_val_count_by_column > 0].sort_values(ascending=False))


missing_val_count_by_column = (df_test.isnull().sum())
print("\n test data missing values: ")
print(missing_val_count_by_column[missing_val_count_by_column > 0].sort_values(ascending=False))

# print("train: \n", df_train.head())
# print("test: \n", df_test.head())
# df_test.to_csv("testout.csv")
# df_train.to_csv("trainout.csv")


objectFeatures_with_missing_data:  ['Embarked']
numericFeatures_with_missing_data:  ['Fare', 'Age']
Features_with_missing_data: ['Fare', 'Embarked', 'Age']

 train data missing values: 
Age         177
Embarked      2
dtype: int64

 test data missing values: 
Age     86
Fare     1
dtype: int64
train
    Survived  Pclass     Sex   Age  SibSp  Parch     Fare Embarked  \
0         0       3    male  22.0      1      0   7.2500        S   
1         1       1  female  38.0      1      0  71.2833        C   
2         1       3  female  26.0      0      0   7.9250        S   
3         1       1  female  35.0      1      0  53.1000        S   
4         0       3    male  35.0      0      0   8.0500        S   

   Fare_was_missing  Embarked_was_missing  Age_was_missing  
0                 0                     0                0  
1                 0                     0                0  
2                 0                     0                0  
3                 0                    

# Encode categorical features

In [479]:
print("objectFeatures:\n",objectFeatures)
unique_cats_of_objectFeatures = list(map(lambda col: df_train[col].nunique(), objectFeatures))
d = dict(zip(objectFeatures, unique_cats_of_objectFeatures))

print("unique categories of object features: ")
print(sorted(d.items(), key = lambda x:x[1]))

#==============
# from sklearn.preprocessing import OrdinalEncoder
# ordinal_encoder = OrdinalEncoder()

# ordinal_encoder.fit(df_train[objectFeatures])

# df_train[objectFeatures] = ordinal_encoder.transform(df_train[objectFeatures])
# df_test[objectFeatures] = ordinal_encoder.transform(df_test[objectFeatures])

# print("train: \n", df_train.head())
# print("test: \n", df_test.head())
# df_test.to_csv("testout.csv")
# df_train.to_csv("trainout.csv")


from sklearn.preprocessing import OneHotEncoder
OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False) #, feature_name_combiner='concat')
OH_encoder.fit(df_train[objectFeatures])
new_feature_names= OH_encoder.get_feature_names_out(objectFeatures)
      
OH_df_train = pd.DataFrame(OH_encoder.transform(df_train[objectFeatures]), columns=new_feature_names )

df_train.drop(objectFeatures, axis=1,inplace=True)
df_train = df_train.join(OH_df_train)


OH_df_test = pd.DataFrame(OH_encoder.transform(df_test[objectFeatures]), columns=new_feature_names )

df_test.drop(objectFeatures, axis=1,inplace=True)
df_test = df_test.join(OH_df_test)    

df_train.head()

objectFeatures:
 ['Sex', 'Embarked']
unique categories of object features: 
[('Sex', 2), ('Embarked', 4)]


Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Fare_was_missing,Embarked_was_missing,Age_was_missing,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,Embarked_U
0,0,3,22.0,1,0,7.25,0,0,0,0.0,1.0,0.0,0.0,1.0,0.0
1,1,1,38.0,1,0,71.2833,0,0,0,1.0,0.0,1.0,0.0,0.0,0.0
2,1,3,26.0,0,0,7.925,0,0,0,1.0,0.0,0.0,0.0,1.0,0.0
3,1,1,35.0,1,0,53.1,0,0,0,1.0,0.0,0.0,0.0,1.0,0.0
4,0,3,35.0,0,0,8.05,0,0,0,0.0,1.0,0.0,0.0,1.0,0.0


# Split

In [480]:
from sklearn.model_selection import train_test_split
train, valid = train_test_split(df_train, train_size=0.8 , random_state=50, shuffle=True)
    
x_train = train.loc[:, train.columns != target]
y_train = train.loc[:, train.columns == target]

x_valid = valid.loc[:, valid.columns != target]
y_valid = valid.loc[:, valid.columns == target]

# Training the model XGBoost

In [481]:
from xgboost import XGBClassifier

print("\n Extended Gradient Boost Classifier: ")

# simple extended gradient boost model
# my_model = XGBClassifier(random_state=0)
# my_model.fit(data.x_train, data.y_train)

n_estimators = 50
early_stopping_rounds = 5 
learning_rate = 0.05
n_jobs = 1

# complex extended gradient boost model
my_model = XGBClassifier(n_estimators=n_estimators, learning_rate=learning_rate, n_jobs = n_jobs, random_state=0)
my_model.fit(x_train, y_train,
          early_stopping_rounds=early_stopping_rounds, 
          eval_set=[(x_valid, y_valid)],
          verbose=False)


y_pred = my_model.predict(x_valid)

from sklearn.metrics import accuracy_score

print("accuracy score for validation:\n",accuracy_score(y_valid,y_pred))

y_test = my_model.predict(df_test)

submission = pd.DataFrame({"PassengerId":test_ids.values, 
                           'Survived':y_test
                           })

submission.to_csv("submission2.csv", index=False)






 Extended Gradient Boost Classifier: 
accuracy score for validation:
 0.8100558659217877




# Random forest

In [488]:
from sklearn.ensemble import RandomForestClassifier
n_estimators: int = 200
forest_model = RandomForestClassifier(n_estimators = n_estimators, max_depth=15,random_state=1)
forest_model.fit(x_train, y_train)
y_pred2 = forest_model.predict(x_valid)

print("accuracy score for validation:\n",accuracy_score(y_valid,y_pred2))

y_test = forest_model.predict(df_test)

submission = pd.DataFrame({"PassengerId":test_ids.values, 
                           'Survived':y_test
                           })

submission.to_csv("submission3.csv", index=False)



  forest_model.fit(x_train, y_train)


accuracy score for validation:
 0.7988826815642458
