In [6]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import GradientBoostingClassifier

In [7]:
# Load the Titanic training data from a CSV file in the working directory.
# NOTE(review): the preview below shows an "Unnamed: 0" column next to
# PassengerId -- presumably a row index that was saved with the file; confirm
# whether it should be read as the index instead of as a data column.
TRAIN_CSV_PATH = "titanic_train.csv"
train_data = pd.read_csv(TRAIN_CSV_PATH)

# The separate "test.csv" load is disabled; only the training file is used.
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [8]:
# Separate the label from the features: pop() returns the "Survived" column
# and removes it from train_data in place.
y_train = train_data.pop("Survived")

# full_data is a second name for the same DataFrame object, not a copy.
full_data = train_data
full_data.shape

(891, 11)

In [9]:

# Columns left out of the feature set because they aren't necessary or helpful
# for training. You could leave some of them in and see how they affect things.
# NOTE(review): the identifier-like columns "Unnamed: 0" and "PassengerId" are
# not in this list, so they remain as model features -- confirm that is intended.
drop_columns = ["Name", "Age", "SibSp", "Ticket", "Cabin", "Parch", "Embarked"]

# Build the feature table as a chain of non-mutating steps. The earlier
# in-place drop also altered train_data (full_data was an alias of it), which
# made the frame previewed above go stale.
#   1. drop the unused columns
#   2. convert the text column "Sex" into numeric indicator columns
#   3. fill any empty cells with 0
full_data = (
    full_data
    .drop(columns=drop_columns)
    .pipe(pd.get_dummies, columns=["Sex"])
    .fillna(value=0.0)
)

In [11]:
# Rows before HOLDOUT_START are used to build the model (training + validation);
# the remaining rows are set aside as a hold-out set.
HOLDOUT_START = 691

# Seed (so you can replicate the results) and the fraction of the
# model-building rows reserved for validation.
state = 12
test_size = 0.30

# NOTE: this cell rebinds y_train from a pandas Series to a NumPy array, so
# re-run the notebook from the top rather than re-running this cell alone.
features = full_data.values
labels = y_train.values

X_test = features[HOLDOUT_START:]
y_test = labels[HOLDOUT_START:]  # kept so the hold-out rows can be scored later

# Split BEFORE scaling so that the scaler never sees the validation rows.
X_train, X_val, y_train, y_val = train_test_split(
    features[:HOLDOUT_START],
    labels[:HOLDOUT_START],
    test_size=test_size,
    random_state=state,
)

# Fit the scaler on the training rows only, then apply the same fitted
# transform to the validation and hold-out rows. Fitting on all rows before
# the split would leak validation statistics into the training features.
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)

In [12]:
# Sweep over candidate learning rates, refitting the same small gradient
# boosting model each time and reporting accuracy on both splits.
lr_list = [0.05, 0.075, 0.1, 0.25, 0.5, 0.75, 1]

# Hyperparameters held fixed across the sweep.
shared_params = dict(n_estimators=20, max_features=2, max_depth=2, random_state=0)

for learning_rate in lr_list:
    gb_clf = GradientBoostingClassifier(learning_rate=learning_rate, **shared_params)
    gb_clf.fit(X_train, y_train)

    train_accuracy = gb_clf.score(X_train, y_train)
    val_accuracy = gb_clf.score(X_val, y_val)

    print("Learning rate: ", learning_rate)
    print("Accuracy score (training): {0:.3f}".format(train_accuracy))
    print("Accuracy score (validation): {0:.3f}".format(val_accuracy))

Learning rate:  0.05
Accuracy score (training): 0.787
Accuracy score (validation): 0.755
Learning rate:  0.075
Accuracy score (training): 0.822
Accuracy score (validation): 0.736
Learning rate:  0.1
Accuracy score (training): 0.824
Accuracy score (validation): 0.731
Learning rate:  0.25
Accuracy score (training): 0.839
Accuracy score (validation): 0.750
Learning rate:  0.5
Accuracy score (training): 0.859
Accuracy score (validation): 0.745
Learning rate:  0.75
Accuracy score (training): 0.851
Accuracy score (validation): 0.745
Learning rate:  1
Accuracy score (training): 0.870
Accuracy score (validation): 0.731


In [13]:
# Evaluate a single gradient boosting classifier on the validation split by
# printing its confusion matrix and classification report. The model is refit
# with learning_rate=0.5 and the same fixed settings as the sweep.
# NOTE(review): in the recorded sweep output, learning_rate=0.05 had the highest
# validation accuracy (0.755) while 0.5 scored 0.745 -- confirm which rate is
# meant by "best".
gb_clf2 = GradientBoostingClassifier(
    n_estimators=20,
    learning_rate=0.5,
    max_features=2,
    max_depth=2,
    random_state=0,
).fit(X_train, y_train)
predictions = gb_clf2.predict(X_val)

val_confusion = confusion_matrix(y_val, predictions)
val_report = classification_report(y_val, predictions)

print("Confusion Matrix:")
print(val_confusion)

print("Classification Report")
print(val_report)

Confusion Matrix:
[[97 18]
 [35 58]]
Classification Report
              precision    recall  f1-score   support

           0       0.73      0.84      0.79       115
           1       0.76      0.62      0.69        93

    accuracy                           0.75       208
   macro avg       0.75      0.73      0.74       208
weighted avg       0.75      0.75      0.74       208



# XGBoost Classifier
Now we'll experiment with the XGBoost classifier.

As before, let's start by importing the libraries we need.

In [None]:


from xgboost import XGBClassifier

# Since our data is already prepared, we just need to fit the classifier with
# the training data (default XGBClassifier settings):
xgb_clf = XGBClassifier()
xgb_clf.fit(X_train, y_train)

# Now that the classifier has been fit and trained, we can check the score it
# achieves on the validation set by using the score command.
# (This sentence was previously bare text inside the code cell, which is a
# SyntaxError when the cell is run.)
score = xgb_clf.score(X_val, y_val)
print(score)


Here's the output:

0.7761194029850746
Alternatively, you could predict on the `X_val` data and then check the accuracy against `y_val` by using `accuracy_score`. It should give you the same result.

Comparing the accuracy of XGBoost to the accuracy of a regular gradient boosting classifier shows that, in this case, the results were very similar. However, this won't always be the case, and in different circumstances one of the classifiers could easily perform better than the other. Try varying the arguments in this model to see how the results differ.