# Titanic - Kaggle Notebook-----Gradient Boosting

## Importing our libraries

In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.externals import joblib 
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
import seaborn as sns
from sklearn.ensemble import GradientBoostingClassifier




## Importing our datasets and splitting X and Y

In [2]:
data=pd.read_csv('../data/raw/train.csv')
data_test=pd.read_csv('../data/raw/test.csv')
data.columns = data.columns.str.replace(' ', '')
#Dropping the NaNs for Embarked
data_test.shape
data=data.dropna(subset=["Embarked"])
X = data.drop('Survived', axis=1)
X = X.append(data_test)
y=data.Survived
data_test.shape #Our y_Pred needs to have 418 as length

(418, 11)

In [3]:
#Here we can se the totals of NaNs
X.isnull().sum()

PassengerId       0
Pclass            0
Name              0
Sex               0
Age             263
SibSp             0
Parch             0
Ticket            0
Fare              1
Cabin          1014
Embarked          0
dtype: int64

In [4]:
#Checking the types of data
dataTypeSeries = X.dtypes
dataTypeSeries

PassengerId      int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

## Data Preprocessing

#### Taking care of NaNs

In [5]:
#Dropping NaNs for Age
imr = SimpleImputer(missing_values=np.nan, strategy='mean')
imr = imr.fit(X[['Age']])
X["Age"]=imr.transform(X[["Age"]])

In [6]:
#Changing object Sex to Numeric 
X.Sex=X['Sex'].map({'female': 1, 'male': 0})
X.Sex=X.Sex

In [7]:
dummy=pd.get_dummies(X["Embarked"])
X=pd.concat([X,dummy],axis=1)

for i in people:
    X.Cabin.iloc[i-1]=1
for i in range(len(X.Cabin)):
    if X.Cabin.iloc[i]!=1:
        X.Cabin.iloc[i]=0

In [8]:
#Dropping unnecesarry columns
drops=["Name","PassengerId","Embarked","Ticket","Cabin"]
X=X.drop(drops,axis=1)


In [9]:
X_train = X.values[0:889]
X_test = X.values[889:]

In [10]:
np.where(np.isnan(X_test)) #You can see the NaN location coordenates

(array([152], dtype=int64), array([5], dtype=int64))

In [11]:
X_test=np.nan_to_num(X_test) #Replace those values with 0

### Scaling our data


In [12]:
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [13]:
X_train, X_val, y_train, y_val = train_test_split(X_train, y, 
                                                    test_size=0.2, 
                                                    random_state=34, 
                  
                                                    stratify=y)

In [14]:
lr_list = [0.05, 0.075, 0.1, 0.25, 0.5, 0.75, 1]

for learning_rate in lr_list:
    gb_clf = GradientBoostingClassifier(n_estimators=20, learning_rate=learning_rate, max_features=2, max_depth=2, random_state=4)
    gb_clf.fit(X_train, y_train)

    print("Learning rate: ", learning_rate)
    print("Accuracy score (training): {0:.3f}".format(gb_clf.score(X_train, y_train)))
    print("Accuracy score (validation): {0:.3f}".format(gb_clf.score(X_val, y_val)))

Learning rate:  0.05
Accuracy score (training): 0.807
Accuracy score (validation): 0.736
Learning rate:  0.075
Accuracy score (training): 0.821
Accuracy score (validation): 0.764
Learning rate:  0.1
Accuracy score (training): 0.827
Accuracy score (validation): 0.775
Learning rate:  0.25
Accuracy score (training): 0.838
Accuracy score (validation): 0.792
Learning rate:  0.5
Accuracy score (training): 0.840
Accuracy score (validation): 0.781
Learning rate:  0.75
Accuracy score (training): 0.844
Accuracy score (validation): 0.775
Learning rate:  1
Accuracy score (training): 0.852
Accuracy score (validation): 0.742


In [15]:
model = GradientBoostingClassifier(n_estimators=20, learning_rate=learning_rate, max_features=2, max_depth=2)
model.fit(X_train, y_train)


GradientBoostingClassifier(criterion='friedman_mse', init=None, learning_rate=1,
                           loss='deviance', max_depth=2, max_features=2,
                           max_leaf_nodes=None, min_impurity_decrease=0.0,
                           min_impurity_split=None, min_samples_leaf=1,
                           min_samples_split=2, min_weight_fraction_leaf=0.0,
                           n_estimators=20, n_iter_no_change=None,
                           presort='auto', random_state=None, subsample=1.0,
                           tol=0.0001, validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [16]:
predictions=model.predict(X_test)

In [17]:
df=pd.DataFrame({"PassengerId":data_test.PassengerId,"Survived":predictions})
df.to_csv(r'C:\Users\campo\Documents\GitHub\predictions\titanic.csv')