In [52]:
import pandas as pd

# Load the Titanic train/test CSVs from the local input folder in one pass.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('titanic/input/train.csv', 'titanic/input/test.csv')
)

# Sanity check: report (rows, columns) for both frames.
print(f'train shape: {train.shape}, test shape {test.shape}')

train shape: (891, 12), test shape (418, 11)


In [53]:
# Preview the first three rows of the freshly loaded training frame.
train.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


In [55]:
# Compare how one categorical column is distributed in train vs. test:
# absolute counts next to their share (in %) of all rows of each frame.
# col = 'Embarked'
col = 'Sex'
train_counts = train[col].value_counts()
test_counts = test[col].value_counts()
pd.concat(
    [
        train_counts,
        train_counts / train.shape[0] * 100,
        test_counts,
        test_counts / test.shape[0] * 100,
    ],
    axis=1,
    keys=['train', '%', 'test', '%'],
    sort=False,
)

Unnamed: 0,train,%,test,%.1
male,577,64.758698,266,63.636364
female,314,35.241302,152,36.363636


In [56]:
print('NaN in the data sets')
# Per-column missing-value counts for both frames, side by side.
missing_counts = {
    'Train Dataset': train.isnull().sum(),
    'Test Dataset': test.isnull().sum(),
}
nans = pd.concat(missing_counts, axis=1, sort=False)
# Only show the columns that have at least one missing value in either frame.
has_missing = nans.sum(axis=1) > 0
print(nans[has_missing])

NaN in the data sets
          Train Dataset  Test Dataset
Age                 177          86.0
Fare                  0           1.0
Cabin               687         327.0
Embarked              2           0.0


In [58]:
# Mean of the target for every distinct value of a single feature.
# col = 'Sex'
col = 'Pclass'
target = 'Survived'
pair = train[[col, target]]
target_mean_by_value = pair.groupby([col], as_index=False).mean()
print(target_mean_by_value.sort_values(by=col))

   Pclass  Survived
0       1  0.629630
1       2  0.472826
2       3  0.242363


In [59]:
def fill_na_by_median(df, col):
    """Replace missing values in ``df[col]`` with that column's median.

    The frame is modified in place; nothing is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify.
    col : str
        Name of a numeric column of ``df``.
    """
    # Assign the filled column back to the frame. Calling
    # ``fillna(inplace=True)`` on the ``df[col]`` selection is chained
    # in-place mutation: pandas 2.x warns about it, and with Copy-on-Write
    # enabled it leaves ``df`` unchanged.
    df[col] = df[col].fillna(df[col].median())

In [60]:
# Apply the same imputation and integer encoding to both frames.
# NOTE: this cell is not re-runnable on already-encoded frames: the maps
# below would turn the integer codes into NaN and astype(int) would raise.
full_dataset = [train, test]
for df in full_dataset:
    # Numeric gaps: each frame is filled with its own column median.
    fill_na_by_median(df, 'Age')
    fill_na_by_median(df, 'Fare')

    # Assign the result back instead of calling fillna(inplace=True) on the
    # df['Embarked'] selection: that chained in-place form warns on
    # pandas 2.x and does not modify df under Copy-on-Write.
    df['Embarked'] = df['Embarked'].fillna('S')
    df['Embarked'] = df['Embarked'].map({'S': 0, 'C': 1, 'Q': 2}).astype(int)
    df['Sex'] = df['Sex'].map({'male': 1, 'female': 2}).astype(int)

In [61]:
# Collapse Cabin to its first letter, bucket the letters into groups
# (presumably by fare level, per the original note "classify Cabin by fare")
# and encode the groups as integers. 'X' stands for a missing cabin.
cabin_groups = {
    'M': ['A', 'D', 'E', 'T'],
    'H': ['B', 'C'],
    'L': ['F', 'G'],
}
cabin_group_codes = {'X': 0, 'L': 1, 'M': 2, 'H': 3}

for data in full_dataset:
    first_letter = data['Cabin'].fillna('X').map(lambda cabin: str(cabin)[0])
    for group, letters in cabin_groups.items():
        first_letter = first_letter.replace(letters, group)
    data['Cabin'] = first_letter.map(cabin_group_codes).astype(int)

In [65]:
# Preview the training frame after imputation and encoding.
# NOTE(review): the recorded output below has no Name/Ticket columns, so the
# cell that drops them (In [64], listed after this one) had already been run.
# On a fresh top-to-bottom run those two columns would still appear here.
train.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,1,0,3,1,22.0,1,0,7.25,0,0
1,2,1,1,2,38.0,1,0,71.2833,3,1
2,3,1,3,2,26.0,0,0,7.925,0,0


In [64]:
# Remove the free-text columns from both frames in place (the frames are
# shared through `full_dataset`, so rebinding `df` would not affect them).
# errors='ignore' keeps the cell re-runnable: without it a second run raises
# KeyError because the columns are already gone.
for df in full_dataset:
    df.drop(['Name', 'Ticket'], axis=1, inplace=True, errors='ignore')

In [66]:
# Separate the label column from the feature matrix.
# NOTE(review): PassengerId stays in X and the CatBoost model later assigns it
# a sizeable importance; it looks like a row identifier — consider excluding
# it here and in the prediction cells together.
X = train.drop(columns='Survived')
y = train['Survived']

In [67]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the labelled rows for evaluation; the seed is fixed so the
# split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=2019,
)

### Logistic regression

In [68]:
# Scaling the train and test feature set 
from sklearn.preprocessing import StandardScaler
# Fit the scaler on the training split only, then apply the same fitted
# transformation to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  """


In [69]:
from sklearn.linear_model import LogisticRegression

# Logistic regression on the standardized training features.
lr_params = dict(random_state=21, solver='sag', max_iter=1000)
lr = LogisticRegression(**lr_params).fit(X_train_scaled, y_train)

In [70]:
# Predict labels for the hold-out split, using the scaled features that match
# what `lr` was fitted on.
y_pred = lr.predict(X_test_scaled)

In [72]:
from sklearn.metrics import accuracy_score 

# Hold-out accuracy of the logistic regression.
accuracy_lr = accuracy_score(y_test, y_pred)
print(f'accuracy_lr = {accuracy_lr}')

accuracy_lr = 0.8044692737430168


### CatBoost classifier

In [73]:
from catboost import Pool, CatBoostClassifier

# Wrap each side of the hold-out split in a CatBoost Pool.
train_dataset = Pool(label=y_train, data=X_train)
eval_dataset = Pool(label=y_test, data=X_test)

# Train a 50-iteration classifier on the training pool.
model = CatBoostClassifier(iterations=50)
model.fit(train_dataset)

# Three views of the hold-out predictions: class labels, raw formula values
# and per-class probabilities.
preds_class = model.predict(eval_dataset)
preds_raw = model.predict(eval_dataset, prediction_type='RawFormulaVal')
preds_proba = model.predict_proba(eval_dataset)

Learning rate set to 0.21958
0:	learn: 0.5697846	total: 32ms	remaining: 1.57s
1:	learn: 0.4999845	total: 68.1ms	remaining: 1.63s
2:	learn: 0.4567798	total: 105ms	remaining: 1.65s
3:	learn: 0.4376023	total: 147ms	remaining: 1.69s
4:	learn: 0.4178267	total: 190ms	remaining: 1.71s
5:	learn: 0.4141956	total: 215ms	remaining: 1.58s
6:	learn: 0.4038733	total: 260ms	remaining: 1.59s
7:	learn: 0.3929991	total: 304ms	remaining: 1.6s
8:	learn: 0.3800018	total: 351ms	remaining: 1.6s
9:	learn: 0.3745324	total: 397ms	remaining: 1.59s
10:	learn: 0.3712383	total: 469ms	remaining: 1.66s
11:	learn: 0.3661363	total: 520ms	remaining: 1.65s
12:	learn: 0.3637543	total: 596ms	remaining: 1.7s
13:	learn: 0.3587069	total: 643ms	remaining: 1.65s
14:	learn: 0.3520576	total: 701ms	remaining: 1.63s
15:	learn: 0.3482251	total: 748ms	remaining: 1.59s
16:	learn: 0.3471897	total: 769ms	remaining: 1.49s
17:	learn: 0.3453556	total: 863ms	remaining: 1.53s
18:	learn: 0.3418082	total: 906ms	remaining: 1.48s
19:	learn: 0.34

<catboost.core.CatBoostClassifier at 0x2aab4a39f518>

In [74]:
# Hold-out accuracy of the CatBoost classifier.
accuracy_cb = accuracy_score(y_test, preds_class)
print(f'accuracy_cb = {accuracy_cb}')

accuracy_cb = 0.8268156424581006


In [75]:
# (feature name, importance) pairs, most important first.
feature_importances = sorted(
    zip(model.feature_names_, model.feature_importances_),
    key=lambda name_and_score: name_and_score[1],
    reverse=True,
)
feature_importances

[('Sex', 33.74447803439284),
 ('Age', 16.094310806895674),
 ('Fare', 12.383378733909316),
 ('PassengerId', 10.709370006435913),
 ('Pclass', 10.334615622135011),
 ('SibSp', 6.3898515341290025),
 ('Cabin', 5.277053018430991),
 ('Embarked', 2.935733809815777),
 ('Parch', 2.131208433855478)]

In [76]:
# Predict the competition test set with CatBoost and write the submission:
# one row per passenger with the predicted label as an integer.
y_pred = model.predict(test)
submission = test[['PassengerId']].assign(Survived=y_pred.astype('int'))
submission.to_csv('titanic_cb.csv', index=False)

In [78]:
# `lr` was fitted on StandardScaler output, so the competition test features
# must go through the same fitted `scaler` before predicting; feeding the raw
# frame gives predictions on a different scale than the model was trained on.
# Selecting X_train.columns pins the column order the scaler was fitted with.
test_scaled = scaler.transform(test[X_train.columns])
y_pred_lr = lr.predict(test_scaled)
submission = pd.DataFrame({
        'PassengerId': test['PassengerId'],
        'Survived': y_pred_lr.astype('int')
    })
submission.to_csv('titanic_lr.csv', index=False)

In [None]:
# Shell out to the kaggle CLI: submit titanic_lr.csv to the `titanic`
# competition with the given message.
!kaggle competitions submit -c titanic -f titanic_lr.csv -m "Logistic regression base"