In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Read the unlabelled test split and the labelled training split.
test = pd.read_csv('../input/titanic/test.csv')
train = pd.read_csv('../input/titanic/train.csv')

# Keep the test passenger ids aside; they are needed when writing submissions.
test_pid = test['PassengerId']

In [3]:
# Preview the first rows of the training frame to inspect the raw columns.
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
# (rows, columns) of the training split.
train.shape

(891, 12)

In [5]:
# (rows, columns) of the test split; it has one column fewer because it has no Survived label.
test.shape

(418, 11)

In [6]:
# Missing-value count per column in the training split.
train.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [7]:
# Missing-value count per column in the test split.
test.isnull().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

# Preprocessing

### Normalizing Fare

In [8]:
# Min-max scale Fare using statistics learned from the training split only, so
# a given fare maps to the same scaled value in both frames. (Previously each
# split was scaled with its own min/max.)
fare_min, fare_max = train["Fare"].min(), train["Fare"].max()
fare_median = train["Fare"].median()

# Impute BEFORE scaling. The previous version filled gaps with -999 after
# scaling to [0, 1]; the single missing test fare then sat ~1000 units outside
# the feature's range and dominated the linear model's output for that row.
train["Fare"] = (train["Fare"].fillna(fare_median) - fare_min) / (fare_max - fare_min)
test["Fare"] = (test["Fare"].fillna(fare_median) - fare_min) / (fare_max - fare_min)

### Mapping Sex to int values

In [9]:
# Encode Sex with ONE vocabulary shared by both splits. factorize() numbers
# categories by order of first appearance, so calling it separately on train
# and test does not guarantee that a label gets the same integer in both.
# Train rows come first in the concatenation, so train keeps the exact codes
# it had before; missing values still map to -1.
sex_codes, _ = pd.factorize(pd.concat([train["Sex"], test["Sex"]], ignore_index=True))
train["Sex"] = sex_codes[:len(train)]
test["Sex"] = sex_codes[len(train):]

### Mapping Embarked to int values

In [10]:
# Encode Embarked with ONE vocabulary shared by both splits. factorize()
# numbers categories by order of first appearance, so factorizing train and
# test independently can assign different integers to the same port.
# Train rows come first in the concatenation, so train keeps the exact codes
# it had before; the two missing train values still map to -1.
embarked_codes, _ = pd.factorize(
    pd.concat([train["Embarked"], test["Embarked"]], ignore_index=True)
)
train["Embarked"] = embarked_codes[:len(train)]
test["Embarked"] = embarked_codes[len(train):]

### Getting titles from names

In [11]:
# Extract the honorific ("Mr", "Mrs", "Miss", ...) that sits between the comma
# and the first period of each name, e.g. "Braund, Mr. Owen Harris" -> "Mr".
# The previous loop kept the text BEFORE the comma (the surname), which did not
# match this section's purpose, and it wrote through chained indexing
# (df[col][i] = ...), which pandas does not guarantee to update the frame.
title_pattern = r",\s*([^.,]+)\."
train_titles = train["Name"].str.extract(title_pattern, expand=False).str.strip()
test_titles = test["Name"].str.extract(title_pattern, expand=False).str.strip()

# Encode with one vocabulary shared by both splits so a title gets the same
# integer in train and test; names without a recognisable title map to -1.
title_codes, _ = pd.factorize(pd.concat([train_titles, test_titles], ignore_index=True))
train["Name"] = title_codes[:len(train)]
test["Name"] = title_codes[len(train):]

### Mapping Cabin to int values

In [12]:
# Encode Cabin with ONE vocabulary shared by both splits. factorize() numbers
# categories by order of first appearance, so factorizing each split on its
# own gives the same cabin different integers in train and test.
# Train rows come first in the concatenation, so train keeps the exact codes
# it had before; missing cabins (the majority of rows) still map to -1.
cabin_codes, _ = pd.factorize(pd.concat([train["Cabin"], test["Cabin"]], ignore_index=True))
train["Cabin"] = cabin_codes[:len(train)]
test["Cabin"] = cabin_codes[len(train):]

### Standardizing Age

In [13]:
# Standardize Age with the mean/std of the observed training ages and apply
# the same transform to the test split, so one age maps to one z-score in both
# frames. (Previously each split used its own statistics.)
age_mean, age_std = train["Age"].mean(), train["Age"].std()
age_median = train["Age"].median()

# Impute BEFORE scaling with the training median. The previous version filled
# gaps with -999 after standardization, which placed every passenger with an
# unknown age hundreds of standard deviations from the rest of the data.
train["Age"] = (train["Age"].fillna(age_median) - age_mean) / age_std
test["Age"] = (test["Age"].fillna(age_median) - age_mean) / age_std

### Dropping columns

In [14]:
# Ticket is removed from both splits; it is not used as a feature.
train = train.drop(columns=['Ticket'])
test = test.drop(columns=['Ticket'])

In [15]:
# Separate the label vector from the feature matrix.
# NOTE(review): only Survived is removed here, so PassengerId remains a model
# feature. It is a row identifier; consider excluding it from both the training
# features and the frame passed to predict().
y = train['Survived']
X = train.drop(columns=['Survived'])

In [16]:
# Baseline model: logistic regression with a fixed seed and a raised iteration
# cap. fit() is left as the last expression so the cell shows the fitted estimator.
lr_params = dict(random_state=0, max_iter=2500)
model = LogisticRegression(**lr_params)
model.fit(X, y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=2500,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=0, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [17]:
# Predict survival for the test split with the logistic-regression model and
# save the result, keyed by the passenger ids saved at load time, as LR.csv.
lr_pred = model.predict(test)
LR = pd.DataFrame({'PassengerId': test_pid, 'Survived': lr_pred})
LR.to_csv('LR.csv', index=False)

In [18]:
# Second model: gradient-boosted trees with the library's default settings,
# trained on the same features and labels as the logistic regression.
xgb = XGBClassifier()
xgb.fit(X,y)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [19]:
# Predict survival for the test split with the fitted XGBoost model.
xg = xgb.predict(test)

In [20]:
# Pair every test passenger id with its XGBoost prediction and save the
# two-column frame as XGB.csv.
submission = test[["PassengerId"]].assign(Survived=xg)
submission.to_csv('XGB.csv', index=False)

# This notebook is an example of how a basic technique such as logistic regression can sometimes outperform more complex techniques such as XGBoost or random forests.