### Loading Libararies and Data

In [69]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

#%matplotlib inline

In [70]:
# read train and test set
train = pd.read_csv("train_titanic.csv")
test = pd.read_csv("test_titanic.csv")

In [71]:
train.shape

(718, 12)

In [72]:
train.loc[[679]]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
679,680,1,1,"Cardeza, Mr. Thomas Drake Martinez",male,36.0,0,1,PC 17755,512.3292,B51 B53 B55,C


In [73]:
test.shape

(173, 12)

In [74]:
train.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [75]:
test.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [76]:
# combining train and test dataset
df = train.append(test, ignore_index=True)

In [77]:
df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [78]:
df.tail()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.45,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0,C148,C
890,891,0,3,"Dooley, Mr. Patrick",male,32.0,0,0,370376,7.75,,Q


In [79]:
# check missing values
df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

The combined data has missing data in __Age__, __Cabin__, and __Embarked__ features.  

Since __Cabin__ has mostly missing values (~77%), we can simply remove it from our data. We can impute the missing values in __Age__ by median. Since __Embarked__ is a categorical, we will use mode to impute missing values.

In [80]:
# remove cabin
df.drop('Cabin', axis=1, inplace=True)

# missing data imputation
df['Age'].fillna(df['Age'].median(), inplace = True)
df['Embarked'].fillna(df['Embarked'].mode()[0], inplace = True)

In [81]:
# check missing values again
df.isnull().sum()

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       0
dtype: int64

We have successfully imputed all the missing values. 

We will also remove the *Ticket*, *Name* and *PassengerId* variables as they don't seem to have any impact on the dependent variable.

In [82]:
# remove Ticket and Name variables
df.drop(['Ticket', 'Name', 'PassengerId'], axis=1, inplace=True)

Now let's convert the categorical varialbles to numerical variables. 

In [83]:
from sklearn.preprocessing import LabelEncoder

# label enconding
label = LabelEncoder()
df['Embarked_num'] = label.fit_transform(df['Embarked'])
df['Sex_num'] = label.fit_transform(df['Sex'])

df = df.drop(['Sex', 'Embarked'], axis = 1)

### Univariate Analysis

In [84]:
df.describe()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Embarked_num,Sex_num
count,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,0.383838,2.308642,29.361582,0.523008,0.381594,32.204208,1.536476,0.647587
std,0.486592,0.836071,13.019697,1.102743,0.806057,49.693429,0.791503,0.47799
min,0.0,1.0,0.42,0.0,0.0,0.0,0.0,0.0
25%,0.0,2.0,22.0,0.0,0.0,7.9104,1.0,0.0
50%,0.0,3.0,28.0,0.0,0.0,14.4542,2.0,1.0
75%,1.0,3.0,35.0,1.0,0.0,31.0,2.0,1.0
max,1.0,3.0,80.0,8.0,6.0,512.3292,2.0,1.0


In [85]:
df['Parch'].value_counts()

0    678
1    118
2     80
5      5
3      5
4      4
6      1
Name: Parch, dtype: int64

In [86]:
df['Pclass'].value_counts()

3    491
1    216
2    184
Name: Pclass, dtype: int64

In [87]:
df['Sex_num'].value_counts()

1    577
0    314
Name: Sex_num, dtype: int64

Plot histogram of __Age__

In [88]:
df['Age'].plot.hist()

<matplotlib.axes._subplots.AxesSubplot at 0x29dc02ea048>

Plot boxplot of __Age__

In [89]:
df['Age'].plot.box()

<matplotlib.axes._subplots.AxesSubplot at 0x29dc02ea048>

Plot histogram of __Fare__

In [90]:
df['Fare'].plot.hist()

<matplotlib.axes._subplots.AxesSubplot at 0x29dc02ea048>

In [91]:
df['Fare'].plot.box()

<matplotlib.axes._subplots.AxesSubplot at 0x29dc02ea048>

### Bivariate Analysis

In [92]:
# scatter plot between Age and Fare
df.plot.scatter('Age', 'Fare')

<matplotlib.axes._subplots.AxesSubplot at 0x29dc1aac9e8>

In [93]:
# Pclass vs mean Age
df.groupby('Pclass')['Age'].mean().plot.bar()

<matplotlib.axes._subplots.AxesSubplot at 0x29dc1aac9e8>

In [94]:
# Pclass vs mean Fare
df.groupby('Pclass')['Fare'].mean().plot.bar()

<matplotlib.axes._subplots.AxesSubplot at 0x29dc1aac9e8>

Let's split __df__ back to train and test set.

In [95]:
train = df[:len(train)]
test = df[len(train):]

true_val = test['Survived']
test.drop(['Survived'], axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


### Outlier Treatment

Rows having Fare greater than 400 are outliers. So, we will remove this from our data.

In [96]:
train = train[train['Fare']<400]

Replace the outliers in Fare with the mean. The outliers are approximately above 62.

In [97]:
train.loc[train['Fare']>62, 'Fare'] = np.mean(train['Fare'])

Similarly, we will replace the outliers present in the Age variable.

In [98]:
train.loc[train['Age']>55, 'Age'] = np.mean(train['Age'])

In [100]:
train.loc[[678]]

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Embarked_num,Sex_num
678,0,3,43.0,1,6,46.9,2,0


In [101]:
train.index = range(len(train))

In [103]:
train.loc[[679]]

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Embarked_num,Sex_num
679,1,1,27.0,0,0,31.343982,0,1


### Modeling

In [30]:
xtrain = train.drop('Survived', axis = 1)
ytrain = train['Survived']

In [31]:
from sklearn.linear_model import LogisticRegression

In [32]:
lreg = LogisticRegression()

In [33]:
lreg.fit(xtrain, ytrain)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [34]:
pred = lreg.predict(test)

In [35]:
pred

array([0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1,
       1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0,
       1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0], dtype=int64)

In [36]:
test

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Embarked_num,Sex_num
718,3,28.0,0,0,15.5000,1,1
719,3,33.0,0,0,7.7750,2,1
720,2,6.0,0,1,33.0000,2,0
721,3,17.0,1,0,7.0542,2,1
722,2,34.0,0,0,13.0000,2,1
723,2,50.0,0,0,13.0000,2,1
724,1,27.0,1,0,53.1000,2,1
725,3,20.0,0,0,8.6625,2,1
726,2,30.0,3,0,21.0000,2,0
727,3,28.0,0,0,7.7375,1,0


In [37]:
true_val

718    0
719    0
720    1
721    0
722    0
723    0
724    1
725    0
726    1
727    1
728    0
729    0
730    1
731    0
732    0
733    0
734    0
735    0
736    0
737    1
738    0
739    0
740    1
741    0
742    1
743    0
744    1
745    0
746    0
747    1
      ..
861    0
862    1
863    0
864    0
865    1
866    1
867    0
868    0
869    1
870    0
871    1
872    0
873    0
874    1
875    1
876    0
877    0
878    0
879    1
880    1
881    0
882    0
883    0
884    0
885    0
886    0
887    1
888    0
889    1
890    0
Name: Survived, Length: 173, dtype: int64

In [38]:
lreg.score(test, true_val)

0.80924855491329484

In [48]:
X = xtrain
y = ytrain
X.columns



Index(['Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked_num', 'Sex_num'], dtype='object')

In [49]:
from sklearn.model_selection import StratifiedKFold
i = 1
kf = StratifiedKFold(n_splits=5,random_state=0,shuffle=True)

X.reindex(columns=['Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked_num', 'Sex_num'])

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Embarked_num,Sex_num
0,3,22.000000,1,0,7.250000,2,1
1,1,38.000000,1,0,31.343982,0,0
2,3,26.000000,0,0,7.925000,2,0
3,1,35.000000,1,0,53.100000,2,0
4,3,35.000000,0,0,8.050000,2,1
5,3,28.000000,0,0,8.458300,1,1
6,1,54.000000,0,0,51.862500,2,1
7,3,2.000000,3,1,21.075000,2,1
8,3,27.000000,0,2,11.133300,2,0
9,2,14.000000,1,0,30.070800,0,0


In [59]:
X.shape

(716, 7)

In [63]:
X.loc[[678:680]]

SyntaxError: invalid syntax (<ipython-input-63-77ee05c65995>, line 1)

In [52]:
for train_index,test_index in kf.split(X,y):
    print('\n {} of kfold {}'.format(i,kf.n_splits))
    print('\n train index',train_index)
    print('\n test index',test_index)
    
    xtr,xcv = X.loc[train_index], X.loc[test_index]
    ytr,ycv = y[train_index], y[test_index]
    
    print('\n LOC train index',X.loc[train_index])
    print('\n LOC test index',X.loc[test_index])
    
    model = LogisticRegression(random_state=0)
    
    model.fit(xtr,ytr)
    pred_test = model.predict(xcv)
    
    #classifier = LogisticRegression()

    #classifier.fit(xtr,ytr)
    
    #pred_test = classifier.predict(xcv)
    score = accuracy_score(ycv,pred_test)
    print('accuracy_score',score)
    i+=1


 1 of kfold 5

 train index [  0   1   2   3   5   6   7   8   9  11  12  14  16  18  19  21  22  24
  25  27  28  29  32  33  34  35  36  37  38  39  41  42  43  44  45  46
  47  49  50  51  53  54  55  56  57  58  59  60  62  63  64  65  66  68
  69  70  71  72  73  74  75  76  77  78  79  80  81  82  83  84  85  86
  88  89  90  92  94  96  97  98  99 102 103 104 105 106 107 109 110 111
 112 113 114 115 116 117 119 121 123 124 126 128 129 130 131 132 133 134
 135 136 137 138 139 141 142 143 144 145 146 147 148 150 151 152 153 155
 156 158 159 160 162 164 165 166 167 168 169 170 172 173 174 175 176 177
 178 179 180 181 182 184 186 187 188 189 191 193 194 195 196 197 198 199
 200 202 204 205 206 207 208 209 210 212 213 214 215 216 217 218 220 221
 222 223 224 225 226 227 228 229 230 231 232 234 236 237 239 241 245 246
 247 248 249 251 252 253 254 255 256 257 259 260 263 264 265 266 267 268
 269 270 271 273 275 276 277 278 279 280 281 282 283 284 285 286 288 289
 290 292 293 294 295 2

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return self.loc[key]
Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#depr

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').