In [1]:
import pandas as pd 
import numpy as np
import random as rnd

In [2]:
train_data=pd.read_csv('train.csv')
test_data=pd.read_csv('test.csv')
whole_data=[train_data,test_data]
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


### dropping the columns which we dont want

In [4]:
train_data=train_data.drop(['Cabin','Ticket','PassengerId'],axis=1)
test_data=test_data.drop(['Cabin','Ticket','PassengerId'],axis=1)
whole_data=[train_data,test_data]

### data preprocessing

In [5]:
map_titles={"Mr":1,"Miss":2,"Mrs":3,"Master":4,"Rare":5}
for dataset in whole_data:
    dataset['Title']=dataset.Name.str.extract('([A-Za-z]+)\.',expand=False)
    dataset['Title']=dataset['Title'].replace(['Lady','Countess','Capt','Col','Don','Dr','Major','Rev','Sir','Johnkeer','Dona'],'Rare')
    dataset['Title']=dataset['Title'].replace(['Mlle','Ms',])
    dataset['Title']=dataset['Title'].replace(['Mme','Mrs'])

train_data=train_data.drop(['Name'],axis=1)
test_data=test_data.drop(['Name'],axis=1)

whole_data=[train_data,test_data]
print(train_data.shape)
print(test_data.shape)



(891, 9)
(418, 8)


In [6]:
train_data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title
0,0,3,male,22.0,1,0,7.25,S,Mr
1,1,1,female,38.0,1,0,71.2833,C,Mr
2,1,3,female,26.0,0,0,7.925,S,Miss
3,1,1,female,35.0,1,0,53.1,S,Miss
4,0,3,male,35.0,0,0,8.05,S,Mr


### fill the blank values in age column with the average

In [7]:
for dataset in whole_data:
    dataset['Age']=dataset['Age'].fillna(dataset['Age'].mean())

train_data['AgeBand'] = pd.cut(train_data['Age'].astype(int), 5)
train_data[['AgeBand', 'Survived']].groupby('AgeBand', as_index=False).mean()

Unnamed: 0,AgeBand,Survived
0,"(-0.08, 16.0]",0.55
1,"(16.0, 32.0]",0.344762
2,"(32.0, 48.0]",0.403226
3,"(48.0, 64.0]",0.434783
4,"(64.0, 80.0]",0.090909


### Normalize age data

In [8]:
whole_data=[train_data,test_data]
for ds in whole_data:
    ds.loc[ds["Age"] <= 16, 'Age'] = 0
    ds.loc[(ds['Age'] > 16) & (ds['Age'] <=32), 'Age']=1
    ds.loc[(ds['Age'] > 32) & (ds['Age'] <=48), 'Age']=2
    ds.loc[(ds['Age'] > 48) & (ds['Age'] <=64), 'Age']=3
    ds.loc[(ds['Age'] > 64), 'Age']=4
    ds.Age=ds.Age.astype(int)
train_data =train_data.drop('AgeBand',axis=1)

In [9]:
train_data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title
0,0,3,male,1,1,0,7.25,S,Mr
1,1,1,female,2,1,0,71.2833,C,Mr
2,1,3,female,1,0,0,7.925,S,Miss
3,1,1,female,2,1,0,53.1,S,Miss
4,0,3,male,2,0,0,8.05,S,Mr


### Convert SibSp and Parch into 0 and 1

In [10]:
whole_data=[train_data,test_data]
mapping= lambda x:0 if x==0 else 1
for ds in whole_data:
    ds['SibSp']=ds['SibSp'].map(mapping)
    ds['Parch']=ds['Parch'].map(mapping)

### Fill empty Embarked column value with mode 

In [11]:
mode=train_data['Embarked'].mode()[0]
whole_data=[train_data,test_data]
for ds in whole_data:
    ds['Embarked']=ds['Embarked'].fillna(mode)
train_data[['Embarked', 'Survived']].groupby(['Embarked'], as_index=False).mean()

Unnamed: 0,Embarked,Survived
0,C,0.553571
1,Q,0.38961
2,S,0.339009


### Normalize Embarked Column

In [12]:
mapping={'C':0,'Q':1,'S':2}
for ds in whole_data:
    ds['Embarked']=ds['Embarked'].map(mapping).astype(int)
ds.Embarked.head()

0    1
1    2
2    1
3    2
4    2
Name: Embarked, dtype: int32

### Fill missing value in Fare Attribute with median

In [13]:
for ds in whole_data:
    ds['Fare']=ds['Fare'].fillna(ds['Fare'].median())
train_data[['Fare', 'Survived']].groupby('Fare', as_index=False).mean()
train_data['FareBand'] = pd.qcut(train_data['Fare'], 4)
train_data[['FareBand', 'Survived']].groupby('FareBand', as_index=False).mean()

Unnamed: 0,FareBand,Survived
0,"(-0.001, 7.91]",0.197309
1,"(7.91, 14.454]",0.303571
2,"(14.454, 31.0]",0.454955
3,"(31.0, 512.329]",0.581081


### Normalize Fare Attribute

In [14]:
for dataset in whole_data:
    dataset.loc[ dataset['Fare'] <= 7.91, 'Fare'] = 0
    dataset.loc[(dataset['Fare'] > 7.91) & (dataset['Fare'] <= 14.454), 'Fare'] = 1
    dataset.loc[(dataset['Fare'] > 14.454) & (dataset['Fare'] <= 31), 'Fare']   = 2
    dataset.loc[ dataset['Fare'] > 31, 'Fare'] = 3
    dataset['Fare'] = dataset['Fare'].astype(int)
train_data = train_data.drop(['FareBand'], axis=1)

In [15]:
train_data.tail()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title
886,0,2,male,1,0,0,1,2,Rare
887,1,1,female,1,0,0,2,2,Miss
888,0,3,female,1,1,1,2,2,Miss
889,1,1,male,1,0,0,2,0,Mr
890,0,3,male,1,0,0,0,1,Mr


### Normalize Gender Column

In [16]:
whole_data = [train_data, test_data]
for ds in whole_data:
    ds.loc[ds['Sex']=='male', 'Sex']=0
    ds.loc[ds['Sex']=='female', 'Sex']=1
    ds['Sex'] = ds['Sex'].astype(int)

In [17]:
train_data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title
0,0,3,0,1,1,0,0,2,Mr
1,1,1,1,2,1,0,3,0,Mr
2,1,3,1,1,0,0,1,2,Miss
3,1,1,1,2,1,0,3,2,Miss
4,0,3,0,2,0,0,1,2,Mr


In [18]:
train_data.to_csv('train_processed.csv')

In [19]:
test_data.to_csv('test_processed.csv')

## KNN algorithm

In [24]:
df=[['Survived', 'Pclass', 'Sex', 'Fare', 'Embarked']]

In [25]:
y = df['Survived'].values
X = df.drop('Survived', axis=1).values
print('Shape of target data: {}'.format(y.data.shape))
print('Shape of features data: {}'.format(X.data.shape))

TypeError: list indices must be integers or slices, not str