In [109]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# train_test_split lives in sklearn.model_selection since scikit-learn 0.18;
# the old sklearn.cross_validation module was removed in 0.20. Try the current
# location first and fall back so older installations keep working.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
from sklearn.linear_model import LinearRegression 
from sklearn.neighbors import KNeighborsClassifier 

In [110]:
%matplotlib inline

In [111]:
# Read the training and test sets from CSV files in the working directory.
train_df, test_df = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

In [112]:
# List the column labels of the training frame to see which fields were loaded.
train_df.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [113]:
# Summary statistics (count, mean, std, quartiles, min/max) of the training frame.
train_df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [114]:
# Summarize only the object-dtype columns ("O"): count, unique values, most
# frequent value and its frequency.
train_df.describe(include=["O"])

Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked
count,891,891,891,204,889
unique,891,2,681,147,3
top,"Bostandyeff, Mr. Guentcho",male,CA. 2343,B96 B98,S
freq,1,577,7,4,644


In [115]:
# Peek at the first few values of the Parch column.
train_df['Parch'].head()

0    0
1    0
2    0
3    0
4    0
Name: Parch, dtype: int64

In [116]:
# Frequency of each distinct Parch value in the training set.
train_df['Parch'].value_counts()

0    678
1    118
2     80
5      5
3      5
4      4
6      1
Name: Parch, dtype: int64

In [117]:
# Mean of Survived per Sex group (i.e. the survival rate), highest rate first.
(
    train_df[['Sex', 'Survived']]
    .groupby(['Sex'])
    .mean()
    .sort_values(by='Survived', ascending=False)
)

Unnamed: 0_level_0,Survived
Sex,Unnamed: 1_level_1
female,0.742038
male,0.188908


In [118]:
# Encode Sex as an integer: female -> 0, male -> 1.
# NOTE: this overwrites the column, so running the cell a second time maps the
# already-encoded integers through the dict and yields NaN.
sex_codes = {'female': 0, 'male': 1}
train_df['Sex'] = train_df['Sex'].map(sex_codes)
train_df['Sex'].head()

0    1
1    0
2    0
3    0
4    1
Name: Sex, dtype: int64

In [119]:
# Preview the first rows to confirm Sex now holds the 0/1 codes.
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",1,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",0,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",1,35.0,0,0,373450,8.05,,S


In [120]:
# Remove the Ticket, Cabin and Name columns from the training frame; they are
# not used as model features.
unused_train_columns = ['Ticket', 'Cabin', 'Name']
train_df = train_df.drop(unused_train_columns, axis='columns')

In [121]:
# Remove Ticket and Cabin from the test frame. Name is kept here and is only
# dropped later, when the x_test feature matrix is built.
unused_test_columns = ['Ticket', 'Cabin']
test_df = test_df.drop(unused_test_columns, axis='columns')

In [122]:
# Show (rows, columns) of the training frame, then of the test frame.
print(train_df.shape, test_df.shape, sep='\n')

(891, 9)
(418, 9)


In [123]:
# Inspect dtypes and non-null counts to spot columns with missing values.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 9 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Sex            891 non-null int64
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Fare           891 non-null float64
Embarked       889 non-null object
dtypes: float64(2), int64(6), object(1)
memory usage: 62.7+ KB


In [124]:
# Impute missing ages with the training-set mean age truncated to a whole number.
truncated_mean_age = int(train_df['Age'].mean())
train_df['Age'] = train_df['Age'].fillna(truncated_mean_age)

In [125]:
# Re-check non-null counts to confirm Age no longer has missing values.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 9 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Sex            891 non-null int64
Age            891 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Fare           891 non-null float64
Embarked       889 non-null object
dtypes: float64(2), int64(6), object(1)
memory usage: 62.7+ KB


In [126]:
# Most frequent embarkation port in the training set; a later cell uses it to
# fill the missing Embarked values.
train_df['Embarked'].mode()

0    S
dtype: object

In [127]:
# Inspect dtypes and non-null counts of the test frame to find missing values.
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 9 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418 non-null object
Age            332 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Fare           417 non-null float64
Embarked       418 non-null object
dtypes: float64(2), int64(4), object(3)
memory usage: 29.5+ KB


In [128]:
# Impute missing test ages with the mean age of the test set itself.
# NOTE(review): the training frame is filled with its own truncated mean, so
# the two sets use different fill values — confirm this is intended.
test_age_mean = test_df['Age'].mean()
test_df['Age'] = test_df['Age'].fillna(test_age_mean)

In [129]:
# Impute missing test fares with the mean fare of the test set.
test_fare_mean = test_df['Fare'].mean()
test_df['Fare'] = test_df['Fare'].fillna(test_fare_mean)

In [130]:
# Re-check non-null counts to confirm Age and Fare are now fully populated.
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 9 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418 non-null object
Age            418 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Fare           418 non-null float64
Embarked       418 non-null object
dtypes: float64(2), int64(4), object(3)
memory usage: 29.5+ KB


In [131]:
# Fill missing Embarked values with the most frequent port in the training set.
most_common_port = train_df['Embarked'].mode()[0]
train_df['Embarked'] = train_df['Embarked'].fillna(most_common_port)

In [132]:
# data ready
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() only appended a stray "None" line after each report.
train_df.info()
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 9 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Sex            891 non-null int64
Age            891 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Fare           891 non-null float64
Embarked       891 non-null object
dtypes: float64(2), int64(6), object(1)
memory usage: 62.7+ KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 9 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418 non-null object
Age            418 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Fare           418 non-null float64
Embarked       418 non-null object
dtypes: float64(2), int64(4), object(3)
memory usage: 29.5+ KB
None


In [133]:
# Encode the embarkation port as an integer in both frames: S -> 0, C -> 1, Q -> 2.
port_codes = {'S': 0, 'C': 1, 'Q': 2}
t = [train_df, test_df]
for d in t:
    # Column assignment mutates each DataFrame object in place.
    d['Embarked'] = d['Embarked'].map(port_codes)
# Rebind the names to the (same) frames held in the list.
train_df, test_df = t

In [134]:
# Apply the same Sex encoding used for the training frame: female -> 0, male -> 1.
sex_codes = {'female': 0, 'male': 1}
test_df['Sex'] = test_df['Sex'].map(sex_codes)

In [135]:
# Target vector: the Survived label of every training row.
y_train = train_df['Survived']
# Feature matrices: drop the label and identifier columns so that train and
# test end up with the same feature columns (Name is still present in test_df).
x_train = train_df.drop(['Survived', 'PassengerId'], axis='columns')
x_test = test_df.drop(['PassengerId', 'Name'], axis='columns')

In [136]:
# Sanity check: training feature shape, and the feature columns of both
# matrices, which must match for the models to predict on x_test.
print('x shape', x_train.shape)
print('x_train', x_train.columns)
# The original label said 'y train' although the value printed is x_test.columns.
print('x_test', x_test.columns)

x shape (891, 7)
x_train Index(['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked'], dtype='object')
y train Index(['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked'], dtype='object')


In [137]:
# Count each Sex code in the test features to check the encoding was applied.
x_test['Sex'].value_counts()

1    266
0    152
Name: Sex, dtype: int64

In [138]:
# Confirm every training feature is numeric and non-null before fitting.
x_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 7 columns):
Pclass      891 non-null int64
Sex         891 non-null int64
Age         891 non-null float64
SibSp       891 non-null int64
Parch       891 non-null int64
Fare        891 non-null float64
Embarked    891 non-null int64
dtypes: float64(2), int64(5)
memory usage: 48.8 KB


In [140]:
# Build the classifier in this cell: no other cell in the notebook defines
# `knn`, so on a fresh kernel the fit call raised NameError.
# n_neighbors=5 is the scikit-learn default, spelled out for the reader.
knn = KNeighborsClassifier(n_neighbors=5)
# Fit on the prepared training features and Survived labels; fit() returns
# the estimator, so its repr is shown as the cell output.
knn.fit(x_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [141]:
# Predict Survived for the test features with the fitted k-NN model.
y_predict = knn.predict(x_test)

In [145]:
# Accuracy in percent (two decimals) measured on the training data itself, so
# it describes fit to the training set rather than held-out performance.
knn_train_accuracy = knn.score(x_train, y_train)
round(knn_train_accuracy * 100, 2)

80.47

In [151]:
# Pair each test PassengerId with its predicted Survived value.
knn_submission_columns = {
    "PassengerId": test_df['PassengerId'],
    "Survived": y_predict,
}
result = pd.DataFrame(knn_submission_columns)

In [155]:
# Write the `result` frame to knn_predict.csv, omitting the DataFrame index.
result.to_csv('knn_predict.csv',index=False)

In [156]:
from sklearn.svm import SVC

In [157]:
# Support vector classifier created with the library's default hyperparameters.
svc = SVC()

In [158]:
# Fit the SVC on the same training features and labels used for the k-NN model.
svc.fit(x_train,y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [159]:
# Predict Survived for the test features with the fitted SVC.
# NOTE: this rebinds y_predict, replacing the k-NN predictions stored under
# the same name earlier in the notebook.
y_predict=svc.predict(x_test)

In [161]:
# Accuracy in percent (two decimals) measured on the training data itself, so
# it describes fit to the training set rather than held-out performance.
svc_train_accuracy = svc.score(x_train, y_train)
round(svc_train_accuracy * 100, 2)

89.11

In [162]:
# Pair each test PassengerId with the Survived value currently held in y_predict.
svc_submission_columns = {
    "PassengerId": test_df['PassengerId'],
    "Survived": y_predict,
}
results = pd.DataFrame(svc_submission_columns)

In [164]:
# Write the `results` frame to svc_predict.csv, omitting the DataFrame index.
results.to_csv('svc_predict.csv',index=False)