# Load Packages

In [187]:
#load main packages
import pandas as pd
print("pandas version: {}". format(pd.__version__))

import numpy as np
print("NumPy version: {}". format(np.__version__))

import scipy as sp 
print("SciPy version: {}". format(sp.__version__)) 


pandas version: 0.24.2
NumPy version: 1.16.2
SciPy version: 1.2.1


In [188]:
#Visualization
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import seaborn as sns

#Configure Visualization Defaults
#%matplotlib inline = show plots in Jupyter Notebook browser
%matplotlib inline
mpl.style.use('ggplot')
sns.set_style('white')
pylab.rcParams['figure.figsize'] = 12,8


In [189]:
import sklearn
print("scikit-learn version: {}". format(sklearn.__version__))

#Common Model Algorithms
from sklearn import svm, tree, linear_model, neighbors, naive_bayes, ensemble, discriminant_analysis, gaussian_process
from sklearn import preprocessing
#from xgboost import XGBClassifier

#Common Model Helpers
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn import feature_selection
from sklearn import model_selection
from sklearn import metrics


scikit-learn version: 0.20.3


### Load Datasets and Preview

In [190]:
# Load in the train and test datasets

train = pd.read_csv('D:/soft/anaconda3/axpavlov_files/Kaggle_Titanic/train.csv')
test = pd.read_csv('D:/soft/anaconda3/axpavlov_files/Kaggle_Titanic/test.csv')
data = pd.concat([train, test], ignore_index=True, sort = False)

In [191]:
data.columns.values

array(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'], dtype=object)

In [192]:
# data.isnull().sum()

### Data Cleaning (=Data Pre-processing)

In [193]:
data.nunique()

PassengerId    1309
Survived          2
Pclass            3
Name           1307
Sex               2
Age              98
SibSp             7
Parch             8
Ticket          929
Fare            281
Cabin           186
Embarked          3
dtype: int64

In [194]:
data.isnull().sum()

PassengerId       0
Survived        418
Pclass            0
Name              0
Sex               0
Age             263
SibSp             0
Parch             0
Ticket            0
Fare              1
Cabin          1014
Embarked          2
dtype: int64

In [195]:
m = data['Age'].median()
data['Age'].fillna(m,inplace=True)
data['Age']=data['Age'].astype(int)

In [196]:
data['Embarked'].fillna(data['Embarked'].mode(), inplace = True)

In [197]:
temp = pd.get_dummies(data.Embarked, prefix="Emb", drop_first = True)

In [198]:
data = data.join(temp)

In [199]:
data.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Emb_Q,Emb_S
0,1,0.0,3,"Braund, Mr. Owen Harris",male,22,1,0,A/5 21171,7.25,,S,0,1
1,2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38,1,0,PC 17599,71.2833,C85,C,0,0
2,3,1.0,3,"Heikkinen, Miss. Laina",female,26,0,0,STON/O2. 3101282,7.925,,S,0,1


In [200]:
data['Has_Cabin'] = data["Cabin"].apply(lambda x: 0 if type(x) == float else 1)

data['FamilySize'] = data['SibSp'] + data['Parch'] + 1

In [201]:
data['IsAlone'] = (data['FamilySize'] == 1).astype(int)

In [202]:
data['Sex'].replace('male','1',inplace=True)
data['Sex'].replace('female','0',inplace=True)
data['Sex']=data['Sex'].astype(int)
# другой способ: data['Sex'] = LabelEncoder().fit_transform(data['Sex'])

In [203]:
data.drop(['Name','Ticket','Cabin','Embarked'],axis=1,inplace=True)

In [204]:
data.head(2)

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Emb_Q,Emb_S,Has_Cabin,FamilySize,IsAlone
0,1,0.0,3,1,22,1,0,7.25,0,1,0,2,0
1,2,1.0,1,0,38,1,0,71.2833,0,0,1,2,0


In [205]:
# train['FareBin'] = pd.qcut(train['Fare'], 4)
# train['AgeBin'] = pd.cut(train['Age'], 5)

#### Итоговое описание данных

In [206]:
data.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch',
       'Fare', 'Emb_Q', 'Emb_S', 'Has_Cabin', 'FamilySize', 'IsAlone'],
      dtype='object')

In [207]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 13 columns):
PassengerId    1309 non-null int64
Survived       891 non-null float64
Pclass         1309 non-null int64
Sex            1309 non-null int32
Age            1309 non-null int32
SibSp          1309 non-null int64
Parch          1309 non-null int64
Fare           1308 non-null float64
Emb_Q          1309 non-null uint8
Emb_S          1309 non-null uint8
Has_Cabin      1309 non-null int64
FamilySize     1309 non-null int64
IsAlone        1309 non-null int32
dtypes: float64(2), int32(3), int64(6), uint8(2)
memory usage: 99.8 KB


In [208]:
data.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Emb_Q,Emb_S,Has_Cabin,FamilySize,IsAlone
count,1309.0,891.0,1309.0,1309.0,1309.0,1309.0,1309.0,1308.0,1309.0,1309.0,1309.0,1309.0,1309.0
mean,655.0,0.383838,2.294882,0.644003,29.484339,0.498854,0.385027,33.295479,0.093965,0.698243,0.225363,1.883881,0.603514
std,378.020061,0.486592,0.837836,0.478997,12.916721,1.041658,0.86556,51.758668,0.291891,0.459196,0.417981,1.583639,0.489354
min,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
25%,328.0,0.0,2.0,0.0,22.0,0.0,0.0,7.8958,0.0,0.0,0.0,1.0,0.0
50%,655.0,0.0,3.0,1.0,28.0,0.0,0.0,14.4542,0.0,1.0,0.0,1.0,1.0
75%,982.0,1.0,3.0,1.0,35.0,1.0,0.0,31.275,0.0,1.0,0.0,2.0,1.0
max,1309.0,1.0,3.0,1.0,80.0,8.0,9.0,512.3292,1.0,1.0,1.0,11.0,1.0


##### Входящие переменные (данные):
- Pclass: порядковый тип, значения {1, 2, 3}
- Sex: бинарный тип, значения {0 <female>, 1 <male>}
- Age: количественный тип, целочисленные значения от 0 до 80
- SibSp: количественный тип, целочисленные значения от 0 до 8
- Parch: количественный тип, целочисленные значения от 0 до 6
- Fare: количественный тип, вещественные значения от 0 до 512.3292
- Has_Cabin: бинарный тип, значения {0 <нет>, 1 <да>}
- FamilySize: количественный тип, целочисленные значения от 1 до 11
- IsAlone: бинарный тип, значения {0 <нет>, 1 <да>}
- EmbS, EmbQ: бинарный тип, значения {0 <нет>, 1 <да>}

### SVM

The SVM algorithm offers a choice of kernel functions for performing its processing. Basically, mapping data into a higher dimensional space is called kernelling. The mathematical function used for the transformation is known as the kernel function, and can be of different types, such as:

    1.Linear
    2.Polynomial
    3.Radial basis function (RBF)
    4.Sigmoid
Each of these functions has its characteristics, its pros and cons, and its equation, but as there's no easy way of knowing which function performs best with any given dataset, we usually choose different functions in turn and compare the results. Let's just use the default, RBF (Radial Basis Function) for this lab.

In [209]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 13 columns):
PassengerId    1309 non-null int64
Survived       891 non-null float64
Pclass         1309 non-null int64
Sex            1309 non-null int32
Age            1309 non-null int32
SibSp          1309 non-null int64
Parch          1309 non-null int64
Fare           1308 non-null float64
Emb_Q          1309 non-null uint8
Emb_S          1309 non-null uint8
Has_Cabin      1309 non-null int64
FamilySize     1309 non-null int64
IsAlone        1309 non-null int32
dtypes: float64(2), int32(3), int64(6), uint8(2)
memory usage: 99.8 KB


In [210]:
from sklearn import svm
from sklearn.model_selection import train_test_split

train_data = data
train_data = train_data.dropna()
y = train_data['Survived']
X = train_data.drop(['Survived'],axis=1)

X_to_be_predicted = data[data.Survived.isnull()]
z = X_to_be_predicted['PassengerId']
X_to_be_predicted = X_to_be_predicted.drop(['Survived'], axis = 1)
X_to_be_predicted['Fare'].fillna(X_to_be_predicted['Fare'].median(), inplace = True)

In [211]:
#X = preprocessing.StandardScaler().fit(X).transform(X.astype(float))
#X_to_be_predicted = preprocessing.StandardScaler().fit(X_to_be_predicted).transform(X_to_be_predicted.astype(float))

In [212]:
X = (X - X.min(axis=0)) / (X.max(axis=0) - X.min(axis=0))
X_to_be_predicted = (X_to_be_predicted - X_to_be_predicted.min(axis=0)) / (X_to_be_predicted.max(axis=0) - X_to_be_predicted.min(axis=0))

In [213]:
##SVM
clf = svm.SVC(kernel='poly',gamma='auto',max_iter=10000,degree=2, cache_size=500)
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=4)
clf.fit(x_train, y_train)
acc = clf.score(x_test, y_test)
print(acc)

0.8044692737430168


In [214]:
result = clf.predict(X_to_be_predicted)
submission = pd.DataFrame({'PassengerId':z,'Survived':result})
submission.Survived = submission.Survived.astype(int)
submission.to_csv('D:/soft/anaconda3/axpavlov_files/Kaggle_Titanic/CVM_poly.csv',index=False)