In [8]:
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
from mpl_toolkits import mplot3d
import numpy as np
from scipy import stats
import seaborn as sns
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import PCA

In [3]:
# Load the training set and leave the identifier-like columns out of the
# analysis. The length or the values of some of them (presumably Name and
# Ticket - TODO confirm) might still be usable as features later on.
titanic = pd.read_csv('train.csv').drop(columns=['PassengerId', 'Name', 'Ticket'])

titanic.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,0,3,male,22.0,1,0,7.25,,S
1,1,1,female,38.0,1,0,71.2833,C85,C
2,1,3,female,26.0,0,0,7.925,,S
3,1,1,female,35.0,1,0,53.1,C123,S
4,0,3,male,35.0,0,0,8.05,,S


In [4]:
# Male passengers whose SibSp value is greater than 5.
titanic.loc[titanic['Sex'].eq('male') & titanic['SibSp'].gt(5)]

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
159,0,3,male,,8,2,69.55,,S
201,0,3,male,,8,2,69.55,,S
324,0,3,male,,8,2,69.55,,S
846,0,3,male,,8,2,69.55,,S


In [5]:
# Male passengers whose SibSp value is exactly 1 or 2.
titanic.loc[titanic['Sex'].eq('male') & titanic['SibSp'].isin([1, 2])]

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,0,3,male,22.00,1,0,7.2500,,S
13,0,3,male,39.00,1,5,31.2750,,S
34,0,1,male,28.00,1,0,82.1708,,C
35,0,1,male,42.00,1,0,52.0000,,S
46,0,3,male,,1,0,15.5000,,Q
...,...,...,...,...,...,...,...,...,...
817,0,2,male,31.00,1,1,37.0042,,C
831,1,2,male,0.83,1,1,18.7500,,S
860,0,3,male,41.00,2,0,14.1083,,S
861,0,2,male,21.00,1,0,11.5000,,S


In [6]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
titanic.describe()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,714.0,891.0,891.0,891.0
mean,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,0.0,1.0,0.42,0.0,0.0,0.0
25%,0.0,2.0,20.125,0.0,0.0,7.9104
50%,0.0,3.0,28.0,0.0,0.0,14.4542
75%,1.0,3.0,38.0,1.0,0.0,31.0
max,1.0,3.0,80.0,8.0,6.0,512.3292


In [9]:
# How often each distinct recorded age occurs, most frequent first.
titanic['Age'].value_counts()

24.00    30
22.00    27
18.00    26
19.00    25
30.00    25
         ..
55.50     1
70.50     1
66.00     1
23.50     1
0.42      1
Name: Age, Length: 88, dtype: int64

In [11]:
# Centre the ages on the sample mean. Series.mean() skips missing values, and
# the vectorised subtraction keeps passengers without a recorded age as NaN.
# (Plain Series arithmetic replaces a per-element Python lambda via .map.)
mean_age = titanic.Age.mean()
titanic.Age - mean_age

0      -7.699118
1       8.300882
2      -3.699118
3       5.300882
4       5.300882
         ...    
886    -2.699118
887   -10.699118
888          NaN
889    -3.699118
890     2.300882
Name: Age, Length: 891, dtype: float64

In [13]:
def reduce_age(row):
    """Return a copy of ``row`` whose ``Age`` is centred on ``mean_age``.

    Written for ``DataFrame.apply(reduce_age, axis='columns')``. pandas does
    not support functions that mutate the object handed to ``apply``, so the
    row is copied before it is changed and the caller's data stays untouched.

    Parameters
    ----------
    row : pandas.Series
        One record; it must contain an ``Age`` entry.

    Returns
    -------
    pandas.Series
        A new row with the same labels in which ``Age`` has the notebook-level
        ``mean_age`` subtracted. A missing age stays ``NaN``.
    """
    centred = row.copy()
    centred['Age'] = row['Age'] - mean_age
    return centred

# Run reduce_age on every row (axis=1 is the numeric spelling of 'columns').
# The result is only displayed; `titanic` itself is not reassigned.
titanic.apply(reduce_age, axis=1)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,0,3,male,-7.699118,1,0,7.2500,,S
1,1,1,female,8.300882,1,0,71.2833,C85,C
2,1,3,female,-3.699118,0,0,7.9250,,S
3,1,1,female,5.300882,1,0,53.1000,C123,S
4,0,3,male,5.300882,0,0,8.0500,,S
...,...,...,...,...,...,...,...,...,...
886,0,2,male,-2.699118,0,0,13.0000,,S
887,1,1,female,-10.699118,0,0,30.0000,B42,S
888,0,3,female,,1,2,23.4500,,S
889,1,1,male,-3.699118,0,0,30.0000,C148,C


In [16]:
# Number of rows for each distinct Parch value.
titanic.groupby(by='Parch')['Parch'].count()

Parch
0    678
1    118
2     80
3      5
4      4
5      5
6      1
Name: Parch, dtype: int64

In [28]:
def blah(*_ignored):
    """Ignore every positional argument and return the constant string 'blah'.

    Not called by any of the cells visible here.
    """
    return 'blah'

# Row count for every (SibSp, Parch) combination. `len` is applied to each
# remaining column, so every column of the result holds the same group size.
family = titanic.groupby(by=['SibSp', 'Parch']).aggregate([len])

In [29]:
# Inspect the group keys: grouping by two columns yields a (SibSp, Parch) MultiIndex.
family.index

MultiIndex([(0, 0),
            (0, 1),
            (0, 2),
            (0, 3),
            (0, 4),
            (0, 5),
            (1, 0),
            (1, 1),
            (1, 2),
            (1, 3),
            (1, 4),
            (1, 5),
            (1, 6),
            (2, 0),
            (2, 1),
            (2, 2),
            (2, 3),
            (3, 0),
            (3, 1),
            (3, 2),
            (4, 1),
            (4, 2),
            (5, 2),
            (8, 2)],
           names=['SibSp', 'Parch'])

In [30]:
# Turn the SibSp/Parch index levels back into ordinary columns for a flat view.
family.reset_index()

Unnamed: 0_level_0,SibSp,Parch,Survived,Pclass,Sex,Age,Fare,Cabin,Embarked
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,len,len,len,len,len,len,len
0,0,0,537,537,537,537.0,537.0,537,537
1,0,1,38,38,38,38.0,38.0,38,38
2,0,2,29,29,29,29.0,29.0,29,29
3,0,3,1,1,1,1.0,1.0,1,1
4,0,4,1,1,1,1.0,1.0,1,1
5,0,5,2,2,2,2.0,2.0,2,2
6,1,0,123,123,123,123.0,123.0,123,123
7,1,1,57,57,57,57.0,57.0,57,57
8,1,2,19,19,19,19.0,19.0,19,19
9,1,3,3,3,3,3.0,3.0,3,3


In [32]:
# Largest SibSp first; ties are ordered by largest Parch.
titanic.sort_values(by=['SibSp', 'Parch'], ascending=[False, False])

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
159,0,3,male,,8,2,69.55,,S
180,0,3,female,,8,2,69.55,,S
201,0,3,male,,8,2,69.55,,S
324,0,3,male,,8,2,69.55,,S
792,0,3,female,,8,2,69.55,,S
...,...,...,...,...,...,...,...,...,...
884,0,3,male,25.0,0,0,7.05,,S
886,0,2,male,27.0,0,0,13.00,,S
887,1,1,female,19.0,0,0,30.00,B42,S
889,1,1,male,26.0,0,0,30.00,C148,C


In [6]:
# Evenly spaced sample values: 1, 6, 11, ..., 196 (step of 5).
numbers = list(range(1, 200, 5))
numbers

[1,
 6,
 11,
 16,
 21,
 26,
 31,
 36,
 41,
 46,
 51,
 56,
 61,
 66,
 71,
 76,
 81,
 86,
 91,
 96,
 101,
 106,
 111,
 116,
 121,
 126,
 131,
 136,
 141,
 146,
 151,
 156,
 161,
 166,
 171,
 176,
 181,
 186,
 191,
 196]

In [11]:
# Split the value range of `numbers` into 3 equal-width intervals.
pd.cut(numbers, bins=3)

[(0.805, 66.0], (0.805, 66.0], (0.805, 66.0], (0.805, 66.0], (0.805, 66.0], ..., (131.0, 196.0], (131.0, 196.0], (131.0, 196.0], (131.0, 196.0], (131.0, 196.0]]
Length: 40
Categories (3, interval[float64]): [(0.805, 66.0] < (66.0, 131.0] < (131.0, 196.0]]

In [12]:
# Split `numbers` into 3 quantile-based bins.
pd.qcut(numbers, q=3)

[(0.999, 66.0], (0.999, 66.0], (0.999, 66.0], (0.999, 66.0], (0.999, 66.0], ..., (131.0, 196.0], (131.0, 196.0], (131.0, 196.0], (131.0, 196.0], (131.0, 196.0]]
Length: 40
Categories (3, interval[float64]): [(0.999, 66.0] < (66.0, 131.0] < (131.0, 196.0]]