## Intro and Libraries

In [2]:
import pandas as pd
#import autosklearn.classification
import featuretools as ft
from featuretools.primitives import *
from featuretools.variable_types import Numeric
from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

# Input data files are available in the "../input/" directory.
# import os
# print(os.listdir("../input"))

## Loading Data and Investigating

In [6]:
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')
test_target = pd.read_csv('test_labels.csv')

In [4]:
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
test_df.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [7]:
test_target.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1


In [6]:
print(train_df.columns.values)

['PassengerId' 'Survived' 'Pclass' 'Name' 'Sex' 'Age' 'SibSp' 'Parch'
 'Ticket' 'Fare' 'Cabin' 'Embarked']


## Data preprocessing (Data cleaning)

it is necessary to clean the data. we will apply the code of feature cleaning taken from one of existing kernels - [Best Titanic Survival Prediction for Beginners](https://www.kaggle.com/vin1234/best-titanic-survival-prediction-for-beginners)

In [9]:
print(train_df.shape)
print(test_df.shape)

(891, 12)
(418, 11)


In [7]:
combine = train_df.append(test_df)

In [10]:
print(combine.shape)
combine.head()

(1309, 12)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1.0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0.0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [11]:
passenger_id=test_df['PassengerId']
#combine.drop(['PassengerId'], axis=1, inplace=True)
combine = combine.drop(['Ticket', 'Cabin'], axis=1)

In [12]:
combine.Fare.fillna(combine.Fare.mean(), inplace=True)

In [13]:
combine['Sex'] = combine.Sex.apply(lambda x: 0 if x == "female" else 1)

In [14]:
combine.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,0.0,3,"Braund, Mr. Owen Harris",1,22.0,1,0,7.25,S
1,2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,1,0,71.2833,C
2,3,1.0,3,"Heikkinen, Miss. Laina",0,26.0,0,0,7.925,S
3,4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.0,1,0,53.1,S
4,5,0.0,3,"Allen, Mr. William Henry",1,35.0,0,0,8.05,S


In [15]:
for name_string in combine['Name']:
    combine['Title']=combine['Name'].str.extract('([A-Za-z]+)\.',expand=True)

In [16]:
combine.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked,Title
0,1,0.0,3,"Braund, Mr. Owen Harris",1,22.0,1,0,7.25,S,Mr
1,2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,1,0,71.2833,C,Mrs
2,3,1.0,3,"Heikkinen, Miss. Laina",0,26.0,0,0,7.925,S,Miss
3,4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.0,1,0,53.1,S,Mrs
4,5,0.0,3,"Allen, Mr. William Henry",1,35.0,0,0,8.05,S,Mr


In [17]:
combine.Title.value_counts()

Mr          757
Miss        260
Mrs         197
Master       61
Rev           8
Dr            8
Col           4
Mlle          2
Major         2
Ms            2
Lady          1
Don           1
Dona          1
Sir           1
Capt          1
Mme           1
Countess      1
Jonkheer      1
Name: Title, dtype: int64

In [18]:
#replacing the rare title with more common one.
mapping = {'Mlle': 'Miss', 'Major': 'Mr', 'Col': 'Mr', 'Sir': 'Mr', 'Don': 'Mr', 'Mme': 'Miss',
          'Jonkheer': 'Mr', 'Lady': 'Mrs', 'Capt': 'Mr', 'Countess': 'Mrs', 'Ms': 'Miss', 'Dona': 'Mrs'}
combine.replace({'Title': mapping}, inplace=True)

In [19]:
combine = combine.drop(['Name'], axis=1)

In [20]:
titles=['Mr','Miss','Mrs','Master','Rev','Dr']
for title in titles:
    age_to_impute = combine.groupby('Title')['Age'].median()[titles.index(title)]
    combine.loc[(combine['Age'].isnull()) & (combine['Title'] == title), 'Age'] = age_to_impute


In [21]:
combine.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title
0,1,0.0,3,1,22.0,1,0,7.25,S,Mr
1,2,1.0,1,0,38.0,1,0,71.2833,C,Mrs
2,3,1.0,3,0,26.0,0,0,7.925,S,Miss
3,4,1.0,1,0,35.0,1,0,53.1,S,Mrs
4,5,0.0,3,1,35.0,0,0,8.05,S,Mr


In [22]:
combine.isnull().sum()

PassengerId      0
Survived       418
Pclass           0
Sex              0
Age              0
SibSp            0
Parch            0
Fare             0
Embarked         2
Title            0
dtype: int64

In [23]:
freq_port = train_df.Embarked.dropna().mode()[0]
combine['Embarked'] = combine['Embarked'].fillna(freq_port)

In [24]:
combine.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title
0,1,0.0,3,1,22.0,1,0,7.25,S,Mr
1,2,1.0,1,0,38.0,1,0,71.2833,C,Mrs
2,3,1.0,3,0,26.0,0,0,7.925,S,Miss
3,4,1.0,1,0,35.0,1,0,53.1,S,Mrs
4,5,0.0,3,1,35.0,0,0,8.05,S,Mr


In [25]:
combine['Embarked'] = combine['Embarked'].map( {'S': 0, 'C': 1, 'Q': 2} ).astype(int)
combine['Title'] = combine['Title'].map( {'Mr': 0, 'Mrs': 1, 'Miss': 2, 'Master': 3, 'Rev': 4, 'Dr': 5} ).astype(int)
combine.fillna(0, inplace=True)

In [26]:
combine.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title
0,1,0.0,3,1,22.0,1,0,7.25,0,0
1,2,1.0,1,0,38.0,1,0,71.2833,1,1
2,3,1.0,3,0,26.0,0,0,7.925,0,2
3,4,1.0,1,0,35.0,1,0,53.1,0,1
4,5,0.0,3,1,35.0,0,0,8.05,0,0


In [27]:
combine.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1309 entries, 0 to 417
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  1309 non-null   int64  
 1   Survived     1309 non-null   float64
 2   Pclass       1309 non-null   int64  
 3   Sex          1309 non-null   int64  
 4   Age          1309 non-null   float64
 5   SibSp        1309 non-null   int64  
 6   Parch        1309 non-null   int64  
 7   Fare         1309 non-null   float64
 8   Embarked     1309 non-null   int32  
 9   Title        1309 non-null   int32  
dtypes: float64(3), int32(2), int64(5)
memory usage: 102.3 KB


## Perform automated feature engineering

we can proceed to "automated feature engineering". To work with "featuretools" package, we should specify our dfs "train_df" and "test_df" as entities of the entity set. The entity is just a table with a uniquely identifying col known as an "index". The "featuretools" can auto infer the var types (numeric, categorical, datetime) of the cols, it could be good idea to override this behaviour if u're not sure about conversion (int vs categorical, int vs boolean )

In [28]:
es = ft.EntitySet(id = 'titanic_data')

es = es.entity_from_dataframe(entity_id = 'combine', dataframe = combine.drop(['Survived'], axis=1), 
                              variable_types = 
                              {
                                  'Embarked': ft.variable_types.Categorical,
                                  'Sex': ft.variable_types.Boolean,
                                  'Title': ft.variable_types.Categorical
                              },
                              index = 'PassengerId')

es

#embarked,sex and title is problematic cols so we specified them.

Entityset: titanic_data
  Entities:
    combine [Rows: 1309, Columns: 9]
  Relationships:
    No relationships

Once the entity set is created, it is possible to generate new features using so called **feature primitives**. A feature primitive is an operation applied to data to create a new feature. Simple calculations can be stacked on top of each other to create complex features. Feature primitives fall into two categories:

* **Aggregation**: these functions group together child datapoints for each parent and then calculate a statistic such as mean, min, max, or standard deviation. The aggregation works across multiple tables using relationships between tables.
* **Transformation**: these functions work on one or multiple columns of a single table.

In our case we do not have different tables linked between each other. However, we can create dummy tables using "normalize_entity" function. With this way we will be able to apply both aggregation and transformation funcs to generate new features. To create such tables, we will use catg, boolean and int vars

In [29]:
es = es.normalize_entity(base_entity_id='combine', new_entity_id='Embarked', index='Embarked')
es = es.normalize_entity(base_entity_id='combine', new_entity_id='Sex', index='Sex')
es = es.normalize_entity(base_entity_id='combine', new_entity_id='Title', index='Title')
es = es.normalize_entity(base_entity_id='combine', new_entity_id='Pclass', index='Pclass')
es = es.normalize_entity(base_entity_id='combine', new_entity_id='Parch', index='Parch')
es = es.normalize_entity(base_entity_id='combine', new_entity_id='SibSp', index='SibSp')
es

#we will def new tables from dsets own cols that ft uses to create then our dset features have rship w them to be created

Entityset: titanic_data
  Entities:
    combine [Rows: 1309, Columns: 9]
    Embarked [Rows: 3, Columns: 1]
    Sex [Rows: 2, Columns: 1]
    Title [Rows: 6, Columns: 1]
    Pclass [Rows: 3, Columns: 1]
    Parch [Rows: 8, Columns: 1]
    SibSp [Rows: 7, Columns: 1]
  Relationships:
    combine.Embarked -> Embarked.Embarked
    combine.Sex -> Sex.Sex
    combine.Title -> Title.Title
    combine.Pclass -> Pclass.Pclass
    combine.Parch -> Parch.Parch
    combine.SibSp -> SibSp.SibSp

In [30]:
primitives = ft.list_primitives()
pd.options.display.max_colwidth = 100
primitives[primitives['type'] == 'aggregation'].head(primitives[primitives['type'] == 'aggregation'].shape[0])

Unnamed: 0,name,type,dask_compatible,description
0,n_most_common,aggregation,False,Determines the `n` most common elements.
1,time_since_last,aggregation,False,Calculates the time elapsed since the last datetime (default in seconds).
2,num_true,aggregation,True,Counts the number of `True` values.
3,trend,aggregation,False,Calculates the trend of a variable over time.
4,mode,aggregation,False,Determines the most commonly repeated value.
5,avg_time_between,aggregation,False,Computes the average number of seconds between consecutive events.
6,sum,aggregation,True,"Calculates the total addition, ignoring `NaN`."
7,all,aggregation,True,Calculates if all values are 'True' in a list.
8,mean,aggregation,True,Computes the average for a list of values.
9,skew,aggregation,False,Computes the extent to which a distribution differs from a normal distribution.


The most of "transformation" functions are applied to datetime or time-dependent vars. In our dset we do not have such vars. these funcs will not be used.

In [31]:
primitives[primitives['type'] == 'transform'].head(primitives[primitives['type'] == 'transform'].shape[0])

Unnamed: 0,name,type,dask_compatible,description
22,age,transform,False,Calculates the age in years as a floating point number given a
23,day,transform,True,Determines the day of the month from a datetime.
24,divide_numeric,transform,True,Element-wise division of two lists.
25,negate,transform,True,Negates a numeric value.
26,year,transform,True,Determines the year value of a datetime.
27,time_since_previous,transform,False,Compute the time since the previous entry in a list.
28,modulo_by_feature,transform,True,Return the modulo of a scalar by each element in the list.
29,equal,transform,False,Determines if values in one list are equal to another list.
30,num_words,transform,True,Determines the number of words in a string by counting the spaces.
31,is_null,transform,True,Determines if a value is null.


Now we will apply a **deep feature synthesis (dfs)** func that will generate new features by automatically applying suitable aggregations, I selected a depth of 2. Higher depth vals will stack more primitives. 

In [32]:
features, feature_names = ft.dfs(entityset = es, 
                                 target_entity = 'combine', 
                                 max_depth = 2)

This is a list of new features. For example, **"Title.SUM(combine.Age" means the sum of Age values for each unique value of Title.**

In [33]:
feature_names

[<Feature: Pclass>,
 <Feature: Age>,
 <Feature: SibSp>,
 <Feature: Parch>,
 <Feature: Fare>,
 <Feature: Embarked>,
 <Feature: Sex>,
 <Feature: Title>,
 <Feature: Embarked.SUM(combine.Age)>,
 <Feature: Embarked.SUM(combine.Fare)>,
 <Feature: Embarked.STD(combine.Age)>,
 <Feature: Embarked.STD(combine.Fare)>,
 <Feature: Embarked.MAX(combine.Age)>,
 <Feature: Embarked.MAX(combine.Fare)>,
 <Feature: Embarked.SKEW(combine.Age)>,
 <Feature: Embarked.SKEW(combine.Fare)>,
 <Feature: Embarked.MIN(combine.Age)>,
 <Feature: Embarked.MIN(combine.Fare)>,
 <Feature: Embarked.MEAN(combine.Age)>,
 <Feature: Embarked.MEAN(combine.Fare)>,
 <Feature: Embarked.COUNT(combine)>,
 <Feature: Embarked.NUM_UNIQUE(combine.Parch)>,
 <Feature: Embarked.NUM_UNIQUE(combine.Pclass)>,
 <Feature: Embarked.NUM_UNIQUE(combine.Sex)>,
 <Feature: Embarked.NUM_UNIQUE(combine.SibSp)>,
 <Feature: Embarked.NUM_UNIQUE(combine.Title)>,
 <Feature: Embarked.MODE(combine.Parch)>,
 <Feature: Embarked.MODE(combine.Pclass)>,
 <Feature:

In [34]:
len(feature_names)

146

In [35]:
features[features['Age'] == 22][["Title.SUM(combine.Age)","Age","Title"]].head()

Unnamed: 0_level_0,Title.SUM(combine.Age),Age,Title
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,27872.5,22.0,0
61,27872.5,22.0,0
81,27872.5,22.0,0
113,27872.5,22.0,0
142,4876.59,22.0,2


By using "featuretools", we were able to **generate 146 features just in a moment**.

Featuretools creates new features from multiple tables, however, now we are facing another problem known as the "curse of dimensionality".

## Curse of dimensionality: Feature reduction and selection

To deal with the "curse of dimensionality", it's necessary to apply the feature reduction and selection, which means removing low-value features from the data. But keep in mind that feature selection can hurt the performance of ML models. I will not explain all possible approaches to deal with the "curse of dimensionality". You could use any f.selection or f.reduction model u want. I will rather concentrate on the following methods:

**Feature selection via L1 norm regularization (Lasso)**

**Collinearity**


### Collinearity

Collinearity means high intercorrelations among independent features. If we maintain such features in the mode, it might be difficult to assess the effect of independent features on target variable. Therefore we will detect these features and delete them, though applying a manual revision before removal.

In [36]:
# Threshold for removing correlated variables
threshold = 0.95

# Absolute value correlation matrix
corr_matrix = features.corr().abs()
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))
upper.head(50)

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Embarked,Sex,Title,Embarked.SUM(combine.Age),Embarked.SUM(combine.Fare),...,SibSp.NUM_UNIQUE(combine.Embarked),SibSp.NUM_UNIQUE(combine.Parch),SibSp.NUM_UNIQUE(combine.Pclass),SibSp.NUM_UNIQUE(combine.Sex),SibSp.NUM_UNIQUE(combine.Title),SibSp.MODE(combine.Embarked),SibSp.MODE(combine.Parch),SibSp.MODE(combine.Pclass),SibSp.MODE(combine.Sex),SibSp.MODE(combine.Title)
Pclass,,0.285575,0.060832,0.018322,0.558477,0.038875,0.124617,0.034855,0.043077,0.094318,...,0.1240303,0.2076503,0.1435907,,0.1439532,,0.1488672,,0.162338,0.1443389
Age,,,0.225064,0.164063,0.105971,0.014012,0.279681,0.444705,0.011909,0.031487,...,0.2163184,0.1544156,0.2043003,,0.2548193,,0.2372768,,0.05150223,0.249179
SibSp,,,,0.373587,0.160224,0.073461,0.109609,0.27967,0.075016,0.069716,...,0.7792276,0.4109176,0.7593949,,0.8710438,,0.8217369,,0.3515147,0.7734793
Parch,,,,,0.221522,0.095523,0.213125,0.266558,0.082324,0.101213,...,0.2781161,0.05262633,0.265065,,0.2964941,,0.2938461,,0.2488658,0.2880609
Fare,,,,,,0.061118,0.185484,0.133437,0.131368,0.010164,...,0.09761043,0.04979847,0.03105973,,0.08342515,,0.06134832,,0.1914642,0.06376974
Embarked,,,,,,,0.120423,0.112932,0.969007,0.985205,...,0.08911072,0.02800961,0.03235437,,0.05957382,,0.05682427,,0.04172955,0.05852984
Sex,,,,,,,,0.59384,0.119392,0.116768,...,0.04743528,0.08506071,0.001654746,,0.03715274,,0.0206212,,0.1868998,0.02597172
Title,,,,,,,,,0.09694,0.119927,...,0.2398869,0.1298913,0.232828,,0.2822315,,0.2661501,,0.1163707,0.2753202
Embarked.SUM(combine.Age),,,,,,,,,,0.912335,...,0.1036773,0.06112334,0.05143703,,0.07839882,,0.07677146,,0.01064903,0.07882904
Embarked.SUM(combine.Fare),,,,,,,,,,,...,0.07577095,0.004020667,0.01794163,,0.04435188,,0.04092363,,0.06177674,0.04232308


In [37]:
# Select columns with correlations above threshold
collinear_features = [column for column in upper.columns if any(upper[column] > threshold)]

print('There are %d features to remove.' % (len(collinear_features)))

There are 59 features to remove.


In [38]:
features_filtered = features.drop(columns = collinear_features)

print('The number of features that passed the collinearity threshold: ', features_filtered.shape[1])

The number of features that passed the collinearity threshold:  87


**Be aware, however, that it is not a good idea to remove features only by correlation without understanding the removal process**. Features that have very high correlation (for example, Embarked.SUM(combine.Age) and Embarked.SUM(combine.Fare)) with significant difference between may require additional investigation, manual check is necessary. this is just showing
how it could be done.

### Feature selection via L1 norm regularization (Lasso)

In [39]:
features_positive = features_filtered.loc[:, features_filtered.ge(0).all()]

train_X = features_positive[:train_df.shape[0]]
train_y = train_df['Survived']

test_X = features_positive[train_df.shape[0]:]

#l1 reg could be used as a feature selection tool (lasso, l1 regularization)

In [40]:
lsvc = LinearSVC(C=0.01, penalty="l1", dual=False).fit(train_X, train_y)
model = SelectFromModel(lsvc, prefit=True)
X_new = model.transform(train_X)
X_selected_df = pd.DataFrame(X_new,
                             columns=[train_X.columns[i] for i in range(len(train_X.columns)) if model.get_support()[i]])
X_selected_df.shape

(891, 18)

In [41]:
X_selected_df.columns

Index(['Age', 'Fare', 'Embarked.STD(combine.Fare)', 'Sex.MAX(combine.Fare)',
       'Title.SUM(combine.Fare)', 'Title.MAX(combine.Fare)',
       'Title.MEAN(combine.Age)', 'Title.MEAN(combine.Fare)',
       'Pclass.SUM(combine.Age)', 'Pclass.SUM(combine.Fare)',
       'Parch.MAX(combine.Fare)', 'Parch.MIN(combine.Age)',
       'Parch.MEAN(combine.Fare)', 'SibSp.SUM(combine.Age)',
       'SibSp.SUM(combine.Fare)', 'SibSp.STD(combine.Fare)',
       'SibSp.MAX(combine.Age)', 'SibSp.MIN(combine.Fare)'],
      dtype='object')

## Train/Test

There are couple steps i skipped such as crossval, or learning curve check for optimal C and # of obs for best tr sample size

In [42]:
random_forest = RandomForestClassifier(n_estimators=2000,oob_score=True)
random_forest.fit(X_selected_df, train_y)

RandomForestClassifier(n_estimators=2000, oob_score=True)

In [43]:
X_selected_df.shape

(891, 18)

In [44]:
Y_pred = random_forest.predict(test_X[X_selected_df.columns])

In [45]:
print(Y_pred)

[0 0 0 1 0 0 0 0 1 0 0 0 1 0 1 1 0 1 0 1 1 1 1 1 1 0 1 1 1 0 0 0 1 0 1 1 0
 0 0 1 0 1 0 1 1 0 1 0 1 1 1 0 1 1 0 0 0 0 0 1 0 0 0 1 1 1 1 0 0 1 1 0 0 0
 1 0 0 1 0 1 1 0 0 0 0 0 1 0 1 1 1 0 1 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0
 1 1 1 1 0 0 1 0 1 1 0 1 0 0 1 0 1 1 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0
 1 0 1 0 0 1 0 0 1 0 1 1 1 1 1 0 0 0 0 0 1 0 0 0 0 0 0 1 1 1 1 1 0 0 1 0 1
 0 1 0 0 0 0 0 1 0 1 0 1 0 0 0 1 1 0 1 0 0 0 0 1 0 0 0 0 0 1 0 1 0 1 0 1 0
 1 1 1 0 0 1 0 0 0 1 0 0 1 0 1 1 1 1 1 1 0 0 1 0 1 0 1 0 1 0 0 0 0 0 0 0 1
 0 0 0 1 1 0 0 0 0 0 0 0 0 1 1 0 1 0 0 0 0 0 1 1 0 1 0 0 0 0 0 0 0 0 1 0 0
 1 0 0 0 0 0 0 0 1 1 1 1 0 0 0 1 0 0 1 1 0 0 0 0 0 0 0 1 1 0 1 0 0 0 1 1 0
 0 0 0 1 0 0 1 0 0 0 1 1 1 0 1 0 1 1 0 0 0 1 0 1 0 0 0 0 1 1 1 1 0 0 0 1 0
 0 1 0 1 1 1 0 0 0 0 0 0 1 1 1 1 0 0 0 0 0 1 1 0 0 1 0 1 0 0 1 0 1 0 1 0 0
 0 1 1 1 1 0 0 1 0 0 1]


In [51]:
test_target

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [54]:
from sklearn.metrics import accuracy_score
print(accuracy_score(Y_pred,test_target["Survived"]))

#it was 0.74 with manually created features w basic rf. 0.05 increase, thx to featuretools.

0.7990430622009569


Combine outputs of "featuretools" w the human domain knowledge, and use crossval in order to analyze learning curves and pick up the most efficient model.