In [28]:
import pandas as pd
import numpy as np
df_train = pd.read_csv('D:/db/titanic/train.csv')
df_test = pd.read_csv('D:/db/titanic/test.csv')

In [160]:
df_train.sample(10)
df_train.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex              int64
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked         int64
Title            int64
Family.Size      int64
Single           int64
fare_log       float64
dtype: object

In [67]:
df_train.apply(lambda x: sum(x.isnull()),axis=0)

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
Title            0
dtype: int64

In [173]:
def fill_missing_age(dataset):
    for i in range(1,4):
        median_age=dataset[dataset["Pclass"]==i]["Age"].median()
        dataset["Age"]=dataset["Age"].fillna(median_age)
        return dataset


In [174]:
def feature_engineering(df):
    df['Title'] = df['Name'].map(lambda x: x.split(',')[1].split('.')[0].lower().strip() )
    popular_titles = ["mr", "miss", "mrs", "master", "dr", "rev"]
    df['Title'] = df['Title'].map(lambda x: x if x in popular_titles else "other")
    df['Title'] = pd.factorize( df['Title'] )[0]
    

   # missing_ages = df.groupby(['Title','Sex'])['Age'].agg([np.mean, np.median]).to_dict()['median']
   # df['Age'] = df.apply( lambda x: x['Age'] if str(x['Age']) != 'nan' else missing_ages[x['Title']], axis=1 )
    df = fill_missing_age(df)


    df['Family.Size']=df['Parch']+df['SibSp']+1

    df["Single"] = df["Family.Size"].apply(lambda x: 1 if x == 1 else 0)
    
    missing_fare = df.groupby('Pclass')['Fare'].agg([np.mean, np.median]).to_dict()['median']
    df['Fare'] = df.apply( lambda x: x['Fare'] if str(x['Fare']) != 'nan' else missing_fare[x['Pclass']], axis=1 )
    df['fare_log'] = np.log2( df['Fare'] + 1 )
    
    df['Sex'] = pd.factorize( df['Sex'] )[0]

    #df['Cabin'] =  df['Cabin'].map(lambda x: 'missing' if str(x) == 'nan' else x[0] )
    
    df['Embarked'] = pd.factorize( df['Embarked'] )[0]
    
    
    Col_To_Drop =['PassengerId','Name','Ticket','Cabin']  
    for col in Col_To_Drop:
        df = df.drop(col,1)


    return df

In [175]:
print(df_train.shape)
df_train.Age.describe()

(891, 16)


count    891.000000
mean      29.650789
std       13.782236
min        0.420000
25%       21.000000
50%       28.000000
75%       38.000000
max       80.000000
Name: Age, dtype: float64

In [176]:
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier 
import random
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn import datasets, svm, cross_validation, tree, preprocessing, metrics
import sklearn.ensemble as ske
import tensorflow as tf
import tensorflow.contrib.learn as skflow
from sklearn.preprocessing import LabelEncoder

tf.set_random_seed(1)

#import xgboost as xgb

import matplotlib.pyplot as plt
%matplotlib inline

In [177]:
#df_train['Fare'].hist(bins=100);

In [178]:
#df_test['Fare'].hist(bins=100);

In [179]:
#class_sex_grouping = df_train.groupby(['Pclass','Sex']).mean()
#class_sex_grouping

In [180]:
#df_train.sample(10)

In [181]:
def get_feats(df_train):
    feats = df_train.select_dtypes(include=[np.int, np.int64, np.float]).columns.values
    black_list = ['PassengerId', 'Survived', 'Ticket','Cabin']
    
    return [feat for feat in feats if feat not in black_list]

In [182]:
def get_models():
    return [
        ('lr', LogisticRegression()),
        ('dt', DecisionTreeClassifier()),
        ('rf', RandomForestClassifier()),
        ('et', ExtraTreesClassifier()),
    ]

In [183]:
X = df_train[ get_feats(df_train) ].values
y = df_train[ 'Survived' ].values

In [184]:
train = feature_engineering(df_train)


In [185]:
train.sample(20)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title,Family.Size,Single,fare_log
765,1,1,1,51.0,1,0,77.9583,0,1,2,0,6.303019
880,1,2,1,25.0,0,1,26.0,0,1,2,0,4.754888
140,0,3,1,29.690958,0,2,15.2458,1,1,3,0,4.021995
713,0,3,0,29.0,0,0,9.4833,0,0,1,1,3.390021
403,0,3,0,28.0,1,0,15.85,0,0,2,0,4.074677
465,0,3,0,38.0,0,0,7.05,0,0,1,1,3.008989
95,0,3,0,30.725036,0,0,8.05,0,0,1,1,3.177918
828,1,3,0,44.470024,0,0,7.75,2,0,1,1,3.129283
852,0,3,1,9.0,1,1,15.2458,1,2,3,0,4.021995
845,0,3,0,42.0,0,0,7.55,0,0,1,1,3.095924


In [186]:
get_feats(df_train)

['Pclass',
 'Sex',
 'Age',
 'SibSp',
 'Parch',
 'Fare',
 'Embarked',
 'Title',
 'Family.Size',
 'Single',
 'fare_log']

In [187]:
def make_prediction(df_train, df_test, model, output_file_name):
    train = feature_engineering(df_train)
    test = feature_engineering(df_test)

    feats = get_feats(train)

    X_train = train[feats].values
    y_train = train['Survived'].values
    X_test = test[feats]

    print(model)
    model.fit(X_train, y_train)

    test['Survived'] = model.predict(X_test)
    test[ ['PassengerId', 'Survived'] ].to_csv('D:/db/titanic/' + output_file_name, index=False)

In [188]:
make_prediction(df_train, df_test, LogisticRegression(), 'linear.csv')

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)


KeyError: "['PassengerId'] not in index"

In [None]:
#df_test.isnull().any()

In [189]:
#df_train.isnull().any()

In [190]:
model = RandomForestClassifier(max_depth-4, n_estimators=20, min_samples_leaf=8, random_state=2018)

NameError: name 'max_depth' is not defined

In [191]:
make_prediction(df_train, df_test, model, 'rf_md4_ne20_sl8_fe.csv')

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=10,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')


KeyError: "['PassengerId'] not in index"

In [192]:
model = tree.DecisionTreeClassifier(max_depth=10)


In [193]:
make_prediction(df_train, df_test, model, 'decisiontree.csv')

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=10,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')


KeyError: "['PassengerId'] not in index"

In [194]:
model = ske.GradientBoostingClassifier(n_estimators=50)


In [195]:
make_prediction(df_train, df_test, model, 'gradientboost.csv')

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=50,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False)


KeyError: "['PassengerId'] not in index"

In [196]:
#categorical_column = tf.feature_column.categorical_column_with_vocabulary_list(key="Sex", vocabulary_list=["male", "female"], default_value=0)



In [197]:
train_data = feature_engineering(df_train)
test_data = feature_engineering(df_test)
test_data =test_data.drop('Survived',1)

ValueError: labels ['Survived'] not contained in axis

In [198]:
#Columns we want to turn into features for tensorflow
Columns = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Single','Family.Size','Embarked', 'fare_log']

#Column with who survive
Label = "Survived"

In [199]:
def input_fn(data): 
    # goes through each column/feature we specified and saved the corrisponding data into a dictionary
    #where the features are the key and the variables are the combination of all the columns in the pandas DataFrame
    Continuous_cols = {k: tf.constant(data[k].values) for k in Columns}
    
    # the information for each person who survived or died
    label = tf.constant(data[Label].values)
    return dict(Continuous_cols.items()),label

In [200]:
def eval_input_fn(features, labels, batch_size):
    """An input function for evaluation or prediction"""
    features=dict(features)
    if labels is None:
        inputs = features
    else:
        inputs = (features, labels)

    dataset = tf.data.Dataset.from_tensor_slices(inputs)
    
    assert batch_size is not None, "batch_size must not be None"
    dataset = dataset.batch(batch_size)

    return dataset

In [201]:
# creates a deep neural network and trains
# on data then tests the accuracy

def DNN(train_data,test_data):
    features = []
    # holds the features in the tensorflow dataset for analysis
    for c in Columns:
        features.append(tf.contrib.layers.real_valued_column(str(c)))
   
    # creates a neural network 
    classifier = tf.contrib.learn.DNNClassifier(feature_columns=features,
                                                hidden_units =[10,10,20],
                                               n_classes=2)
    
    
    
    # train the neural network with the training data (6000 steps)
    classifier.fit(input_fn=lambda: input_fn(train_data), steps=6000)
    
    # test the data on the rest of the training input (100 steps)
    accuracy = classifier.evaluate(input_fn=lambda: input_fn(train_data), steps=100)
    
    
    #print out the accuracy
    for key in sorted(accuracy):
       print("%s: %s" % (key, accuracy[key]*100))
    print('done')
    
    
    batch_size = 100
    predictions = classifier.predict(input_fn=lambda:eval_input_fn(test_data,labels=None,batch_size=batch_size))
    
    results = list(predictions)


    for i in range(0,10):    
        print (x(results,i))

In [202]:
DNN(train_data, test_data)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_task_type': None, '_task_id': 0, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x00000208FD386588>, '_master': '', '_num_ps_replicas': 0, '_num_worker_replicas': 0, '_environment': 'local', '_is_chief': True, '_evaluation_master': '', '_tf_config': gpu_options {
  per_process_gpu_memory_fraction: 1
}
, '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_secs': 600, '_session_config': None, '_save_checkpoints_steps': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_model_dir': 'C:\\Users\\ALEKSA~1\\AppData\\Local\\Temp\\tmpt3pad424'}
Instructions for updating:
Please switch to tf.summary.scalar. Note that tf.summary.scalar uses the node name instead of the tag. This means that TensorFlow will automatically de-duplicate summary names based on the scope they are created in. Also, passing a tensor or list of tags to a scalar summary op is no 

INFO:tensorflow:global_step/sec: 1031.52
INFO:tensorflow:loss = 0.378667, step = 4901 (0.097 sec)
INFO:tensorflow:global_step/sec: 1064.43
INFO:tensorflow:loss = 0.378698, step = 5001 (0.094 sec)
INFO:tensorflow:global_step/sec: 1021
INFO:tensorflow:loss = 0.378379, step = 5101 (0.097 sec)
INFO:tensorflow:global_step/sec: 1087.58
INFO:tensorflow:loss = 0.378385, step = 5201 (0.092 sec)
INFO:tensorflow:global_step/sec: 1053.24
INFO:tensorflow:loss = 0.37839, step = 5301 (0.096 sec)
INFO:tensorflow:global_step/sec: 1099.54
INFO:tensorflow:loss = 0.377854, step = 5401 (0.091 sec)
INFO:tensorflow:global_step/sec: 1099.53
INFO:tensorflow:loss = 0.378056, step = 5501 (0.090 sec)
INFO:tensorflow:global_step/sec: 1099.53
INFO:tensorflow:loss = 0.377806, step = 5601 (0.092 sec)
INFO:tensorflow:global_step/sec: 1031.52
INFO:tensorflow:loss = 0.377744, step = 5701 (0.097 sec)
INFO:tensorflow:global_step/sec: 1087.58
INFO:tensorflow:loss = 0.377364, step = 5801 (0.091 sec)
INFO:tensorflow:global_s

AttributeError: module 'tensorflow' has no attribute 'data'