In [1]:
#import 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf

In [2]:
#importing the training dataset
data=pd.read_csv('train.csv')


In [3]:
#Function to normalize the dataset
def feature_normalize(dataset):
    mu=np.mean(dataset,axis=0)
    sigma=np.std(dataset,axis=0)
    return (dataset-mu)/sigma

In [4]:
#Function to convert string type data to integers

def str_to_int(dataset):
    string_columns=dataset.select_dtypes(['object']).columns
    print(string_columns)
    
    for col in string_columns:
        dataset[col]=dataset.astype('category')
        
    categorical_columns=dataset.select_dtypes(['category']).columns
    dataset[categorical_columns]=dataset[categorical_columns].apply(lambda x:x.cat.codes)
    return dataset



In [5]:
def one_hot(df, cols):
    """
    @param df pandas DataFrame
    @param cols a list of columns to encode 
    @return a DataFrame with one-hot encoding
    """
    for each in cols:
        dummies = pd.get_dummies(df[each], prefix=each, drop_first=False)
        del df[each]
        df = pd.concat([df, dummies], axis=1)
    return df

In [9]:
#Delete the following columns
new_data=data.drop(labels=["Name","Ticket","Cabin","Parch"],axis=1)

In [6]:
#Preprocessing data

def preprocessing(df):
    
    #Replace the nan with mean value
    df["Age"]=df.fillna(df["Age"].mean(),inplace=True)
    
    #Onehot encoding Pcclass
    df=one_hot(df,df.loc[:,["Pclass"]].columns)
    
    #String to integer
    df=str_to_int(df)
    
    #age normalization
    df["Age"]=feature_normalize(df["Age"])
    
    #stats.describe(df).variance
    
    return df
    

In [7]:
from scipy import stats



In [10]:
df_train=preprocessing(new_data)


Index(['Sex', 'Age', 'Embarked'], dtype='object')


In [11]:
print(new_data.head(10))

   PassengerId  Survived     Sex   Age  SibSp     Fare Embarked
0            1         0    male  None      1   7.2500        S
1            2         1  female  None      1  71.2833        C
2            3         1  female  None      0   7.9250        S
3            4         1  female  None      1  53.1000        S
4            5         0    male  None      0   8.0500        S
5            6         0    male  None      0   8.4583        Q
6            7         0    male  None      0  51.8625        S
7            8         0    male  None      3  21.0750        S
8            9         1  female  None      0  11.1333        S
9           10         1  female  None      1  30.0708        C


In [12]:


features=df_train.iloc[:,2:7]
print(features.shape)

(891, 5)


In [13]:
features=df_train.iloc[:,2:].values
print(features.shape)

(891, 8)


In [14]:
labels=df_train.iloc[:,:1].values
print(labels.shape)

(891, 1)


In [15]:
#Working with test data
test_data=pd.read_csv("test.csv")

In [16]:
new_test_data=test_data.drop(labels=["Name","Ticket","Cabin","Parch"],axis=1)

In [17]:
test_features=preprocessing(new_test_data)

Index(['Sex', 'Age', 'Embarked'], dtype='object')


In [18]:
print(new_test_data.head(10))

   PassengerId     Sex   Age  SibSp     Fare Embarked
0          892    male  None      0   7.8292        Q
1          893  female  None      1   7.0000        S
2          894    male  None      0   9.6875        Q
3          895    male  None      0   8.6625        S
4          896  female  None      1  12.2875        S
5          897    male  None      0   9.2250        S
6          898  female  None      0   7.6292        Q
7          899    male  None      1  29.0000        S
8          900  female  None      0   7.2292        C
9          901    male  None      2  24.1500        S


In [19]:
#Encoding Sex data
new_test_data['Sex'] = [1 if item == 'male' else 0 for item in new_test_data['Sex']]

In [20]:
#one hot encoding Pclass
new_test_data=one_hot(new_test_data,new_test_data.loc[:,["Pclass"]].columns)

KeyError: "None of [['Pclass']] are in the [columns]"

In [21]:
Xtest=new_test_data.iloc[:,1:]

In [22]:
Xtest=Xtest.values

In [23]:
print(Xtest.shape)

(418, 5)


In [24]:
feature_count=features.shape[1]
label_count=labels.shape[1]
print(feature_count, label_count)

8 1


In [25]:
#defining the inputs 
epochs=2000
learning_rate=0.01
hidden_layers=feature_count-1
cost_history = np.empty(shape=[1],dtype=float)

X = tf.placeholder(tf.float32,[None,feature_count])
Y = tf.placeholder(tf.float32,[None,label_count])
hidden_layers = feature_count - 1

In [26]:
#defining the model

initializer=tf.contrib.layers.xavier_initializer()

h0=tf.layers.dense(X,hidden_layers,activation=tf.nn.relu,kernel_initializer=initializer) #defining the model

h1 = tf.layers.dense(h0, label_count, activation=None) #defining the output layer

cross_entropy=tf.nn.sigmoid_cross_entropy_with_logits(labels=Y, logits=h1)

cost=tf.reduce_mean(cross_entropy)

optimizer=tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost)

predicted=tf.nn.sigmoid(h1)
correct_pred=tf.equal(tf.round(predicted),Y)
accuracy=tf.reduce_mean(tf.cast(correct_pred,tf.float32))

In [27]:
#Running the Session

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    
    for step in range (epochs+1):
        sess.run(optimizer,feed_dict={X:features, Y:labels})
        
        loss,_,acc=sess.run([cost ,optimizer , accuracy], feed_dict={X:features, Y:labels})
        
        cost_history=np.append(cost_history,acc)
        if step%100==0:
            print("Step: {:5}\tLoss: {:.3f}\tAcc: {:.2%}".format(step, loss, acc))
                
    #print('Test Accuracy:', sess.run([accuracy, tf.round(predicted)], feed_dict={X: test_x, Y: test_y}))

Step:     0	Loss: 73618.055	Acc: 0.11%
Step:   100	Loss: -24497564.000	Acc: 0.11%
Step:   200	Loss: -110411000.000	Acc: 0.11%
Step:   300	Loss: -252314672.000	Acc: 0.11%
Step:   400	Loss: -445811840.000	Acc: 0.11%
Step:   500	Loss: -687261952.000	Acc: 0.11%
Step:   600	Loss: -973527232.000	Acc: 0.11%
Step:   700	Loss: -1301880320.000	Acc: 0.11%
Step:   800	Loss: -1669945984.000	Acc: 0.11%
Step:   900	Loss: -2075652992.000	Acc: 0.11%
Step:  1000	Loss: -2517200128.000	Acc: 0.11%
Step:  1100	Loss: -2993018112.000	Acc: 0.11%
Step:  1200	Loss: -3501743104.000	Acc: 0.11%
Step:  1300	Loss: -4042188032.000	Acc: 0.11%
Step:  1400	Loss: -4613324800.000	Acc: 0.11%
Step:  1500	Loss: -5214258688.000	Acc: 0.11%
Step:  1600	Loss: -5844214784.000	Acc: 0.11%
Step:  1700	Loss: -6502520320.000	Acc: 0.11%
Step:  1800	Loss: -7188591104.000	Acc: 0.11%
Step:  1900	Loss: -7901918208.000	Acc: 0.11%
Step:  2000	Loss: -8642062336.000	Acc: 0.11%
