In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np

In [2]:
data_file = 'binary.csv'

admissions = pd.read_csv(data_file)

In [3]:
admissions.head()

Unnamed: 0,admit,gre,gpa,rank
0,0,380,3.61,3
1,1,660,3.67,3
2,1,800,4.0,1
3,1,640,3.19,4
4,0,520,2.93,4


As we can see in above data that rank of college is categorical data. We need to create dummy columns with 1 hot encoding of rank.

In [4]:
dummy_ranks = pd.get_dummies(admissions['rank'], prefix = 'ranks')
data = pd.concat([admissions, dummy_ranks], axis = 1)
data.head()

Unnamed: 0,admit,gre,gpa,rank,ranks_1,ranks_2,ranks_3,ranks_4
0,0,380,3.61,3,0,0,1,0
1,1,660,3.67,3,0,0,1,0
2,1,800,4.0,1,1,0,0,0
3,1,640,3.19,4,0,0,0,1
4,0,520,2.93,4,0,0,0,1


As we can see we have now added 1 hot encoded colums of ranks in data. Lets remove the rank columun from data as it is of no use anymore.

In [5]:
data = data.drop('rank', axis = 1)
data.head()

Unnamed: 0,admit,gre,gpa,ranks_1,ranks_2,ranks_3,ranks_4
0,0,380,3.61,0,0,1,0
1,1,660,3.67,0,0,1,0
2,1,800,4.0,1,0,0,0
3,1,640,3.19,0,0,0,1
4,0,520,2.93,0,0,0,1


We will now standardize the __gre__ and__gpa__ columns so that they have zero mean and standard deviation of 1.

In [6]:
for field in ['gre', 'gpa']:
    mean, std = data[field].mean(), data[field].std()
    data.loc[:, field] = (data[field] - mean)/std

data.head()

Unnamed: 0,admit,gre,gpa,ranks_1,ranks_2,ranks_3,ranks_4
0,0,-1.798011,0.578348,0,0,1,0
1,1,0.625884,0.736008,0,0,1,0
2,1,1.837832,1.603135,1,0,0,0
3,1,0.452749,-0.525269,0,0,0,1
4,0,-0.586063,-1.208461,0,0,0,1


Lets split the data in training and testing datasets.

In [7]:
np.random.seed(42)
sample = np.random.choice(data.index, size=int(len(data)*0.9), replace=False)
data, test_data = data.ix[sample], data.drop(sample)

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  This is separate from the ipykernel package so we can avoid doing imports until


In [8]:
features, targets = data.drop('admit', axis=1), data['admit']
features_test, targets_test = test_data.drop('admit', axis=1), test_data['admit']

In [9]:
targets = np.reshape(targets, (targets.shape[0], 1))
targets_test = np.reshape(targets_test, (targets_test.shape[0], 1))

  return getattr(obj, method)(*args, **kwds)


In [11]:
tf.reset_default_graph()

n_epochs = 1000
learning_rate = 0.5
display_rate = 10

n_records, n_features = features.shape

x = tf.placeholder(tf.float32, shape = (None, n_features), name = 'x')
y = tf.placeholder(tf.float32, shape = (None, 1), name = 'y')

W = tf.Variable(tf.truncated_normal(shape = [n_features, 1]), name = 'W')

pred = tf.sigmoid(tf.matmul(x, W), name = 'pred')

targets_diff = pred - y

cost =  tf.reduce_mean(tf.square(targets_diff))
tf.summary.scalar('cost', cost)
optimizer = tf.train.GradientDescentOptimizer(learning_rate)
train_op = optimizer.minimize(cost)

init = tf.global_variables_initializer()

summary_op = tf.summary.merge_all()

with tf.Session() as sess:
    sess.run(init)
    writer = tf.summary.FileWriter('./graphs', sess.graph)
    feed_dict = {x: features, y: targets}
    for i in range(n_epochs):
        sess.run(train_op, feed_dict = feed_dict)
        if i % display_rate == 0:
            c_cost, summary_result = sess.run([cost, summary_op], feed_dict = feed_dict)
            writer.add_summary(summary_result, i)
            print("Current Cost is {0}".format(c_cost))
            
    target_predictions = sess.run(pred, feed_dict = {x: features_test})
    accuracy = tf.reduce_mean(tf.cast(((target_predictions > 0.5) == targets_test), tf.float32))
    print("The accuracy is {0}".format(sess.run(accuracy)))
    


Current Cost is 0.3458428382873535
Current Cost is 0.33087387681007385
Current Cost is 0.3166440427303314
Current Cost is 0.30298545956611633
Current Cost is 0.2899320125579834
Current Cost is 0.2776043713092804
Current Cost is 0.2661137580871582
Current Cost is 0.255561500787735
Current Cost is 0.24604330956935883
Current Cost is 0.23762550950050354
Current Cost is 0.23033268749713898
Current Cost is 0.2241508960723877
Current Cost is 0.2190244495868683
Current Cost is 0.2148541510105133
Current Cost is 0.21150848269462585
Current Cost is 0.20884497463703156
Current Cost is 0.206728994846344
Current Cost is 0.20504476130008698
Current Cost is 0.20369777083396912
Current Cost is 0.2026139348745346
Current Cost is 0.20173576474189758
Current Cost is 0.20101897418498993
Current Cost is 0.20042991638183594
Current Cost is 0.19994235038757324
Current Cost is 0.19953608512878418
Current Cost is 0.1991954743862152
Current Cost is 0.1989082396030426
Current Cost is 0.1986645609140396
Current 