# Classification using Logistic Regression

## Import Native and Third Party Libraries

In [1]:
import os
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.cross_validation import train_test_split

import tensorflow as tf



## Read Dataset

In [2]:
os.chdir('/home/abhishek/Desktop/Projects/tf/yet_another_ML_tutorial/coding_exercise/')
raw_data = pd.read_csv("./CreditDataset.csv", header=None)
print "Shape of original data frame:", raw_data.shape

Shape of original data frame: (1000, 21)


In [3]:
# Get data types
print raw_data.dtypes
obj_df = raw_data.select_dtypes(include=['object']).copy()
print "Shape of object data frame:", obj_df.shape
int_df = raw_data.select_dtypes(include=['int64']).copy()
print "Shape of int64 data frame:", int_df.shape
print "Type of int data frame:", type(int_df)

# Check for null values in the columns containing categorical variables
print obj_df[obj_df.isnull().any(axis=1)]

0     object
1      int64
2     object
3     object
4      int64
5     object
6     object
7      int64
8     object
9     object
10     int64
11    object
12     int64
13    object
14    object
15     int64
16    object
17     int64
18    object
19    object
20     int64
dtype: object
Shape of object data frame: (1000, 13)
Shape of int64 data frame: (1000, 8)
Type of int data frame: <class 'pandas.core.frame.DataFrame'>
Empty DataFrame
Columns: [0, 2, 3, 5, 6, 8, 9, 11, 13, 14, 16, 18, 19]
Index: []


## Data Preprocessing
Please refer the notebook `Classification using Deep Neural Network`.

In [4]:
# One hot encoding of the columns containing categorical variables
# Label encoder
# 1. INSTANTIATE
# encode labels with value between 0 and n_classes-1.
le = preprocessing.LabelEncoder()
# FIT AND TRANSFORM. use df.apply() to apply le.fit_transform to all columns
le_obj_df = obj_df.apply(le.fit_transform)
# print raw_data.select_dtypes(include=['object']).head(5)
# print le_obj_df.head()

# One hot encoding of categorical variables
# 1. INSTANTIATE
encode_object = preprocessing.OneHotEncoder()
# 2. FIT
encode_object.fit(le_obj_df)
# 3. Transform
onehotlabels = encode_object.transform(le_obj_df).toarray()
print onehotlabels.shape
print type(onehotlabels)

# Merge the int64 data frame with the one hot labels
np_int_df = int_df.as_matrix()
print np_int_df.shape
processed_data = np.concatenate([onehotlabels, np_int_df], axis=1)
print processed_data.shape

# print processed_data[:,-1]
print processed_data.dtype

# One hot encoding of labels. Append the one hot labels in the preprocessed data after 
# removing the actual labels. This means that the preprocessed data would now have 63 
# columns. 
raw_labels = np.array(processed_data[:,-1]).astype(int)
encoded_labels = np.zeros((processed_data[:,-1].shape[0], 2))
encoded_labels[np.arange(processed_data[:,-1].shape[0]), raw_labels-1] = 1


processed_data = processed_data[:,0:61]
processed_data = np.concatenate([processed_data, encoded_labels], axis=1)
print processed_data.shape


(1000, 54)
<type 'numpy.ndarray'>
(1000, 8)
(1000, 62)
float64
(1000, 63)


## Test-Train Split

In [5]:
# Get test train split
X_train, X_test, y_train, y_test = train_test_split(processed_data[:, 0:61],
                                                    processed_data[:, 61:63],
                                                    test_size=0.3,
                                                    random_state=42)


In [6]:
print X_train.shape, X_test.shape

(700, 61) (300, 61)


In [7]:
print y_train.shape, y_test.shape

(700, 2) (300, 2)


## Mini-batch Creation

In [8]:
def random_batch(dataset, batch_size):
    sample = dataset[np.random.choice(dataset.shape[0], batch_size, replace=False),:]
    last_col_index = dataset.shape[1]-2
    x = sample[:,0:last_col_index]
    y = sample[:,last_col_index:last_col_index+2]
    return (x, y)

## Logistic Regression

In [9]:
# Parameters
training_epochs = 10000
learning_rate = 0.01
batch_size = 100
display_step = 100
false_neg_cost = 5
weighted_cost = True

# Network Parameters
num_input = X_train.shape[1] 
num_classes = y_train.shape[1]

print "Number of input:", num_input
print "Number of classes:", num_classes

# tf Graph input
x = tf.placeholder(tf.float32, [None, num_input])
y = tf.placeholder(tf.float32, [None, num_classes])

Number of input: 61
Number of classes: 2


In [10]:
# Set model weights
W = tf.Variable(tf.random_normal([num_input, num_classes]))
b = tf.Variable(tf.random_normal([num_classes]))

In [11]:
# Construct model
pred = tf.nn.softmax(tf.matmul(x, W) + b) # Softmax

if weighted_cost:
    cost = tf.reduce_mean(tf.nn.weighted_cross_entropy_with_logits(
              targets=y, logits=pred, pos_weight=false_neg_cost))
else:
    # Minimize error using cross entropy
    cost = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(
                logits=pred, labels=y))

    # cost = tf.reduce_mean(-tf.reduce_sum(y*tf.log(pred + 1e-5), reduction_indices=1))
# Gradient Descent
optimizer = tf.train.AdamOptimizer(learning_rate).minimize(cost)

# Initialize the variables (i.e. assign their default value)
init = tf.global_variables_initializer()

In [12]:
# Start training
with tf.Session() as sess:
    sess.run(init)

    # Training cycle
    for epoch in range(training_epochs):
        avg_cost = 0.
        total_batch = int(X_train.shape[0]/batch_size)
        # Loop over all batches
        for i in range(total_batch):
            batch_xs, batch_ys = random_batch(np.concatenate([X_train, y_train], axis=1),
                                              batch_size)
            # Fit training using batch data
            _, c = sess.run([optimizer, cost], feed_dict={x: batch_xs,
                                                          y: batch_ys})
            # Compute average loss
            avg_cost += c / total_batch
        # Display logs per epoch step
        if (epoch+1) % display_step == 0:
            print "Epoch:", '%04d' % (epoch+1), "cost=", "{:.9f}".format(avg_cost)

    print "Optimization Finished!"

    # Test model
    correct_prediction = tf.equal(tf.argmax(pred, 1), tf.argmax(y, 1))
    # Calculate accuracy for test data
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
    print "Accuracy:", accuracy.eval({x: X_test, y: y_test})

Epoch: 0100 cost= 0.666775908
Epoch: 0200 cost= 0.648204463
Epoch: 0300 cost= 0.645347340
Epoch: 0400 cost= 0.651775888
Epoch: 0500 cost= 0.662490189
Epoch: 0600 cost= 0.648918748
Epoch: 0700 cost= 0.653918760
Epoch: 0800 cost= 0.646061599
Epoch: 0900 cost= 0.640347336
Epoch: 1000 cost= 0.659633049
Epoch: 1100 cost= 0.644633046
Epoch: 1200 cost= 0.649633058
Epoch: 1300 cost= 0.662490181
Epoch: 1400 cost= 0.652490207
Epoch: 1500 cost= 0.650347326
Epoch: 1600 cost= 0.631061605
Epoch: 1700 cost= 0.650347326
Epoch: 1800 cost= 0.661061619
Epoch: 1900 cost= 0.665347346
Epoch: 2000 cost= 0.655347339
Epoch: 2100 cost= 0.651775888
Epoch: 2200 cost= 0.657490160
Epoch: 2300 cost= 0.670347333
Epoch: 2400 cost= 0.648918756
Epoch: 2500 cost= 0.651775897
Epoch: 2600 cost= 0.669633048
Epoch: 2700 cost= 0.653918718
Epoch: 2800 cost= 0.636061592
Epoch: 2900 cost= 0.661061602
Epoch: 3000 cost= 0.657490185
Epoch: 3100 cost= 0.657490194
Epoch: 3200 cost= 0.654633062
Epoch: 3300 cost= 0.641775898
Epoch: 340

## Result
Accuracy of approximately `70%` was obtained in this case. 