In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np
import time
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import model_selection, preprocessing

In [2]:
# Load the YouTube dataset and build one free-text field per video.
data = pd.read_csv('data/data/Youtube_data.csv')
# Missing titles/descriptions become empty strings so a NaN in one column does
# not wipe out the other, and the space keeps the last word of the title from
# fusing with the first word of the description into a single token.
data['Text'] = (data['Title'].fillna('').astype(str)
                + ' '
                + data['Description'].fillna('').astype(str))
# Keep only the model input (Text) and the target (Category).
data = data[['Text','Category']]

In [3]:
# Bag-of-words features: one column per vocabulary token, values are counts.
bag_of_words = CountVectorizer()
# astype('U') coerces every entry to a unicode string before vectorizing.
# NOTE(review): the vocabulary is learned from all rows, including the ones that
# end up in the test split - confirm that is acceptable for this experiment.
txt_transform = bag_of_words.fit_transform(data['Text'].values.astype('U'))

# Fit the label encoder ONCE on every category so that train and test share the
# same class -> index mapping. Refitting on the test labels would renumber the
# classes whenever a class is missing from one of the splits.
encoder = preprocessing.LabelEncoder()
encoder.fit(data['Category'])
class_ids = np.arange(len(encoder.classes_))

trainX, testX, trainY, testY = model_selection.train_test_split(txt_transform, data['Category'])
# Dense matrices are needed because the sparse result cannot be fed to the graph.
trainX = trainX.todense()
testX = testX.todense()

# One-hot encode against the full class list: a Categorical with explicit
# categories makes get_dummies emit a column for every class, even ones that
# happen to be absent from a split, so both label matrices have equal width.
trainY = pd.get_dummies(pd.Series(pd.Categorical(encoder.transform(trainY), categories=class_ids)))
testY = pd.get_dummies(pd.Series(pd.Categorical(encoder.transform(testY), categories=class_ids)))

In [4]:
# Model dimensions come straight from the prepared training matrices:
# columns of trainX = input features, columns of trainY = output classes.
numFeatures, numLabels = trainX.shape[1], trainY.shape[1]

# Graph inputs. The leading None leaves the batch size open, so the same
# placeholders accept both the training and the test matrices.
X = tf.placeholder(dtype=tf.float32, shape=[None, numFeatures])
yGold = tf.placeholder(dtype=tf.float32, shape=[None, numLabels])

In [5]:
# Zero-initialised weight matrix and bias vector. The shapes are derived from
# the data (numFeatures x numLabels) rather than hard-coded, so they stay in
# sync with the vocabulary size and class count of whatever CSV was loaded.
# NOTE(review): W and b are not referenced by the other cells visible here.
W = tf.Variable(tf.zeros([numFeatures, numLabels]))
b = tf.Variable(tf.zeros([numLabels]))

In [6]:
# Trainable parameters, initialised by sampling a normal distribution with
# mean 0 and standard deviation .01.
# `name` is given to tf.Variable (not to the random_normal initializer op) so
# the variables themselves are labelled "weights"/"bias" in the graph.

weights = tf.Variable(tf.random_normal([numFeatures,numLabels],
                                       mean=0,
                                       stddev=0.01),
                      name="weights")

# Shape [1, numLabels] broadcasts across the batch dimension when added.
bias = tf.Variable(tf.random_normal([1,numLabels],
                                    mean=0,
                                    stddev=0.01),
                   name="bias")

In [7]:
# Logistic regression, sigmoid(X . weights + bias), written as three chained
# graph ops so that each intermediate result has its own named node.
apply_weights_OP = tf.matmul(a=X, b=weights, name="apply_weights")      # X . weights
add_bias_OP = tf.add(x=apply_weights_OP, y=bias, name="add_bias")       # ... + bias
activation_OP = tf.nn.sigmoid(add_bias_OP, name="activation")           # squash into (0, 1)

In [8]:
# Number of Epochs in our training
numEpochs = 700

# Step counter read by the decay schedule. A literal (global_step=1) would pin
# the schedule to a single point forever; a graph variable lets it move.
# The counter only advances when an optimizer's minimize() call is given this
# same global step.
global_step = tf.train.get_or_create_global_step()

# Learning rate with exponential decay:
#   0.0008 * 0.95 ** floor(global_step / decay_steps)   (staircase=True)
# NOTE(review): decay_steps is the number of training rows, while the training
# loop takes one full-batch step per epoch - so the rate only drops after that
# many epochs. Confirm this is the intended schedule.
learningRate = tf.train.exponential_decay(learning_rate=0.0008,
                                          global_step=global_step,
                                          decay_steps=trainX.shape[0],
                                          decay_rate=0.95,
                                          staircase=True)

In [9]:
# Cost function: half the SUM of squared errors between the activations and the
# one-hot labels (tf.nn.l2_loss computes sum(t ** 2) / 2 - it is not a mean).
cost_OP = tf.nn.l2_loss(activation_OP-yGold, name="squared_error_cost")

# Gradient descent. Passing the graph's global step makes every training step
# increment it, which is what step-based schedules such as
# tf.train.exponential_decay rely on to actually decay.
training_OP = tf.train.GradientDescentOptimizer(learningRate).minimize(
    cost_OP, global_step=tf.train.get_or_create_global_step())

In [10]:
# Op that assigns every graph variable its initial value.
init_OP = tf.global_variables_initializer()

# Open a session on the default graph and run the initializer once, so the
# variables hold concrete values before any training or evaluation step.
sess = tf.Session()
sess.run(init_OP)

In [11]:
# Per-row correctness check:
#   argmax(activation_OP, 1) is the label with the highest activation,
#   argmax(yGold, 1) is the position of the 1 in the one-hot true label.
correct_predictions_OP = tf.equal(tf.argmax(activation_OP,1),tf.argmax(yGold,1))

# Casting True/False to 1.0/0.0 makes the mean equal to the accuracy.
accuracy_OP = tf.reduce_mean(tf.cast(correct_predictions_OP, tf.float32))

# Summary op for regression output
activation_summary_OP = tf.summary.histogram("output", activation_OP)

# Summary op for accuracy
accuracy_summary_OP = tf.summary.scalar("accuracy", accuracy_OP)

# Summary op for cost
cost_summary_OP = tf.summary.scalar("cost", cost_OP)

# Summary ops to check how the weights and biases are updating after each
# iteration. The variables themselves are passed in: evaluating them here with
# .eval() would bake the *initial* values into the graph as constants, and the
# histograms would never change during training.
weightSummary = tf.summary.histogram("weights", weights)
biasSummary = tf.summary.histogram("biases", bias)

# Merge all summaries into a single op that can be run in one sess.run call
merged = tf.summary.merge([activation_summary_OP, accuracy_summary_OP, cost_summary_OP, weightSummary, biasSummary])

# Summary writer: event files (including the graph definition) go to ./summary_logs
writer = tf.summary.FileWriter("summary_logs", sess.graph)

In [12]:
# Initialize reporting variables
cost = 0             # cost seen at the previous report
diff = 1             # absolute change in cost between the last two reports
epoch_values = []    # epochs at which stats were recorded
accuracy_values = [] # training accuracy at those epochs
cost_values = []     # training cost at those epochs

# The whole training set is fed on every step (full-batch gradient descent).
train_feed = {X: trainX, yGold: trainY}

# Training epochs
for i in range(numEpochs):
    # Early stop: the cost moved by less than 1e-4 between the last two reports.
    # diff is only refreshed every 10th epoch, so this compares reports, not
    # consecutive epochs.
    if i > 1 and diff < .0001:
        print("change in cost %g; convergence."%diff)
        break

    # Run one training step (the op's return value carries no information)
    sess.run(training_OP, feed_dict=train_feed)

    # Report occasional stats
    if i % 10 == 0:
        # Add epoch to epoch_values
        epoch_values.append(i)
        # Accuracy and cost on the TRAINING data, plus the merged summaries so
        # the event file actually receives the curves and histograms.
        summary_results, train_accuracy, newCost = sess.run(
            [merged, accuracy_OP, cost_OP], feed_dict=train_feed)
        writer.add_summary(summary_results, i)
        # Add accuracy to live graphing variable
        accuracy_values.append(train_accuracy)
        # Add cost to live graphing variable
        cost_values.append(newCost)
        # Re-assign values for variables
        diff = abs(newCost - cost)
        cost = newCost

        #generate print statements
        print("step %d, training accuracy %g, cost %g, change in cost %g"%(i, train_accuracy, newCost, diff))

# Make sure buffered summaries reach disk before moving on.
writer.flush()

# How well do we perform on held-out test data?
print("final accuracy on test set: %s" %str(sess.run(accuracy_OP, 
                                                     feed_dict={X: testX, 
                                                                yGold: testY})))

step 0, training accuracy 0.671492, cost 1374.55, change in cost 1374.55
step 10, training accuracy 0.842782, cost 679.86, change in cost 694.687
step 20, training accuracy 0.841978, cost 531.356, change in cost 148.504
step 30, training accuracy 0.863691, cost 464.012, change in cost 67.3445
step 40, training accuracy 0.87696, cost 411.945, change in cost 52.0668
step 50, training accuracy 0.881785, cost 382.585, change in cost 29.36
step 60, training accuracy 0.88661, cost 355.9, change in cost 26.6855
step 70, training accuracy 0.89224, cost 336.82, change in cost 19.0798
step 80, training accuracy 0.897869, cost 318.143, change in cost 18.677
step 90, training accuracy 0.901086, cost 303.352, change in cost 14.7904
step 100, training accuracy 0.903498, cost 294.486, change in cost 8.8663
step 110, training accuracy 0.905911, cost 282.803, change in cost 11.6834
step 120, training accuracy 0.908323, cost 273.924, change in cost 8.87912
step 130, training accuracy 0.909932, cost 267.