In [3]:
# #######################################################################
# Importing necessary libraries for creating model

from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation, Flatten
from keras.layers.convolutional import Convolution2D, MaxPooling2D
from sklearn.model_selection import train_test_split   # For test train data spliting
import pandas as pd
import numpy as np
import keras

In [5]:
# #######################################################################
# Download the dataset CSV from the web (only needed when no local copy exists)

# Imported inside this cell so it runs on a fresh kernel: the notebook's import
# cell does not bring in urllib, and without it the call below raises NameError.
import urllib.request

print('Beginning file download with urllib2...')

url = 'http://datax.kennesaw.edu/imdb_wiki/wiki5.csv'
# NOTE(review): the download is saved as wiki.csv, while the loading cell reads
# /home/sshuser2/imdb.csv -- confirm which file is the intended input.
# urlretrieve returns a (local_path, headers) tuple, which is the cell's output.
urllib.request.urlretrieve(url, '/home/sshuser2/wiki.csv')

Beginning file download with urllib2...


('/home/sshuser2/wiki.csv', <http.client.HTTPMessage at 0x7f22d389ee10>)

In [2]:
# Load the dataset table from the local CSV file into a DataFrame.
data = pd.read_csv(filepath_or_buffer='/home/sshuser2/imdb.csv')

In [3]:
# Preview the first five rows to check the column layout (labels, then px* pixels).
data.head(n=5)

Unnamed: 0.1,Unnamed: 0,gender,age,px0,px1,px2,px3,px4,px5,px6,...,px9990,px9991,px9992,px9993,px9994,px9995,px9996,px9997,px9998,px9999
0,0,0,49,16,15,17,18,15,15,15,...,11,23,193,132,11,5,12,12,14,14
1,1,0,81,47,48,49,47,49,51,51,...,33,33,33,33,33,33,33,33,33,32
2,2,0,81,19,19,19,19,19,19,19,...,8,8,8,7,6,5,8,4,227,217
3,3,0,80,0,0,0,0,0,0,0,...,36,59,39,19,9,27,25,24,24,24
4,4,1,37,184,191,199,201,195,197,198,...,230,228,226,226,224,221,217,217,211,205


In [4]:
# #######################################################################
# Separating image data and labels, then freeing the DataFrame to save memory

# Columns are selected by name rather than by position: the table preview shows
# leading index columns ('Unnamed: 0.1', 'Unnamed: 0') ahead of 'gender' and
# 'age', so fixed offsets such as iloc[:, 3:] / iloc[:, 1] break (or pick the
# wrong column) whenever the number of leading columns changes.
IMG_SIDE = 100  # each image is stored flattened as IMG_SIDE * IMG_SIDE px columns

# A label slice is inclusive on both ends ('px0' .. 'px9999'); the reshape fails
# loudly if it does not yield exactly IMG_SIDE**2 values per row.
last_px = 'px%d' % (IMG_SIDE * IMG_SIDE - 1)
df_x = data.loc[:, 'px0':last_px].values.reshape(len(data), IMG_SIDE, IMG_SIDE, 1)

# Class label to predict: the 'gender' column.
y = data['gender'].values

# The DataFrame is no longer needed once the arrays are extracted.
del data

In [5]:
# One-hot encode the integer labels: one output column per class.
n_classes = 2
df_y = keras.utils.to_categorical(y, num_classes=n_classes)

In [6]:
# Make sure both are NumPy arrays. np.asarray hands back the input unchanged
# when it already is an ndarray, whereas np.array would duplicate the whole
# (n_samples, 100, 100, 1) image tensor in memory for no benefit.
df_x = np.asarray(df_x)
df_y = np.asarray(df_y)

In [8]:
# Sanity check: expect (n_samples, 100, 100, 1).
np.shape(df_x)

(33147, 100, 100, 1)

In [7]:
# #######################################################################
# Split into a training set and a test set with a stratified 80/20 hold-out
# (a single split -- this is not k-fold cross validation)

# Stratifying on the class index keeps the class proportions equal in both
# partitions; without `stratify` the split is purely random. The class index is
# recovered from the one-hot labels so it always matches the array being split.
class_index = np.argmax(df_y, axis=1)
x_train, x_test, y_train, y_test = train_test_split(
    df_x,
    df_y,
    test_size=0.2,
    random_state=4,  # fixed seed so the split is reproducible
    stratify=class_index,
)

# Free the unsplit arrays; only the four partitions are used from here on.
del df_x
del df_y

print(x_train.shape)
print(x_test.shape)

(26517, 100, 100, 1)
(6630, 100, 100, 1)


In [9]:
# #############################################################################
# Creating a Convolutional Neural Network (CNN) MODEL

# Feature extractor: five (kernel-size-3 conv + ReLU -> 2x2 max-pool) stages.
# The filter count halves at every stage; only the first layer declares the
# (100, 100, 1) input shape.
conv_filters = (64, 32, 16, 8, 4)

model = Sequential()
for stage, n_filters in enumerate(conv_filters):
    first_layer_kwargs = {'input_shape': (100, 100, 1)} if stage == 0 else {}
    model.add(Convolution2D(n_filters, 3, data_format='channels_last',
                            activation='relu', **first_layer_kwargs))
    model.add(MaxPooling2D(pool_size=(2, 2)))

# Classifier head: flatten -> 100 sigmoid units -> dropout(0.5) -> 2-way softmax.
for head_layer in (
    Flatten(),
    Dense(100),
    Activation('sigmoid'),
    Dropout(0.5),
    Dense(2),
    Activation('softmax'),
):
    model.add(head_layer)

model.compile(loss='categorical_crossentropy', optimizer='adadelta', metrics=['accuracy'])

In [94]:
# Print each layer's output shape and parameter count to verify the architecture.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_11 (Conv2D)           (None, 98, 98, 64)        640       
_________________________________________________________________
max_pooling2d_11 (MaxPooling (None, 49, 49, 64)        0         
_________________________________________________________________
conv2d_12 (Conv2D)           (None, 47, 47, 32)        18464     
_________________________________________________________________
max_pooling2d_12 (MaxPooling (None, 23, 23, 32)        0         
_________________________________________________________________
conv2d_13 (Conv2D)           (None, 21, 21, 16)        4624      
_________________________________________________________________
max_pooling2d_13 (MaxPooling (None, 10, 10, 16)        0         
_________________________________________________________________
conv2d_14 (Conv2D)           (None, 8, 8, 8)           1160      
__________

In [10]:
# #############################################################################
# Train for 20 epochs, scoring the held-out test split after every epoch
# (a single fit with hold-out validation; no k-fold cross validation is run here)

v = model.fit(
    x_train,
    y_train,
    validation_data=(x_test, y_test),
    epochs=20,
    batch_size=200,
)

Instructions for updating:
Use tf.cast instead.
Train on 26517 samples, validate on 6630 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [11]:
# Final loss and accuracy on the held-out split, returned as [loss, accuracy].
model.evaluate(x=x_test, y=y_test)



[0.11390666331151897, 0.9633484162895928]

In [12]:
# #############################################################################
# Predicting gender and reporting the precision, recall F1-score

from sklearn.metrics import classification_report, confusion_matrix

# Convert the one-hot ground truth back to class indices once and reuse it.
y_true = np.argmax(y_test, axis=1)

# Sequential.predict_classes is deprecated and missing from newer
# TensorFlow/Keras releases. For a multi-unit softmax output it simply took the
# arg-max of the predicted probabilities, which is done explicitly here.
y_pred = np.argmax(model.predict(x_test), axis=-1)

# con_mat[i, j] counts samples of true class i predicted as class j.
con_mat = confusion_matrix(y_true, y_pred)
print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

           0       0.95      0.98      0.97      3805
           1       0.98      0.94      0.96      2825

    accuracy                           0.96      6630
   macro avg       0.97      0.96      0.96      6630
weighted avg       0.96      0.96      0.96      6630



In [16]:
# Overall accuracy: correctly classified samples (the confusion-matrix diagonal)
# divided by the total number of samples.
n_correct = con_mat.trace()
n_total = con_mat.sum()
accuracy = n_correct / n_total

print("Total accuracy is:")
print('{percent:.2%}'.format(percent=accuracy))

Total accuracy is:
96.33%


In [17]:
# #############################################################################
# Reporting a 90% confidence interval for the mean per-epoch training accuracy
# NOTE(review): the values are accuracies of successive epochs of one fit, not
# independent k-fold scores, so the interval only reflects epoch-to-epoch spread.

from scipy.stats import sem, t

# Keras stores the training accuracy under 'acc' in older releases and under
# 'accuracy' in newer ones; accept either key.
history = v.history
ac = history['acc'] if 'acc' in history else history['accuracy']

confidence = 0.90
n = len(ac)
m = np.mean(ac)  # numpy replaces the deprecated `from scipy import mean` alias
std_err = sem(ac)  # standard error of the mean
# Half-width of the two-sided Student-t interval with n - 1 degrees of freedom.
h = std_err * t.ppf((1 + confidence) / 2, n - 1)
print("%.3f%% (+/- %.3f%%)" % (m*100, h*100))

94.193% (+/- 2.899%)
