In [2]:
from keras.optimizers import SGD
from keras.preprocessing.text import one_hot,text_to_word_sequence
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM, GRU
from keras.preprocessing import sequence
from keras.callbacks import EarlyStopping

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [3]:
from sklearn.cross_validation import train_test_split



In [4]:
import os
import pickle
import numpy as np
import re

In [5]:
import pandas as pd

In [6]:
DATA_DIRECTORY = os.path.join('deep/data')

In [7]:
print DATA_DIRECTORY

deep/data


In [8]:
with open(os.path.join(DATA_DIRECTORY,"male_blog_list.txt"),"rb") as male_file:
    male_posts= pickle.load(male_file)
with open(os.path.join(DATA_DIRECTORY,"female_blog_list.txt"),"rb") as female_file:
    female_posts = pickle.load(female_file)

In [9]:
filtered_male_posts = []
filtered_female_posts = []

for post_male in male_posts:
    if len(post_male) == 0:
        continue
    post_male = re.sub('\\n','',post_male)
    filtered_male_posts.append(post_male)

for post_female in female_posts:
    if len(post_female) == 0:
        continue
    post_female = re.sub('\\n','',post_female)
    filtered_female_posts.append(post_female)

In [10]:
all_posts = []

In [11]:
all_posts.extend(filtered_male_posts)
all_posts.extend(filtered_female_posts)

In [12]:
type(all_posts)

list

In [13]:
all_posts

["Yes i survived not eating for 24 hours, I am glad I don't live in a 3rd world country. It was fun, got to chill with folks don't get to hang out with since the start of the school year. It was fun.",
 "i'm gonna work on my around the world today and hopefully get half of the paper done. Then i am going to the 20 hour famine at my church.",
 "i promise that we won't have school tomorrow, which gives me time to work on around the world.   Everyone needs to watch the oc and chappelle's show.",
 'Just got out of school, hopefully there will be no school tomorrow so i can work on my paper and go to the famine.',
 "Hopefully it will snow. Thanks to everyone who signed up for the message board. Well i have to go to school, who knows maybe we'll get out early.",
 'got the board up, now i got to see if people will sign up.',
 "I got cut, oh well guess I'm just gonna go get a job.",
 'I am done with my chem project, first to turn in the project also, now i have to work on around the world, pla

In [12]:
len(all_posts),len(filtered_male_posts),len(filtered_female_posts)

(4842, 2595, 2247)

In [14]:
concatenate_array_rnn = np.concatenate((np.zeros(len(filtered_male_posts)),np.ones(len(filtered_female_posts))))
print concatenate_array_rnn

[0. 0. 0. ... 1. 1. 1.]


In [14]:
char_list = list(set(''.join(all_posts)))

In [15]:
char_indices = dict((c, i) for i, c in enumerate(char_list))
indices_char = dict((i, c) for i, c in enumerate(char_list))

In [16]:
label_indices = {'male':0,'female':1}
indices_label = {0:'male',1:'female'}

In [17]:
MAX_LENGTH = 0
i = 0
MAX_INDEX = 0
for i,n in enumerate(all_posts):
    
    if len(n) > MAX_LENGTH:
        MAX_LENGTH = len(n)
        MAX_INDEX = i
        
print(MAX_LENGTH,MAX_INDEX)

(38794, 227)


In [18]:
MAX_LENGTH = 5000

In [19]:
def blog_to_char_seq(blog):
    blog_chars = list(blog)
    blog_chars_indices = list(map(lambda char: char_indices[char], blog_chars))
    return sequence.pad_sequences([blog_chars_indices], maxlen=MAX_LENGTH)[0]

In [20]:
X = []
y = []

for n, l in zip(all_posts, concatenate_array_rnn):
    X.append(blog_to_char_seq(n))
    y.append(l)
    
X = np.array(X).astype(np.uint8)
y = np.array(y)

print(X.shape, y.shape)

((4842, 5000), (4842,))


In [21]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)

In [22]:
model = Sequential()
model.add(Embedding(len(char_list), 32, input_length=MAX_LENGTH, mask_zero=True))
model.add(LSTM(32, return_sequences=False))
model.add(Dropout(0.1))
model.add(Dense(1))
model.add(Activation('sigmoid'))
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 5000, 32)          5440      
_________________________________________________________________
lstm_1 (LSTM)                (None, 32)                8320      
_________________________________________________________________
dropout_1 (Dropout)          (None, 32)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 33        
_________________________________________________________________
activation_1 (Activation)    (None, 1)                 0         
Total params: 13,793
Trainable params: 13,793
Non-trainable params: 0
_________________________________________________________________


In [23]:
model.compile(loss='binary_crossentropy',optimizer='adagrad', metrics=["accuracy"])

In [24]:
model.fit(X_train,y_train,
          batch_size=32,epochs=10,
          validation_split=0.1,
          verbose=1)



Train on 3921 samples, validate on 436 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fbebb637c10>

In [25]:
model.evaluate(X_test,y_test,batch_size=32)



[0.6729200959205628, 0.5670103094012466]

In [26]:
predicted_output = model.predict(X_test,batch_size=32)
predicted_classes = model.predict_classes(X_test, batch_size=32)



In [27]:
df = pd.DataFrame(columns=['predicted','actual'])

In [28]:
df['predicted_class'] = predicted_classes.flatten()
df['predicted'] = predicted_output.flatten()

In [29]:
df['actual'] = y_test

In [31]:
result = df['actual'] - df['predicted_class']

In [30]:
print df

     predicted  actual  predicted_class
0     0.344257     1.0                0
1     0.552605     1.0                1
2     0.463881     0.0                0
3     0.792823     1.0                1
4     0.705871     1.0                1
5     0.297561     1.0                0
6     0.364133     1.0                0
7     0.494389     1.0                0
8     0.429164     1.0                0
9     0.403069     1.0                0
10    0.370316     1.0                0
11    0.350583     0.0                0
12    0.396930     1.0                0
13    0.619042     1.0                1
14    0.468343     0.0                0
15    0.506175     1.0                1
16    0.289611     0.0                0
17    0.416818     1.0                0
18    0.419328     0.0                0
19    0.376031     0.0                0
20    0.409135     1.0                0
21    0.346880     0.0                0
22    0.439552     0.0                0
23    0.577403     1.0                1
