In [1]:
import pandas as pd
import numpy as np
import gzip
import string
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import shuffle
from keras.models import Model
from keras.models import Sequential
from keras.layers import LSTM, Activation, Dense, Dropout, Input, Embedding, SpatialDropout1D
from keras.optimizers import RMSprop
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from keras.utils import to_categorical
from keras.callbacks import EarlyStopping
%matplotlib inline

Using TensorFlow backend.


In [2]:
def clean(questions):
    table = str.maketrans('', '', string.punctuation)
    for i,q in enumerate(questions):
        # tokenize
        x = q.split()
        # lower case
        x = [w.lower() for w  in x]
        # remove punctuation
        x = [w.translate(table) for w in x]
        # remove single-letter stuff
        x = [w for w in x if len(w) > 1]
        # remove numbers?
        x = [w for w in x if w.isalpha()]
        
        questions[i] = ' '.join(x)

In [3]:
fname = 'questions_train.xml'
columns = ['questionType','question']

In [4]:
def getdf(fname):
    columns = ['question','questionType']
    from bs4 import BeautifulSoup
    x=BeautifulSoup(open('../'+fname).read())
    x=x.xml.findAll("thread")
    df=pd.DataFrame(columns = columns)
    for each in x:
        qType = str(each.relquestion['relq_fact_label'])
        qp1 = str(each.relqsubject.text)
        qp2 = str(each.relqbody.text)
        q = qp1 + ' ' + qp2
        df.loc[len(df)] = [q, qType]
    return df
df = getdf(fname)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1118 entries, 0 to 1117
Data columns (total 2 columns):
question        1118 non-null object
questionType    1118 non-null object
dtypes: object(2)
memory usage: 26.2+ KB


In [6]:
df.questionType.value_counts()

Opinion        563
Factual        311
Socializing    244
Name: questionType, dtype: int64

#### Undersampling

In [7]:
df_fact = df[df['questionType'] == 'Factual']
df_opi = df[df['questionType'] == 'Opinion']
df_soc = df[df['questionType'] == 'Socializing']

In [8]:
df_fact_undersample = df_fact.sample(244)
df_opi_undersample = df_opi.sample(244)

In [9]:
df_final = pd.concat([df_opi_undersample, df_fact_undersample, df_soc])

In [10]:
df = df_final
df
df.questionType.value_counts()

Opinion        244
Socializing    244
Factual        244
Name: questionType, dtype: int64

In [11]:
df = shuffle(df)

In [12]:
questions = df['question'].values.tolist()
clean(questions)
X = pd.Series(questions)
Y = df.questionType

In [13]:
le = LabelEncoder()
Y = le.fit_transform(Y)
print(Y.shape)
# Y = Y.reshape(-1,1)
# print(Y.shape)
# print(Y)
Y = to_categorical(Y, num_classes=3)
print(Y.shape)
print(Y)
le.inverse_transform([0,1,2])

(732,)
(732, 3)
[[0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 ...
 [0. 0. 1.]
 [1. 0. 0.]
 [1. 0. 0.]]


array(['Factual', 'Opinion', 'Socializing'], dtype=object)

In [14]:
max_words = 1000#20000
max_len = 100#len(max(X))//2
tok = Tokenizer(num_words=max_words, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', lower=True)
tok.fit_on_texts(X)

sequences = tok.texts_to_sequences(X)
sequences_matrix = sequence.pad_sequences(sequences,maxlen=max_len)

In [15]:
# X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.15, random_state=42)
X_train,X_test,Y_train,Y_test = train_test_split(sequences_matrix,Y,test_size=0.15)#, random_state=42)

Processing the data
- Tokenize and convert text to seq
- add padding all seq has same shape
- choose arbitrary max_len. many ways exist

In [16]:
# word_index = tok.word_index
# print('Found %s unique tokens.' % len(word_index))

In [17]:
print(X.shape)
print(Y.shape)

(732,)
(732, 3)


In [18]:
def RNN():    
    inputs = Input(shape=[max_len])
    layer = Embedding(max_words,50, input_length=max_len)(inputs)
    layer = LSTM(64)(layer)
    layer = Dense(128, activation='relu')(layer)
#     layer = Activation('relu')(layer)
    layer = Dropout(0.4)(layer)
    layer = Dense(3, activation='softmax')(layer)
    layer = Activation('softmax')(layer)

    model = Model(inputs=inputs,outputs=layer)
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

In [19]:
model = RNN()
model.summary()
print((X_train.shape, Y_train.shape, X_test.shape, Y_test.shape))
model.fit(X_train,Y_train,batch_size=128,epochs=30,
          validation_split=0.15)#, callbacks=[EarlyStopping(monitor='val_loss',min_delta=0.001)])
accr = model.evaluate(X_test,Y_test)
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.4f}'.format(accr[0],accr[1]))

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 100)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 100, 50)           50000     
_________________________________________________________________
lstm_1 (LSTM)                (None, 64)                29440     
_________________________________________________________________
dense_1 (Dense)              (None, 128)               8320      
_________________________________________________________________
dropout_1 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 3)                 387       
_________________________________________________________________
activation_1 (Activation)    (None, 3)                 0         
Total para