# Neural Network

### Data Directory

In [1]:
# Locations of the pickled train/test frames and of the trained Doc2Vec model,
# all relative to the notebook's working directory.
train_direct = "../training_data/train.pkl"
test_direct = "../testing_data/test.pkl"
d2v_direct = "../Doc2Vec/d2v.model2"

In [2]:
import pandas as pd
from collections import Counter as ctr
from bs4 import BeautifulSoup
import collections as c
from nltk.tokenize import word_tokenize
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
# Globally silences pandas' SettingWithCopyWarning for the whole notebook.
# NOTE(review): this hides the warning rather than its cause; later cells assign
# columns on frames derived via dropna()/column selection.
pd.options.mode.chained_assignment = None 

### Loading data

In [3]:
# NOTE(review): unpickling can execute code embedded in the file; only load
# pickles that come from a trusted source.

# Training posts: each `msg` exposes get_text(); store its plain text in `text`.
train = pd.read_pickle(train_direct)
train['text'] = train['msg'].map(lambda post: post.get_text())

# Test posts: rows without a msgID are discarded before the text is extracted.
test = pd.read_pickle(test_direct).dropna(subset=['msgID'])
test['text'] = test['msg'].map(lambda post: post.get_text())

In [4]:
# Keep only the posts that carry a label. The explicit .copy() makes each
# labelled frame independent of its parent, so the columns assigned to it in
# later cells ('vecs', 'sent_score', the rank clean-up) are written to a real
# frame instead of relying on the globally silenced SettingWithCopyWarning.
train_labeled = train.dropna(subset=['label']).copy()
test_labeled = test.dropna(subset=['label']).copy()

### Loading the doc2vec model

In [5]:
from gensim.models.doc2vec import Doc2Vec

# Restore the previously trained Doc2Vec model from disk.
vec_model = Doc2Vec.load(d2v_direct)


def infer_post_vector(text):
    """Lower-case and tokenize `text`, then infer its vector with `vec_model`."""
    return vec_model.infer_vector(word_tokenize(text.lower()))


# One inferred vector per labelled post: training frame first, then test frame.
train_labeled['vecs'] = train_labeled['text'].map(infer_post_vector)
test_labeled['vecs'] = test_labeled['text'].map(infer_post_vector)

# Show the first training vector as an example of the stored values.
train_labeled['vecs'].iloc[0]

paramiko missing, opening SSH/SCP/SFTP paths will be disabled.  `pip install paramiko` to suppress


array([ 0.03047852, -0.05406708,  0.00684914, -0.09190172,  0.10686632,
        0.0109821 , -0.02377012, -0.08077868,  0.0688705 ,  0.104604  ,
        0.10747099,  0.00182713, -0.03931361, -0.05217262, -0.06938926,
        0.00916352,  0.00571766, -0.03961769,  0.01551475, -0.02399089],
      dtype=float32)

In [6]:
# Vectors stored on the loaded model (its `docvecs` attribute).
# NOTE(review): `vectors` is not referenced again in the visible part of this
# notebook — confirm whether this cell is still needed.
vectors = vec_model.docvecs    # Each post is turned into a vector

#### Fix discrepancy issue between training data ranks and testing data ranks

In [7]:
# Collapse training-only rank names onto the names assumed for the test data:
#   Mod Squad        --> Mod
#   Post Mod         --> Mod
#   Frequent Visitor --> Visitor
rank_aliases = {
    'Mod Squad': 'Mod',
    'Post Mod': 'Mod',
    'Frequent Visitor': 'Visitor',
}
for old_rank, new_rank in rank_aliases.items():
    train_labeled.loc[train_labeled['rank'] == old_rank, 'rank'] = new_rank

In [8]:
# Frames fed to the neural networks: everything except the columns below.
unused_columns = ['msgID', 'msg', 'authorID', 'fine_grained', 'affiliation', 'text']

# `columns=` replaces the positional axis argument (`drop(labels, 1)`), which
# pandas 2.x no longer accepts.
df_NN_train = train_labeled.drop(columns=unused_columns)
df_NN_test = test_labeled.drop(columns=unused_columns)

In [9]:
# Majority-class baseline: share of the most frequent label among the test labels.
test_label_counts = c.Counter(df_NN_test['label'])
baseline = max(test_label_counts.values()) / len(df_NN_test['label'])

In [10]:
# Number of labelled training rows.
total = df_NN_train.shape[0]
total

1188

In [11]:
# Distinct label values present in the training frame.
labels = {label for label in df_NN_train['label']}
labels

{'amber', 'crisis', 'green', 'red'}

# NN with embedding (FS1)

## Preprocessing

### Add embeddings as inputs for NN

##### Training data

In [12]:
from sklearn import preprocessing
import numpy as np

# Targets: label strings -> integer codes -> one-hot rows.
le = preprocessing.LabelEncoder()
ohe = preprocessing.OneHotEncoder()
y = le.fit_transform(df_NN_train.label).reshape(-1, 1)
y_train = ohe.fit_transform(y)

# Inputs: only the inferred Doc2Vec vector of each post (frame index dropped).
X_train = np.array(list(df_NN_train['vecs']))

X_train.shape, y_train.shape

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


((1188, 20), (1188, 4))

#####  Testing data

In [13]:
# Encode the test labels with the encoders already fitted on the TRAINING labels.
# Re-fitting them on the test set could assign different integer codes / one-hot
# columns whenever the two label sets differ, silently misaligning y_test with
# the outputs the network was trained on. A label unseen in training now raises
# instead of being encoded inconsistently.
y_test = le.transform(df_NN_test.label).reshape(-1, 1)
y_test = ohe.transform(y_test)

# Inputs: only the inferred Doc2Vec vector of each test post (frame index dropped).
X_test = np.array([x for x in df_NN_test.vecs])
X_test.shape, y_test.shape

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


((400, 20), (400, 4))

## Define NN model with embedding

In [14]:
from keras.models import Model
from keras.layers import Input, Dense
from keras.models import Sequential
from keras import regularizers
from keras import initializers

Using TensorFlow backend.


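Build and train the network. The hyperparameters below (number of hidden units, L2 regularisation strength, number of epochs) can be changed.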

In [15]:
# FS1 classifier: one sigmoid hidden layer followed by a softmax output layer.
# NOTE(review): every hidden-layer weight is initialised to the constant 1, so
# all 256 hidden units start out identical — confirm this is intended rather
# than a randomly initialised kernel.
hidden_layer = Dense(
    units=256,
    activation='sigmoid',
    input_dim=X_train.shape[1],
    kernel_initializer=initializers.Constant(1),
    bias_initializer='zeros',
    kernel_regularizer=regularizers.l2(0.01),
)
output_layer = Dense(units=len(labels), activation='softmax')  # one unit per label

model = Sequential([hidden_layer, output_layer])

# Categorical cross-entropy loss (closely related to KL divergence), Adam optimiser.
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])
model.fit(X_train, y_train, epochs=50, verbose=False)

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.


<keras.callbacks.History at 0x7ff0834586a0>

### Accuracy

In [16]:
# Evaluate on the held-out test set; `score` holds [loss, accuracy].
score = model.evaluate(X_test, y_test, verbose=0)
test_loss, test_accuracy = score
print('Test score:', test_loss)
print('Test accuracy:', test_accuracy)

Test score: 1.2790551805496215
Test accuracy: 0.54


In [17]:
# Majority-class baseline, shown for comparison with the test accuracy above.
baseline

0.54

# NN with embedding and sentiments (FS2)

## Preprocessing

### Add sentiment scores to NN inputs

In [18]:
# Sentiment features: VADER polarity scores appended to the Doc2Vec vectors.
# The feature order is fixed explicitly instead of relying on the ordering of
# the dict returned by polarity_scores.
sent_keys = ('neg', 'neu', 'pos', 'compound')
analyser = SentimentIntensityAnalyzer()

# Training data
train_labeled['sent_score'] = train_labeled['text'].map(analyser.polarity_scores)
df_NN_train['sent_score'] = train_labeled['sent_score'].map(
    lambda scores: [scores[key] for key in sent_keys])
# X_train is rebuilt from the stored columns (rather than appended to itself) so
# that re-running this cell cannot keep widening the matrix.
X_train = np.concatenate(
    (np.array(df_NN_train['vecs'].tolist()),
     np.array(df_NN_train['sent_score'].tolist())),
    axis=1)

# Same for the testing data
test_labeled['sent_score'] = test_labeled['text'].map(analyser.polarity_scores)
df_NN_test['sent_score'] = test_labeled['sent_score'].map(
    lambda scores: [scores[key] for key in sent_keys])
X_test = np.concatenate(
    (np.array(df_NN_test['vecs'].tolist()),
     np.array(df_NN_test['sent_score'].tolist())),
    axis=1)
X_train.shape, X_test.shape

((1188, 24), (400, 24))

In [19]:
# First training row: the raw polarity dict next to the list stored as NN input.
train_labeled['sent_score'].iloc[0], df_NN_train['sent_score'].iloc[0]

({'neg': 0.168, 'neu': 0.714, 'pos': 0.118, 'compound': -0.6516},
 [0.168, 0.714, 0.118, -0.6516])

## Define NN model with embedding and sentiments

In [20]:
# FS2 classifier: one sigmoid hidden layer followed by a softmax output layer.
# NOTE(review): every hidden-layer weight is initialised to the constant 1, so
# all 256 hidden units start out identical — confirm this is intended rather
# than a randomly initialised kernel.
hidden_layer = Dense(
    units=256,
    activation='sigmoid',
    input_dim=X_train.shape[1],
    kernel_initializer=initializers.Constant(1),
    bias_initializer='zeros',
    kernel_regularizer=regularizers.l2(0.01),
)
output_layer = Dense(units=len(labels), activation='softmax')  # one unit per label

model = Sequential([hidden_layer, output_layer])

# Categorical cross-entropy loss (closely related to KL divergence), Adam optimiser.
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])
model.fit(X_train, y_train, epochs=50, verbose=False)

<keras.callbacks.History at 0x7ff07eaa9f60>

### Accuracy

In [21]:
# Evaluate on the held-out test set; `score` holds [loss, accuracy].
score = model.evaluate(X_test, y_test, verbose=0)
test_loss, test_accuracy = score
print('Test score:', test_loss)
print('Test accuracy:', test_accuracy)

Test score: 1.1948605251312256
Test accuracy: 0.54


In [22]:
# Majority-class baseline, shown for comparison with the test accuracy above.
baseline

0.54

# NN with embedding, sentiments, ranks (FS3)

## Preprocessing

### Add ranks to NN inputs

In [23]:
# Rank features: one-hot encoded author rank appended to the embedding and
# sentiment features.
#
# A single encoder is fitted on the TRAINING ranks and reused for the test set.
# Fitting separate encoders per split can give the same column a different
# meaning in train and test whenever the two sets of ranks differ. A dedicated
# encoder is used so the label encoders `le` / `ohe` stay fitted on the labels.
# Ranks that only occur in the test data are encoded as all zeros.
rank_encoder = preprocessing.OneHotEncoder(handle_unknown='ignore')
rank_train = rank_encoder.fit_transform(df_NN_train[['rank']]).toarray()
rank_test = rank_encoder.transform(df_NN_test[['rank']]).toarray()

# Both matrices are rebuilt from the stored feature columns (rather than appended
# to themselves) so that re-running this cell cannot keep widening them.
X_train = np.concatenate(
    (np.array(df_NN_train['vecs'].tolist()),
     np.array(df_NN_train['sent_score'].tolist()),
     rank_train),
    axis=1)
X_test = np.concatenate(
    (np.array(df_NN_test['vecs'].tolist()),
     np.array(df_NN_test['sent_score'].tolist()),
     rank_test),
    axis=1)
X_train.shape, X_test.shape

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.
In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


((1188, 35), (400, 35))

## Define NN model with embedding, sentiments, ranks

In [24]:
# define a model using Sequential

# FS3 classifier: one sigmoid hidden layer followed by a softmax output layer.
# NOTE(review): every hidden-layer weight is initialised to the constant 1, so
# all 256 hidden units start out identical — confirm this is intended rather
# than a randomly initialised kernel.
hidden_layer = Dense(
    units=256,
    activation='sigmoid',
    input_dim=X_train.shape[1],
    kernel_initializer=initializers.Constant(1),
    bias_initializer='zeros',
    kernel_regularizer=regularizers.l2(0.01),
)
output_layer = Dense(units=len(labels), activation='softmax')  # one unit per label

model = Sequential([hidden_layer, output_layer])

# Categorical cross-entropy loss (closely related to KL divergence), Adam optimiser.
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])
model.fit(X_train, y_train, epochs=50, verbose=False)

<keras.callbacks.History at 0x7ff07dde7518>

### Accuracy

In [25]:
# Evaluate on the held-out test set; `score` holds [loss, accuracy].
score = model.evaluate(X_test, y_test, verbose=0)
test_loss, test_accuracy = score
print('Test score:', test_loss)
print('Test accuracy:', test_accuracy)

Test score: 1.2541744899749756
Test accuracy: 0.555


In [26]:
# Majority-class baseline, shown for comparison with the test accuracy above.
baseline

0.54