In [1]:
import os
import string
from nltk import word_tokenize
from gensim.models import KeyedVectors, Word2Vec
import numpy as np

from keras.utils import to_categorical
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Flatten, Activation, Input, MaxPool2D
from keras.layers import Conv2D, GlobalAveragePooling1D, MaxPooling2D
from keras.layers import Concatenate
from keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.model_selection import StratifiedKFold

## our imports
import pandas as pd
import re
from itertools import chain, repeat, islice
import requests

## utility functions
def pad_infinite(iterable, padding=None):
   return chain(iterable, repeat(padding))

def pad(iterable, size, padding=None):
   return islice(pad_infinite(iterable, padding), size)



In [2]:
## Using the same variable declarations and configuration options as the original study
ip_txt_file = './data/500_Reddit_users_posts_labels.csv'  # CSV file: "User", "Post", "Label"
ip_feat_file = './data/External_Features.csv'             # CSV file: "User", "Features"
limit_rows = 125   ## used to build a partial dataset

w2v_file = {'file': './out/numberbatch-en-19.08.txt.gz', 
            'is_binary': False, 
            'limit': None, 
            'cache': './out/vectors.kv', 
            'source':'https://conceptnet.s3.amazonaws.com/downloads/2019/numberbatch/numberbatch-en-19.08.txt.gz' }

op_file = './out/Result_5-Label_Classification.tsv'
severity_classes = {'Supportive': 0, 'Indicator': 1, 'Ideation': 2, 'Behavior': 3, 'Attempt': 4}

sys_params = {'emb_dim': 300,
              'max_sent_len': 1500,
              'str_padd': '@PADD',
              'cross_val': 5}

cnn_params = {'no_filters': 100,
              'kernels': [3, 4, 5],
              'channel': 1,
              'c_stride': (1, sys_params['emb_dim']),
              'pad': 'same',
              'ip_shape': (sys_params['max_sent_len'], sys_params['emb_dim'], 1),
              'c_activ': 'relu',
              'drop_rate': 0.3,
              'dense_1_unit': 128,
              'dense_2_unit': 128,
              'dense_activ': 'relu',
              'op_unit': 5,             # 5-Label classification
              'op_activ': 'softmax',
              'l_rate': 0.001,
              'loss': 'categorical_crossentropy',
              'batch': 4,
              'epoch': 50,
              'verbose': 1}

intermediate_layer = 'flat_drop'    # for extracting features from CNN

In [3]:
def get_keras_cnn_model():

    l_ip = Input(shape=(sys_params['max_sent_len'], sys_params['emb_dim'], 1), dtype='float32')
    lst_convfeat = []
    for filter in cnn_params['kernels']:
        l_conv = Conv2D(filters=cnn_params['no_filters'], kernel_size=(filter, sys_params['emb_dim']), strides=cnn_params['c_stride'],
                        padding=cnn_params['pad'], data_format='channels_last', input_shape=cnn_params['ip_shape'],
                        activation=cnn_params['c_activ'])(l_ip)
        l_pool = MaxPool2D(pool_size=(sys_params['max_sent_len'], 1))(l_conv)
        lst_convfeat.append(l_pool)
        
    l_concat = Concatenate(axis=1)(lst_convfeat)
    l_flat = Flatten()(l_concat)
    l_drop = Dropout(rate=cnn_params['drop_rate'], name='flat_drop')(l_flat)
    l_op = Dense(units=cnn_params['op_unit'], activation=cnn_params['op_activ'], name='cnn_op')(l_drop)
    
    final_model = Model(l_ip, l_op)
    final_model.compile(optimizer=Adam(learning_rate=cnn_params['l_rate']), loss=cnn_params['loss'], metrics=['accuracy'])   
    
    return final_model

def get_mlp_model(ip_dim):

    mlp_model = Sequential()

    mlp_model.add(Dense(units=cnn_params['op_unit'], activation=cnn_params['op_activ'], name='classif_op',
                            input_dim=ip_dim))
    mlp_model.compile(optimizer=Adam(learning_rate=cnn_params['l_rate']), loss=cnn_params['loss'],
                          metrics=['accuracy'])
    return mlp_model

def get_prf1_score(y_true, y_pred):
    tp, fp, fn = 0.0, 0.0, 0.0
    for i in range(len(y_pred)):
        if y_pred[i] == y_true[i]:
            tp += 1
        elif y_pred[i] > y_true[i]:
            fp += 1
        else:
            fn += 1
    if tp == 0:
        tp = 1.0
    if fp == 0:
        fp = 1.0
    if fn == 0:
        fn  = 1.0
    P = tp / (tp + fp)
    R = tp / (tp + fn)
    F = 2 * P * R / (P + R)
    return P, R, F


## Primary Dataset
The study's primary dataset is 500 rows of de-identified social media posts discussing various aspects of self-destructive behavior. Each row has a user id, the text of the post, and a label describing how the post was manually classified.      

In [4]:
df = pd.read_csv ( ip_txt_file ).loc[:limit_rows]
df.head(2)

Unnamed: 0,User,Post,Label
0,user-0,"['Its not a viable option, and youll be leavin...",Supportive
1,user-1,['It can be hard to appreciate the notion that...,Ideation


## Cleanup
The original study provided code to do the word tokenization, but it was based on an older version of python and honestly wasn't very efficient. We've cleaned this up a bit by converting to pandas and using more pythonic transformations. 

In [5]:
df

Unnamed: 0,User,Post,Label
0,user-0,"['Its not a viable option, and youll be leavin...",Supportive
1,user-1,['It can be hard to appreciate the notion that...,Ideation
2,user-2,"['Hi, so last night i was sitting on the ledge...",Behavior
3,user-3,['I tried to kill my self once and failed badl...,Attempt
4,user-4,['Hi NEM3030. What sorts of things do you enjo...,Ideation
...,...,...,...
121,user-121,"['No more ideas?', 'I dont agree with live for...",Ideation
122,user-122,['It started about two years ago. I dont feel ...,Ideation
123,user-123,['Theres a test for depression? I just went to...,Behavior
124,user-124,"['Same, pm me and we can talk.', 'Hi. Im in th...",Attempt


In [6]:
df['post_clean'] = df.Post.str.lower() # convert to lowercase
df['post_clean'] = df.post_clean.str.replace("/[^ -~]+/g","", regex=True) ## remove non-printable
df["post_clean"] = df.post_clean.str.replace('[^\w\s]','', regex=True) ## remove punctuation
df["post_clean"] = df.post_clean.apply(word_tokenize) ## tokenize
df["post_clean"] = df.post_clean[:sys_params['max_sent_len']] # limit length
df["post_clean"] = df.post_clean.apply ( lambda x : list(pad(x, sys_params['max_sent_len'], sys_params['str_padd']))) ## pad list

df['LabelCode'] = df.Label.map ( severity_classes ) ## map labels to codes

labels = np.array (df.LabelCode.values)
print ( 'labels:', labels[:5] )

posts = np.array(df.post_clean.values.tolist())
print ( 'posts: ', posts[:5] )

labels: [0 2 3 4 2]
posts:  [['its' 'not' 'a' ... '@PADD' '@PADD' '@PADD']
 ['it' 'can' 'be' ... 'my' 'studies' 'and']
 ['hi' 'so' 'last' ... '@PADD' '@PADD' '@PADD']
 ['i' 'tried' 'to' ... '@PADD' '@PADD' '@PADD']
 ['hi' 'nem3030' 'what' ... '@PADD' '@PADD' '@PADD']]


## Vectorize
Features are built by mapping word tokens to vectors of similar words. These similarities are taken from pre-built
similarity vectors. For this study, the authors used a popular open-source project called ConceptNet. From their documentation:

> ConceptNet is a freely-available semantic network, designed to help computers 
> understand the meanings of words that people use.

We've improved on the study's code by allowing these vectors to be automatically downloaded and by caching word 
vectors to significantly reduce loading time.

In [7]:
if (w2v_file['cache'] != "") & (os.path.isfile ( w2v_file['cache']) ):
    print ( "Using cached vectors." )
    w2v_model = KeyedVectors.load(w2v_file['cache'])
else:
    if not os.path.isfile ( w2v_file['file'] ):
        print ( f"Could not find {w2v_file['file']}.. attempting download from {w2v_file['source']}." )
        r = requests.get(w2v_file['source'], allow_redirects=True)
        open ( w2v_file['file'], 'wb').write ( r.content )
    
    print ( "Loading vectors... this will take a few minutes.." )
    w2v_model = KeyedVectors.load_word2vec_format(w2v_file['file'], binary=w2v_file['is_binary'], limit=w2v_file['limit'])
    if w2v_file['cache'] != "":
        w2v_model.save( w2v_file['cache'] )

w2v_model


Using cached vectors.


<gensim.models.keyedvectors.KeyedVectors at 0x2b3263f9050>

In [8]:
vocab = w2v_model.key_to_index
padding = np.zeros(sys_params['emb_dim'], dtype='float32')

vectors = []
for sentence in posts:
    vector = []
    for tok in sentence:
        if tok==sys_params['str_padd']:
            vector.append(list(padding))
        
        elif tok in vocab:
            vector.append(w2v_model[tok].astype(float).tolist())
    
        else:
            vector.append(list(padding))
    
    vectors.append(vector)   
    
x_data, y_data = np.array(vectors), np.array ( labels )
print ( x_data.shape, y_data.shape )


(126, 1500, 300) (126,)


## Save the Dataset

In [9]:
x_data_all = x_data.reshape(x_data.shape[0], x_data.shape[1], x_data.shape[2], 1)
y_data_all = labels

np.savez_compressed ( f'./data/smalldataset.npz', x=x_data_all, y=y_data_all )

## Load the Dataset

In [10]:
dataset = np.load ( './data/smalldataset.npz' )
x_data_all = dataset['x']
y_data_all = dataset['y']
print ( x_data_all.shape, y_data_all.shape )



(126, 1500, 300, 1) (126,)


## Run the Model (simple)

In [11]:
y_data_categorized = to_categorical(y_data_all, num_classes=5)

X_train, X_test, y_train, y_test = train_test_split( x_data_all , y_data_categorized, test_size=0.33, random_state=42)
print ( X_train.shape, y_train.shape )
print ( X_test.shape, y_test.shape )

cnn_model = get_keras_cnn_model()

epochs = 2 #cnn_params['epoch']
cnn_model.fit(x=X_train, y=y_train, batch_size=cnn_params['batch'], epochs=epochs, verbose=cnn_params['verbose'])

model_feat_extractor = Model(inputs=cnn_model.input, outputs=cnn_model.get_layer(intermediate_layer).output)
train_cnn_feat = np.array(model_feat_extractor.predict(X_train))
test_cnn_feat = np.array(model_feat_extractor.predict(X_test))

mlp_model = get_mlp_model(ip_dim = len(train_cnn_feat[0]))
mlp_model.fit(x=train_cnn_feat, y=y_train, batch_size=cnn_params['batch'], epochs=epochs, verbose=cnn_params['verbose'])

y_pred = mlp_model.predict(test_cnn_feat)
y_pred_am = np.argmax(y_pred, axis=-1)
y_test_am = np.argmax(y_test, axis=-1)

precision, recall, f1 = get_prf1_score(y_test_am, y_pred_am)
print ('\nPrecision: {0}\t Recall: {1}\t F1-Score: {2}'\
    .format(round(precision,3), round(recall,3), round(f1,3)))

(84, 1500, 300, 1) (84, 5)
(42, 1500, 300, 1) (42, 5)
Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2

Precision: 0.519	 Recall: 0.483	 F1-Score: 0.5


## Run the Model (KFold Cross Validation)

In [12]:
epochs=5
scores = []

skf = StratifiedKFold(n_splits=sys_params['cross_val'])
skf.get_n_splits(x_data_all, y_data_all)

cnn_model = get_keras_cnn_model()

for cv_count, (train_index, test_index) in enumerate(skf.split(x_data_all, y_data_all), start=1):
    print ('\nRunning Stratified Cross Validation: {0}/{1}...'.format(cv_count, sys_params['cross_val']))

    X_train, X_test = x_data_all[train_index], x_data_all[test_index]
    y_train, y_test = y_data_all[train_index], y_data_all[test_index]
    
    y_train, y_test = to_categorical(y_train), to_categorical(y_test)

    cnn_model.fit(x=X_train, y=y_train, batch_size=cnn_params['batch'], epochs=epochs, verbose=cnn_params['verbose'])
    model_feat_extractor = Model(inputs=cnn_model.input, outputs=cnn_model.get_layer(intermediate_layer).output)
    train_cnn_feat = model_feat_extractor.predict(X_train)
    test_cnn_feat = model_feat_extractor.predict(X_test)

    mlp_model = get_mlp_model(ip_dim = len(train_cnn_feat[0]))
    mlp_model.fit(x=train_cnn_feat, y=y_train, batch_size=cnn_params['batch'], epochs=epochs, verbose=cnn_params['verbose'])
    
    y_pred = mlp_model.predict(test_cnn_feat)
    y_pred_am = np.argmax(y_pred, axis=-1)
    y_test_am = np.argmax(y_test, axis=-1)

    precision, recall, f1 = get_prf1_score(y_test_am, y_pred_am)
    print ('\nPrecision: {0}\t Recall: {1}\t F1-Score: {2}'\
        .format(round(precision,3), round(recall,3), round(f1,3)))

    scores.append({'I':cv_count, 'Precision':precision, 'Recall':recall, 'F1':f1})

sumdf = pd.DataFrame ( scores ).set_index('I')
sumdf.loc['Average'] = sumdf.mean()
display ( sumdf )

# print ('\nAverage Precision: {0}\t Recall: {1}\t F1-Score: {2}'\
#        .format(round(sumdf.P.mean(),3), round(sumdf.R.mean(),3), round(sumdf.F.mean(),3)))



Running Stratified Cross Validation: 1/5...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5

Precision: 0.556	 Recall: 0.556	 F1-Score: 0.556

Running Stratified Cross Validation: 2/5...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5

Precision: 0.789	 Recall: 0.714	 F1-Score: 0.75

Running Stratified Cross Validation: 3/5...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5

Precision: 0.773	 Recall: 0.85	 F1-Score: 0.81

Running Stratified Cross Validation: 4/5...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5

Precision: 0.96	 Recall: 0.96	 F1-Score: 0.96

Running Stratified Cross Validation: 5/5...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5

Precision: 0.92	 Recall: 0.958	 F1-Score: 0.939


Unnamed: 0_level_0,Precision,Recall,F1
I,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,0.555556,0.555556,0.555556
2,0.789474,0.714286,0.75
3,0.772727,0.85,0.809524
4,0.96,0.96,0.96
5,0.92,0.958333,0.938776
Average,0.799551,0.807635,0.802771


In [13]:

|

SyntaxError: invalid syntax (1091495881.py, line 1)