In [25]:
import numpy as np
from numpy import array
from numpy import array
from numpy import asarray
from numpy import zeros
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Notebook-wide display settings:
#   - let pandas render up to 100 columns before truncating a frame
#   - use seaborn's "white" style for all figures
pd.set_option("display.max_columns", 100)
sns.set_style(style="white")

# keras imports
import tensorflow as tf
import keras
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers.core import Activation, Dropout, Dense
from keras.layers import Flatten
from keras.layers import GlobalMaxPooling1D, SpatialDropout1D, LSTM
from keras.layers.embeddings import Embedding
from keras.preprocessing.text import Tokenizer

# model imports
from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer, TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, roc_auc_score
import pickle
import joblib

from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)

In [2]:
# Load the modelling dataset. keep_default_na=False stops pandas from turning
# empty strings (and tokens such as "NA") into NaN, so text columns stay str.
scheme1_csv = '../data/scheme1.csv'
model_data = pd.read_csv(scheme1_csv, keep_default_na=False)

In [3]:
# Peek at the first three rows to sanity-check the load.
model_data.head(n=3)

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,title,selftext,author,num_comments,is_suicide,url,selftext_clean,title_clean,author_clean,selftext_length,title_length,megatext_clean,Clustered Labels,New Labels
0,0,0,Our most-broken and least-understood rules is ...,We understand that most people who reply immed...,SQLwitch,133,0,https://www.reddit.com/r/depression/comments/d...,understand people reply immediately op invitat...,broken least understood rule helper may invite...,sql witch,4792,144,sql witch understand people reply immediately ...,1,1
1,1,1,Regular Check-In Post,Welcome to /r/depression's check-in post - a p...,circinia,1644,0,https://www.reddit.com/r/depression/comments/e...,welcome r depression check post place take mom...,regular check post,c irc,650,21,c irc welcome r depression check post place ta...,1,1
2,2,2,I hate it so much when you try and express you...,I've been feeling really depressed and lonely ...,TheNewKiller69,8,0,https://www.reddit.com/r/depression/comments/f...,feeling really depressed lonely lately job ful...,hate much try express feeling parent turn arou...,new killer 69,1866,137,new killer 69 feeling really depressed lonely ...,0,0


In [4]:
# Print column names, non-null counts and dtypes for the loaded frame.
model_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1897 entries, 0 to 1896
Data columns (total 16 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Unnamed: 0        1897 non-null   int64 
 1   Unnamed: 0.1      1897 non-null   int64 
 2   title             1897 non-null   object
 3   selftext          1897 non-null   object
 4   author            1897 non-null   object
 5   num_comments      1897 non-null   int64 
 6   is_suicide        1897 non-null   int64 
 7   url               1897 non-null   object
 8   selftext_clean    1897 non-null   object
 9   title_clean       1897 non-null   object
 10  author_clean      1897 non-null   object
 11  selftext_length   1897 non-null   int64 
 12  title_length      1897 non-null   int64 
 13  megatext_clean    1897 non-null   object
 14  Clustered Labels  1897 non-null   int64 
 15  New Labels        1897 non-null   int64 
dtypes: int64(8), object(8)
memory usage: 237.2+ KB


In [5]:
# Mean of the target column; assuming is_suicide is a 0/1 flag this is the
# share of positive rows (class balance check).
model_data.loc[:, "is_suicide"].mean()

0.5166051660516605

In [45]:
# Getting ready for training: pick the text feature and the target column.
X = model_data["selftext_clean"]
y = model_data["is_suicide"]

# 80/20 train/test split, stratified on the target so both partitions keep
# the same class ratio; random_state pins the shuffle for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print('Shape of data tensor:', X_train.shape)

# TF-IDF over 1- to 3-grams, limited to the 70 highest-frequency terms.
# The vectorizer is fitted on the training split only and then re-used to
# transform the test split, so no test vocabulary leaks into the features.
tvec_optimised = TfidfVectorizer(
    max_df=0.5,
    max_features=70,
    min_df=2,
    ngram_range=(1, 3),
    stop_words='english',
)

# .toarray() yields a plain numpy ndarray. The previous .todense() returned
# the legacy np.matrix type, which numpy discourages and which downstream
# libraries do not reliably accept.
# NOTE(review): these are float TF-IDF weights, yet the model cell feeds them
# to an Embedding layer, which looks up integer token ids. Consider
# Tokenizer + pad_sequences for the RNN input instead -- TODO confirm intent.
X_train_tvec = tvec_optimised.fit_transform(X_train).toarray()
X_test_tvec = tvec_optimised.transform(X_test).toarray()

Shape of data tensor: (1517,)


In [46]:
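# NOTE(review): left disabled. pad_sequences is meant for lists of integer
# token ids, not for the dense TF-IDF matrix built in the previous cell, and
# MAX_SEQUENCE_LENGTH is not defined in any visible cell.
# X_train_tvec = pad_sequences(X_train_tvec, maxlen=MAX_SEQUENCE_LENGTH)
# print('Shape of data tensor:', X_train_tvec.shape)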

In [49]:
# Embedding hyper-parameters. No visible cell defines them, so they are pinned
# here to the values implied by the recorded summary of this cell: the
# embedding layer has 5,000,000 parameters with 100-wide outputs, i.e.
# a 50,000-word vocabulary x 100 dimensions.
MAX_NB_WORDS = 50000
EMBEDDING_DIM = 100

# Embedding -> spatial dropout -> LSTM -> single-unit binary classifier.
# NOTE(review): Embedding looks up integer token ids; confirm that the array
# passed to fit() holds token indices rather than float feature weights.
model = Sequential()
model.add(Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=70))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
# A softmax over a single unit normalises one value against itself and so
# always emits 1.0; sigmoid is the activation that pairs with
# binary_crossentropy for a one-unit output.
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
model.summary()

Model: "sequential_11"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_10 (Embedding)     (None, 70, 100)           5000000   
_________________________________________________________________
spatial_dropout1d_9 (Spatial (None, 70, 100)           0         
_________________________________________________________________
lstm_8 (LSTM)                (None, 100)               80400     
_________________________________________________________________
dense_8 (Dense)              (None, 1)                 101       
Total params: 5,080,501
Trainable params: 5,080,501
Non-trainable params: 0
_________________________________________________________________
None


In [48]:
# Training hyper-parameters. These variables used to be set to 5 / 64 but were
# never passed to fit(), which ran with hard-coded 10 epochs and batch size 32.
# They now hold the values that were actually used and are wired into the call
# so there is a single source of truth.
epochs = 10
batch_size = 32

# NOTE(review): the test split doubles as validation data here, so the
# validation metrics are not independent of the final test score.
history = model.fit(
    X_train_tvec,
    y_train,
    batch_size=batch_size,
    epochs=epochs,
    verbose=1,
    validation_data=(X_test_tvec, y_test),
)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 1517 samples, validate on 380 samples
Epoch 1/10
Epoch 2/10

KeyboardInterrupt: 

In [50]:
from keras.models import load_model

# Swap the in-memory network for the one stored in RNN.h5 and show its layers.
# Note that this rebinds `model`, discarding the network built earlier.
saved_model_path = "RNN.h5"
model = load_model(saved_model_path)
model.summary()

Model: "sequential_16"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_11 (Embedding)     (None, 500, 32)           160000    
_________________________________________________________________
lstm_8 (LSTM)                (None, 100)               53200     
_________________________________________________________________
dense_18 (Dense)             (None, 1)                 101       
Total params: 213,301
Trainable params: 213,301
Non-trainable params: 0
_________________________________________________________________


In [51]:
# Score the loaded model on the held-out split. evaluate() returns the loss
# followed by the compiled metrics; index 1 is presumably accuracy (depends on
# how the saved model was compiled -- verify).
# NOTE(review): the recorded run of this cell fails because the loaded model
# expects inputs of shape (500,) while X_test_tvec has 70 columns.
scores = model.evaluate(x=X_test_tvec, y=y_test, verbose=0)
print('Test accuracy:', scores[1])

ValueError: Error when checking input: expected embedding_11_input to have shape (500,) but got array with shape (70,)