In [17]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import neattext.functions as nfx
import plotly.express as plx
from sklearn.metrics import classification_report
import keras
from keras.layers import Embedding,Dense,LSTM,GlobalMaxPooling1D,Input
from keras.callbacks import EarlyStopping,ReduceLROnPlateau
from keras.models import Sequential
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer  # here have only keras have but we change it to tensorflow.keras
from keras.preprocessing.sequence import pad_sequences
from tqdm import tqdm

In [19]:
data=pd.read_csv('Suicide_Detection.csv')
data.head()

Unnamed: 0.1,Unnamed: 0,text,class
0,2,Ex Wife Threatening SuicideRecently I left my ...,suicide
1,3,Am I weird I don't get affected by compliments...,non-suicide
2,4,Finally 2020 is almost over... So I can never ...,non-suicide
3,8,i need helpjust help me im crying so hard,suicide
4,9,"I’m so lostHello, my name is Adam (16) and I’v...",suicide


In [20]:
data['class'].value_counts()

Unnamed: 0_level_0,count
class,Unnamed: 1_level_1
suicide,116037
non-suicide,116037


In [21]:
data['class'].value_counts().index.values

array(['suicide', 'non-suicide'], dtype=object)

In [22]:
train_data,test_data=train_test_split(data,test_size=0.2,random_state=10)

In [23]:
train_data['class'].value_counts().index.values

array(['suicide', 'non-suicide'], dtype=object)

 **Data Visualistaion**

In [24]:
plx.bar(x=train_data['class'].value_counts().index.values,y=train_data['class'].value_counts().values,color=['Sucide','Not-Suicide'])

**Data Cleaning**

In [25]:
def clean_text(text):
    text_length=[]
    cleaned_text=[]
    for sent in tqdm(text):
        sent=sent.lower()
        sent=nfx.remove_special_characters(sent)
        sent=nfx.remove_stopwords(sent)
        text_length.append(len(sent.split()))
        cleaned_text.append(sent)
    return cleaned_text,text_length

In [26]:
cleaned_train_text,train_text_length=clean_text(train_data.text)
cleaned_test_text,test_text_length=clean_text(test_data.text)

100%|██████████| 185659/185659 [00:14<00:00, 12641.29it/s]
100%|██████████| 46415/46415 [00:04<00:00, 9991.77it/s] 


In [27]:
tokenizer=Tokenizer()
tokenizer.fit_on_texts(cleaned_train_text)

In [28]:
train_text_seq=tokenizer.texts_to_sequences(cleaned_train_text)
train_text_pad=pad_sequences(train_text_seq,maxlen=50)

test_text_seq=tokenizer.texts_to_sequences(cleaned_test_text)
test_text_pad=pad_sequences(test_text_seq,maxlen=50)

**Glove Embeddings**

In [29]:
lbl_target=LabelEncoder()
train_output=lbl_target.fit_transform(train_data['class'])
test_output=lbl_target.transform(test_data['class'])

In [30]:
# Mount Google Drive
import pickle
from google.colab import drive
drive.mount('/content/drive')

# Load the pickle file from Google Drive
with open('/content/drive/MyDrive/ColabNotebooks/glove.840B.300d.pkl', 'rb') as fp:
    glove_embedding = pickle.load(fp)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [32]:
v=len(tokenizer.word_index)

embedding_matrix=np.zeros((v+1,300), dtype=float)
for word,idx in tokenizer.word_index.items():
    embedding_vector=glove_embedding.get(word)
    if embedding_vector is not None:
        embedding_matrix[idx]=embedding_vector

In [33]:
embedding_matrix

array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.074482  ,  0.58293003, -0.78233999, ..., -0.24984001,
        -0.096953  ,  0.66692001],
       [-0.35394999,  0.23051   , -0.62689   , ..., -0.20720001,
         0.52003002,  0.51129001],
       ...,
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.29547   , -0.21822999, -0.039817  , ...,  0.62642998,
         0.48798001, -0.47554001],
       [ 0.75085002, -0.35099   ,  0.37674999, ..., -0.066863  ,
         0.79632998, -0.05967   ]])

In [34]:
early_stop=EarlyStopping(patience=5)
reducelr=ReduceLROnPlateau(patience=3)

**Keras Sequential Model Construction**

In [35]:
model=Sequential()
model.add(Input(shape=(40,)))
model.add(Embedding(v+1,300,weights=[embedding_matrix],trainable=False))
model.add(LSTM(20,return_sequences=True))
model.add(GlobalMaxPooling1D())
model.add(Dense(256,activation='relu'))
model.add(Dense(1,activation='sigmoid'))
model.compile(optimizer=keras.optimizers.SGD(0.1,momentum=0.09),loss='binary_crossentropy',metrics=['accuracy'])

In [36]:
model.summary()

**Model Training and Evaluation**

In [37]:
r=model.fit(train_text_pad,train_output,validation_data=(test_text_pad,test_output),
            epochs=20,batch_size=256,callbacks=[early_stop,reducelr])

Epoch 1/20
[1m726/726[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m72s[0m 95ms/step - accuracy: 0.7996 - loss: 0.4438 - val_accuracy: 0.9007 - val_loss: 0.2496 - learning_rate: 0.1000
Epoch 2/20
[1m726/726[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 96ms/step - accuracy: 0.8982 - loss: 0.2525 - val_accuracy: 0.9097 - val_loss: 0.2301 - learning_rate: 0.1000
Epoch 3/20
[1m726/726[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m79s[0m 91ms/step - accuracy: 0.9078 - loss: 0.2320 - val_accuracy: 0.9114 - val_loss: 0.2247 - learning_rate: 0.1000
Epoch 4/20
[1m726/726[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m66s[0m 91ms/step - accuracy: 0.9132 - loss: 0.2203 - val_accuracy: 0.9171 - val_loss: 0.2124 - learning_rate: 0.1000
Epoch 5/20
[1m726/726[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m86s[0m 97ms/step - accuracy: 0.9195 - loss: 0.2071 - val_accuracy: 0.8992 - val_loss: 0.2551 - learning_rate: 0.1000
Epoch 6/20
[1m726/726[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37

In [39]:
import numpy as np
from sklearn.metrics import classification_report

print('TESTING DATA CLASSIFICATION REPORT \n \n')
# Predict probabilities and then get the class with highest probability
y_pred_test = np.argmax(model.predict(test_text_pad), axis=-1)
print(classification_report(test_output, y_pred_test,
                            target_names=lbl_target.inverse_transform([0,1])))

print('TRAINING DATA CLASSIFICATION REPORT \n \n')
# Predict probabilities and then get the class with highest probability
y_pred_train = np.argmax(model.predict(train_text_pad), axis=-1)
print(classification_report(train_output, y_pred_train,
                           target_names=lbl_target.inverse_transform([0,1])))

TESTING DATA CLASSIFICATION REPORT 
 

[1m1451/1451[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 11ms/step
              precision    recall  f1-score   support

 non-suicide       0.50      1.00      0.67     23209
     suicide       0.00      0.00      0.00     23206

    accuracy                           0.50     46415
   macro avg       0.25      0.50      0.33     46415
weighted avg       0.25      0.50      0.33     46415

TRAINING DATA CLASSIFICATION REPORT 
 

[1m   1/5802[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m3:02[0m 31ms/step


Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.



[1m5802/5802[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m60s[0m 10ms/step



Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.



              precision    recall  f1-score   support

 non-suicide       0.50      1.00      0.67     92828
     suicide       0.00      0.00      0.00     92831

    accuracy                           0.50    185659
   macro avg       0.25      0.50      0.33    185659
weighted avg       0.25      0.50      0.33    185659




Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.



In [55]:
twt = ['i will die']
twt = tokenizer.texts_to_sequences(twt)
twt = pad_sequences(twt, maxlen=50)

prediction = model.predict(twt)[0][0]
print(prediction)

if(prediction > 0.5):
    print("Potential Suicide Post")
else:
    print("Non Suicide Post")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 87ms/step
0.7474995
Potential Suicide Post


In [41]:
pickle.dump(tokenizer, open('tokenizer.pkl', 'wb'))

In [42]:
model.save("model.h5")



In [43]:
token_form = pickle.load(open('tokenizer.pkl', 'rb'))

In [44]:
from keras.models import load_model

In [45]:
model_form = load_model("model.h5")



In [46]:
twt = ['Through these past years thoughts of suicide, fear, anxiety I’m so close to my limit']
twt = token_form.texts_to_sequences(twt)
twt = pad_sequences(twt, maxlen=50)


prediction = model_form.predict(twt)[0][0]
print(prediction)

if(prediction > 0.5):
    print("Potential Suicide Post")
elif (prediction == 1):
    print("Non Suicide Post")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 821ms/step
0.9530275
Potential Suicide Post
