In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
tqdm.pandas(desc="progress-bar")
from gensim.models import Doc2Vec
from sklearn import utils
from sklearn.model_selection import train_test_split
import gensim
from sklearn.linear_model import LogisticRegression
from gensim.models.doc2vec import TaggedDocument
import re
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
df_mh = pd.read_csv('../../mentalhealth.csv', encoding='cp1252')
df_mh['Text'] = df_mh['Text'].astype(str)
df_mh.tail()

Unnamed: 0,Text,MH
316,depress,1
317,depress,1
318,sad,1
319,sad,1
320,sad,1


In [4]:
from bs4 import BeautifulSoup
def cleanText(text):
    text = BeautifulSoup(text, "lxml").text
    text = re.sub(r'\|\|\|', r' ', text) 
    text = re.sub(r'http\S+', r'<URL>', text)
    text = text.lower()
    text = text.replace('x', '')
    return text

from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import PorterStemmer
porter = PorterStemmer()
def stemSentence(sentence):
    token_words=word_tokenize(sentence)
    token_words
    stem_sentence=[]
    for word in token_words:
        stem_sentence.append(porter.stem(word))
        stem_sentence.append(" ")
    return "".join(stem_sentence)

df_mh['Text'] = df_mh['Text'].apply(cleanText).apply(stemSentence)
df_mh=df_mh[['Text','MH']]

In [5]:
df_mh

Unnamed: 0,Text,MH
0,my client need marit counsel,1
1,counsel,1
2,"career counsel , youth",1
3,counsel,1
4,counsel in redhil,1
...,...,...
316,depress,1
317,depress,1
318,sad,1
319,sad,1


In [6]:
train, test = train_test_split(df_mh, test_size=0.3, random_state=42)
import nltk
from nltk.corpus import stopwords
def tokenize_text(text):
    tokens = []
    for sent in nltk.sent_tokenize(text):
        for word in nltk.word_tokenize(sent):
            if len(word) < 2:
                continue
            tokens.append(word.lower())
    return tokens
train_tagged = train.apply(
    lambda r: TaggedDocument(words=tokenize_text(r['Text']), tags=[r.MH]), axis=1)
test_tagged = test.apply(
    lambda r: TaggedDocument(words=tokenize_text(r['Text']), tags=[r.MH]), axis=1)

In [7]:
#check
train_tagged.iloc[3]

TaggedDocument(words=['my', 'wife', 'bulli', 'me'], tags=[1])

In [8]:
train

Unnamed: 0,Text,MH
206,i am an elderli who just left the hospit and h...,0
81,elderli mental ill financi assist,1
147,buy a hous,0
39,my wife bulli me,1
222,im look for work after covid,0
...,...,...
188,i am a home buyer who need help with mortgag f...,0
71,i am a wife of a person with depress and would...,1
106,help me pleas,1
270,co-op,0


In [9]:
import multiprocessing
cores = multiprocessing.cpu_count()
model_dbow = Doc2Vec(dm=0, vector_size=300, negative=0, hs=0, min_count=2, sample = 0, workers=cores)
model_dbow.build_vocab([x for x in tqdm(train_tagged.values)])

100%|██████████| 224/224 [00:00<00:00, 247830.15it/s]


In [10]:
%%time
for epoch in range(30):
    model_dbow.train(utils.shuffle([x for x in tqdm(train_tagged.values)]), total_examples=len(train_tagged.values), epochs=1)
    model_dbow.alpha -= 0.002
    model_dbow.min_alpha = model_dbow.alpha

100%|██████████| 224/224 [00:00<00:00, 224712.77it/s]
100%|██████████| 224/224 [00:00<?, ?it/s]
100%|██████████| 224/224 [00:00<?, ?it/s]
100%|██████████| 224/224 [00:00<?, ?it/s]
100%|██████████| 224/224 [00:00<?, ?it/s]
100%|██████████| 224/224 [00:00<00:00, 185566.68it/s]
100%|██████████| 224/224 [00:00<00:00, 214895.72it/s]
100%|██████████| 224/224 [00:00<00:00, 224551.65it/s]
100%|██████████| 224/224 [00:00<?, ?it/s]
100%|██████████| 224/224 [00:00<00:00, 223909.46it/s]
100%|██████████| 224/224 [00:00<00:00, 558575.56it/s]
100%|██████████| 224/224 [00:00<?, ?it/s]
100%|██████████| 224/224 [00:00<00:00, 112181.98it/s]
100%|██████████| 224/224 [00:00<00:00, 71588.24it/s]
100%|██████████| 224/224 [00:00<?, ?it/s]
100%|██████████| 224/224 [00:00<00:00, 154526.99it/s]
100%|██████████| 224/224 [00:00<00:00, 224551.65it/s]
100%|██████████| 224/224 [00:00<?, ?it/s]
100%|██████████| 224/224 [00:00<00:00, 223802.79it/s]
100%|██████████| 224/224 [00:00<?, ?it/s]
100%|██████████| 224/224 [00:

Wall time: 520 ms


In [11]:
def vec_for_learning(model, tagged_docs):
    sents = tagged_docs.values
    targets, regressors = zip(*[(doc.tags[0], model.infer_vector(doc.words)) for doc in sents])
    return targets, regressors

In [12]:
y_train, X_train = vec_for_learning(model_dbow, train_tagged)
y_test, X_test = vec_for_learning(model_dbow, test_tagged)
logreg = LogisticRegression(n_jobs=1, C=1e5)
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)

from sklearn.metrics import accuracy_score, f1_score
print('Testing accuracy %s' % accuracy_score(y_test, y_pred))
print('Testing F1 score: {}'.format(f1_score(y_test, y_pred, average='weighted')))

Testing accuracy 0.6391752577319587
Testing F1 score: 0.6293915504346068


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


# New model

In [13]:
model_dmm = Doc2Vec(dm=1, dm_mean=1, vector_size=300, window=10, negative=0, min_count=1, workers=5, alpha=0.065, min_alpha=0.065)
model_dmm.build_vocab([x for x in tqdm(train_tagged.values)])

100%|██████████| 224/224 [00:00<00:00, 223536.54it/s]


In [14]:
%%time
for epoch in range(30):
    model_dmm.train(utils.shuffle([x for x in tqdm(train_tagged.values)]), total_examples=len(train_tagged.values), epochs=1)
    model_dmm.alpha -= 0.002
    model_dmm.min_alpha = model_dmm.alpha

100%|██████████| 224/224 [00:00<00:00, 225035.71it/s]
100%|██████████| 224/224 [00:00<00:00, 223536.54it/s]
100%|██████████| 224/224 [00:00<?, ?it/s]
100%|██████████| 224/224 [00:00<00:00, 224069.66it/s]
100%|██████████| 224/224 [00:00<?, ?it/s]
100%|██████████| 224/224 [00:00<00:00, 181375.31it/s]
100%|██████████| 224/224 [00:00<?, ?it/s]
100%|██████████| 224/224 [00:00<00:00, 223217.89it/s]
100%|██████████| 224/224 [00:00<?, ?it/s]
100%|██████████| 224/224 [00:00<00:00, 224069.66it/s]
100%|██████████| 224/224 [00:00<?, ?it/s]
100%|██████████| 224/224 [00:00<?, ?it/s]
100%|██████████| 224/224 [00:00<00:00, 224497.99it/s]
100%|██████████| 224/224 [00:00<?, ?it/s]
100%|██████████| 224/224 [00:00<00:00, 224497.99it/s]
100%|██████████| 224/224 [00:00<?, ?it/s]
100%|██████████| 224/224 [00:00<00:00, 209108.41it/s]
100%|██████████| 224/224 [00:00<?, ?it/s]
100%|██████████| 224/224 [00:00<?, ?it/s]
100%|██████████| 224/224 [00:00<?, ?it/s]
100%|██████████| 224/224 [00:00<00:00, 224283.62it/s

Wall time: 834 ms





In [15]:
y_train, X_train = vec_for_learning(model_dmm, train_tagged)
y_test, X_test = vec_for_learning(model_dmm, test_tagged)
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)
print('Testing accuracy %s' % accuracy_score(y_test, y_pred))
print('Testing F1 score: {}'.format(f1_score(y_test, y_pred, average='weighted')))

Testing accuracy 0.6391752577319587
Testing F1 score: 0.6293915504346068


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [16]:
from gensim.test.test_doc2vec import ConcatenatedDoc2Vec
new_model = ConcatenatedDoc2Vec([model_dbow, model_dmm])

In [17]:
def get_vectors(model, tagged_docs):
    sents = tagged_docs.values
    targets, regressors = zip(*[(doc.tags[0], model.infer_vector(doc.words)) for doc in sents])
    return targets, regressors

In [18]:
y_train, X_train = get_vectors(new_model, train_tagged)
y_test, X_test = get_vectors(new_model, test_tagged)
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)

print('Testing accuracy %s' % accuracy_score(y_test, y_pred))
print('Testing F1 score: {}'.format(f1_score(y_test, y_pred, average='weighted')))

Testing accuracy 0.6288659793814433
Testing F1 score: 0.6226311903524843


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [19]:
logreg.predict(X_test)

array([1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1,
       1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1,
       0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1,
       0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0,
       1, 0, 1, 0, 1, 0, 0, 0, 0])

In [20]:
import numpy as np

In [24]:
logreg.predict_proba(X_test[0].reshape(1,-1))

array([[0.05825969, 0.94174031]])

# TEST

In [37]:
input_text ='depress'

dft_tagged = pd.DataFrame([[input_text, 0]], columns = ['Text', 'MH']).apply(
    lambda r: TaggedDocument(words=tokenize_text(r['Text']), tags=[r.MH]), axis=1)

get_vectors(new_model, dft_tagged)[1]

logreg.predict_proba(get_vectors(new_model, dft_tagged)[1])

array([[0.07831633, 0.92168367]])

In [28]:
#To test old model
#input_text ='i am sad'

#dft_tagged = pd.DataFrame([[input_text, 0]], columns = ['Text', 'MH']).apply(
#    lambda r: TaggedDocument(words=tokenize_text(r['Text']), tags=[r.MH]), axis=1)

#vec_for_learning(model_dbow, dft_tagged)[1]

#logreg.predict_proba(vec_for_learning(model_dbow, dft_tagged)[1])

In [None]:
get_vectors(new_model, dft_tagged)[1]