# Loading data and preliminary analysis

In [1]:
# Install dependencies 

%pip install -r '../requirements.txt'

import nltk 

nltk.download('punkt') 
nltk.download('averaged_perceptron_tagger') 

# Set paths 

path_to_csv = '../Dataset/Mental_Health_FAQ.csv'

Defaulting to user installation because normal site-packages is not writeable
Collecting cachetools==5.3.2 (from -r ../requirements.txt (line 3))
  Using cached cachetools-5.3.2-py3-none-any.whl.metadata (5.2 kB)
Collecting certifi==2023.11.17 (from -r ../requirements.txt (line 4))
  Using cached certifi-2023.11.17-py3-none-any.whl.metadata (2.2 kB)
Collecting click==8.1.7 (from -r ../requirements.txt (line 6))
  Using cached click-8.1.7-py3-none-any.whl.metadata (3.0 kB)
Collecting contourpy==1.1.1 (from -r ../requirements.txt (line 7))
  Using cached contourpy-1.1.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.9 kB)
Collecting cycler==0.12.1 (from -r ../requirements.txt (line 8))
  Using cached cycler-0.12.1-py3-none-any.whl.metadata (3.8 kB)
Collecting flatbuffers==23.5.26 (from -r ../requirements.txt (line 9))
  Using cached flatbuffers-23.5.26-py2.py3-none-any.whl.metadata (850 bytes)
Collecting fonttools==4.47.2 (from -r ../requirements.txt (line 10))
  

[nltk_data] Downloading package punkt to /home/ahmed244/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/ahmed244/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [2]:
import pandas as pd
import nltk 
import numpy as np
import re

from nltk.stem import wordnet                                  # to perform lemmitization
from sklearn.feature_extraction.text import CountVectorizer    # to perform bow
from sklearn.feature_extraction.text import TfidfVectorizer    # to perform tfidf
from nltk import pos_tag                                       # for parts of speech
from sklearn.metrics import pairwise_distances                 # to perfrom cosine similarity
from nltk import word_tokenize                                 # to create tokens
from nltk.corpus import stopwords                              # for stop words

In [3]:
df = pd.read_csv(path_to_csv, nrows = 20)
df.head()

Unnamed: 0,Question_ID,Questions,Answers
0,1590140,What does it mean to have a mental illness?,Mental illnesses are health conditions that di...
1,2110618,Who does mental illness affect?,It is estimated that mental illness affects 1 ...
2,6361820,What causes mental illness?,It is estimated that mental illness affects 1 ...
3,9434130,What are some of the warning signs of mental i...,Symptoms of mental health disorders vary depen...
4,7657263,Can people with mental illness recover?,"When healing from mental illness, early identi..."


In [4]:
df.isnull().sum()

Question_ID    0
Questions      0
Answers        0
dtype: int64

# Clean data using NLTK






In [9]:
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

from nltk import pos_tag, word_tokenize
s = "Ahmed is studying Artificial Intelligence."
tokens = word_tokenize(s)
print(pos_tag(tokens))


[nltk_data] Downloading package punkt to /home/ahmed244/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/ahmed244/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


[('Ahmed', 'NNP'), ('is', 'VBZ'), ('studying', 'VBG'), ('Artificial', 'JJ'), ('Intelligence', 'NNP'), ('.', '.')]


In [10]:
import nltk
nltk.download('omw-1.4')
nltk.download('wordnet')  # just in case

from nltk.stem import WordNetLemmatizer

lemma = WordNetLemmatizer()
print(lemma.lemmatize('absorbed', pos='v'))


absorb


[nltk_data] Downloading package omw-1.4 to /home/ahmed244/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package wordnet to /home/ahmed244/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [11]:
# nltk.download('wordnet')                    # uncomment if running the cell for the first time
lemma = wordnet.WordNetLemmatizer()         
lemma.lemmatize('absorbed', pos = 'v')        # lemmatize words

'absorb'

In [12]:
pos_tag(nltk.word_tokenize(s),tagset = None)       # returns the parts of speech of every word

[('Ahmed', 'NNP'),
 ('is', 'VBZ'),
 ('studying', 'VBG'),
 ('Artificial', 'JJ'),
 ('Intelligence', 'NNP'),
 ('.', '.')]

In [13]:
nltk.download('stopwords')            # uncomment if running the cell for the first time

stop = stopwords.words('english')
print(stop)

['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', "he'd", "he'll", 'her', 'here', 'hers', 'herself', "he's", 'him', 'himself', 'his', 'how', 'i', "i'd", 'if', "i'll", "i'm", 'in', 'into', 'is', 'isn', "isn't", 'it', "it'd", "it'll", "it's", 'its', 'itself', "i've", 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/ahmed244/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [14]:
# function that performs text normalization steps and returns the lemmatized tokens as a sentence

def text_normalization(text):
    text = str(text).lower()                        # text to lower case
    spl_char_text = re.sub(r'[^ a-z]','',text)      # removing special characters
    tokens = nltk.word_tokenize(spl_char_text)      # word tokenizing
    lema = wordnet.WordNetLemmatizer()              # intializing lemmatization
    tags_list = pos_tag(tokens,tagset=None)         # parts of speech
    lema_words = []                                 # empty list 
    for token,pos_token in tags_list:               # lemmatize according to POS
        if pos_token.startswith('V'):               # Verb
            pos_val = 'v'
        elif pos_token.startswith('J'):             # Adjective
            pos_val = 'a'
        elif pos_token.startswith('R'):             # Adverb
            pos_val = 'r'
        else:
            pos_val = 'n'                           # Noun
        lema_token = lema.lemmatize(token,pos_val)

        if lema_token in stop: 
          lema_words.append(lema_token)             # appending the lemmatized token into a list
    
    return " ".join(lema_words) 

In [15]:
text_normalization('telling you some stuffs about me')  # example

'you some about me'

In [16]:
df['lemmatized_text'] = df['Questions'].apply(text_normalization)   # clean text
df.head(5)

Unnamed: 0,Question_ID,Questions,Answers,lemmatized_text
0,1590140,What does it mean to have a mental illness?,Mental illnesses are health conditions that di...,what do it to have a
1,2110618,Who does mental illness affect?,It is estimated that mental illness affects 1 ...,who do
2,6361820,What causes mental illness?,It is estimated that mental illness affects 1 ...,what
3,9434130,What are some of the warning signs of mental i...,Symptoms of mental health disorders vary depen...,what be some of the of
4,7657263,Can people with mental illness recover?,"When healing from mental illness, early identi...",can with


In [17]:
cv = CountVectorizer()                                  # intializing the count vectorizer
X = cv.fit_transform(df['lemmatized_text']).toarray()

In [18]:
# returns all the unique word from data 

features = cv.get_feature_names_out()
df_bow = pd.DataFrame(X, columns = features)
df_bow.head()

Unnamed: 0,about,after,and,be,before,between,can,do,for,have,...,or,should,some,the,this,to,what,where,who,with
0,0,0,0,0,0,0,0,1,0,1,...,0,0,0,0,0,1,1,0,0,0
1,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3,0,0,0,1,0,0,0,0,0,0,...,0,0,1,1,0,0,1,0,0,0
4,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [19]:
Question = 'What treatment options are available'                           # example

In [20]:
Question_lemma = text_normalization(Question)                               # clean text
Question_bow = cv.transform([Question_lemma]).toarray()                     # applying bow

# Cosine similarity

In [21]:
# cosine similarity for the above question we considered.

cosine_value = 1- pairwise_distances(df_bow, Question_bow, metric = 'cosine' )
(cosine_value)

array([[0.31622777],
       [0.        ],
       [0.70710678],
       [0.5       ],
       [0.        ],
       [0.23570226],
       [0.        ],
       [1.        ],
       [0.31622777],
       [0.70710678],
       [0.        ],
       [0.31622777],
       [0.        ],
       [0.40824829],
       [0.25      ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.70710678],
       [0.        ]])

In [22]:
df['similarity_bow'] = cosine_value                                         # create cosine value as a new column

In [23]:
simiscores = pd.DataFrame(df, columns=['Answers','similarity_bow'])         # taking similarity value of responses for the question we took
simiscores

Unnamed: 0,Answers,similarity_bow
0,Mental illnesses are health conditions that di...,0.316228
1,It is estimated that mental illness affects 1 ...,0.0
2,It is estimated that mental illness affects 1 ...,0.707107
3,Symptoms of mental health disorders vary depen...,0.5
4,"When healing from mental illness, early identi...",0.0
5,Although this website cannot substitute for pr...,0.235702
6,Feeling comfortable with the professional you ...,0.0
7,Just as there are different types of medicatio...,1.0
8,Since beginning treatment is a big step for in...,0.316228
9,There are many types of mental health professi...,0.707107


In [24]:
simscoresDescending = simiscores.sort_values(by = 'similarity_bow', ascending=False)          # sorting the values
simscoresDescending.head()

Unnamed: 0,Answers,similarity_bow
7,Just as there are different types of medicatio...,1.0
2,It is estimated that mental illness affects 1 ...,0.707107
18,There are many types of mental health professi...,0.707107
9,There are many types of mental health professi...,0.707107
3,Symptoms of mental health disorders vary depen...,0.5


In [25]:
threshold = 0.1                                                                         # considering the value of smiliarity to be greater than 0.1
df_threshold = simscoresDescending[simscoresDescending['similarity_bow'] > threshold] 
df_threshold

Unnamed: 0,Answers,similarity_bow
7,Just as there are different types of medicatio...,1.0
2,It is estimated that mental illness affects 1 ...,0.707107
18,There are many types of mental health professi...,0.707107
9,There are many types of mental health professi...,0.707107
3,Symptoms of mental health disorders vary depen...,0.5
13,The best source of information regarding medic...,0.408248
0,Mental illnesses are health conditions that di...,0.316228
8,Since beginning treatment is a big step for in...,0.316228
11,Beginning treatment is a big step for individu...,0.316228
14,It is not uncommon for people to stop taking t...,0.25


In [26]:
index_value = cosine_value.argmax()         # index number of highest value
index_value

7

In [27]:
df['Answers'].loc[index_value]              # The text at the above index becomes the response for the question

'Just as there are different types of medications for physical illness, different treatment options are available for individuals with mental illness. Treatment works differently for different people. It is important to find what works best for you or your child.'

# Tf-Idf

In [28]:
Question1 = 'What treatment options are available'

In [29]:
# using tf-idf

tfidf = TfidfVectorizer()                                             # intializing tf-id 
x_tfidf = tfidf.fit_transform(df['lemmatized_text']).toarray()        # transforming the data into array

In [30]:
Question_lemma1 = text_normalization(Question1)
Question_tfidf = tfidf.transform([Question_lemma1]).toarray()         # applying tf-idf

In [31]:
# returns all the unique word from data with a score of that word

df_tfidf = pd.DataFrame(x_tfidf,columns = tfidf.get_feature_names_out()) 
df_tfidf.head()

Unnamed: 0,about,after,and,be,before,between,can,do,for,have,...,or,should,some,the,this,to,what,where,who,with
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.390535,0.0,0.54821,...,0.0,0.0,0.0,0.0,0.0,0.390535,0.306424,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.580211,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.814466,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.322925,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.480407,0.34906,0.0,0.0,0.236038,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.461191,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.887301


In [32]:
cos = 1-pairwise_distances(df_tfidf,Question_tfidf,metric='cosine')                     # applying cosine similarity
cos

array([[0.18082234],
       [0.        ],
       [0.59010473],
       [0.39999298],
       [0.        ],
       [0.13061405],
       [0.        ],
       [1.        ],
       [0.19009616],
       [0.55866459],
       [0.        ],
       [0.19009616],
       [0.        ],
       [0.20429786],
       [0.23321347],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.62955486],
       [0.        ]])

In [33]:
df['similarity_tfidf'] = cos                                                    # creating a new column 
df_simi_tfidf = pd.DataFrame(df, columns=['Answers','similarity_tfidf'])        # taking similarity value of responses for the question we took
df_simi_tfidf

Unnamed: 0,Answers,similarity_tfidf
0,Mental illnesses are health conditions that di...,0.180822
1,It is estimated that mental illness affects 1 ...,0.0
2,It is estimated that mental illness affects 1 ...,0.590105
3,Symptoms of mental health disorders vary depen...,0.399993
4,"When healing from mental illness, early identi...",0.0
5,Although this website cannot substitute for pr...,0.130614
6,Feeling comfortable with the professional you ...,0.0
7,Just as there are different types of medicatio...,1.0
8,Since beginning treatment is a big step for in...,0.190096
9,There are many types of mental health professi...,0.558665


In [34]:
df_simi_tfidf_sort = df_simi_tfidf.sort_values(by='similarity_tfidf', ascending=False)            # sorting the values
df_simi_tfidf_sort.head(10)

Unnamed: 0,Answers,similarity_tfidf
7,Just as there are different types of medicatio...,1.0
18,There are many types of mental health professi...,0.629555
2,It is estimated that mental illness affects 1 ...,0.590105
9,There are many types of mental health professi...,0.558665
3,Symptoms of mental health disorders vary depen...,0.399993
14,It is not uncommon for people to stop taking t...,0.233213
13,The best source of information regarding medic...,0.204298
8,Since beginning treatment is a big step for in...,0.190096
11,Beginning treatment is a big step for individu...,0.190096
0,Mental illnesses are health conditions that di...,0.180822


In [35]:
threshold = 0.1                                                                                   # considering the value of smiliarity to be greater than 0.1
df_threshold = df_simi_tfidf_sort[df_simi_tfidf_sort['similarity_tfidf'] > threshold] 
df_threshold

Unnamed: 0,Answers,similarity_tfidf
7,Just as there are different types of medicatio...,1.0
18,There are many types of mental health professi...,0.629555
2,It is estimated that mental illness affects 1 ...,0.590105
9,There are many types of mental health professi...,0.558665
3,Symptoms of mental health disorders vary depen...,0.399993
14,It is not uncommon for people to stop taking t...,0.233213
13,The best source of information regarding medic...,0.204298
8,Since beginning treatment is a big step for in...,0.190096
11,Beginning treatment is a big step for individu...,0.190096
0,Mental illnesses are health conditions that di...,0.180822


In [36]:
index_value1 = cos.argmax()                                                   # returns the index number of highest value
index_value1

7

In [37]:
df['Answers'].loc[index_value1]                                               # returns the text at that index

'Just as there are different types of medications for physical illness, different treatment options are available for individuals with mental illness. Treatment works differently for different people. It is important to find what works best for you or your child.'

# Testing chatbot

In [38]:
# defining a function that returns response to query using bow

def chat_bow(text):
    lemma = text_normalization(text) # calling the function to perform text normalization
    bow = cv.transform([lemma]).toarray() # applying bow
    cosine_value = 1- pairwise_distances(df_bow,bow, metric = 'cosine' )
    index_value = cosine_value.argmax() # getting index value 
    return df['Answers'].loc[index_value]

In [39]:
chat_bow('can you prevent mental health problems')

'When healing from mental illness, early identification and treatment are of vital importance. Based on the nature of the illness, there are a range of effective treatments available. For any type of treatment, it is essential that the person affected is proactive and fully engaged in their own recovery process.\nMany people with mental illnesses who are diagnosed and treated respond well, although some might experience a return of symptoms. Even in such cases, with careful monitoring and management of the disorder, it is still quite possible to live a fulfilled and productive life.'

In [40]:
chat_bow('what is mental health')

'Just as there are different types of medications for physical illness, different treatment options are available for individuals with mental illness. Treatment works differently for different people. It is important to find what works best for you or your child.'

In [41]:
chat_bow('are there cures for mental health problems')

'Just as there are different types of medications for physical illness, different treatment options are available for individuals with mental illness. Treatment works differently for different people. It is important to find what works best for you or your child.'

In [42]:
chat_bow('how do I know if i am unwell')

'It is not uncommon for people to stop taking their medication when they feel their symptoms are under control. Others may choose to stop taking their medication because of its side effects, without realizing that most side effects can be effectively managed. While it may seem reasonable to stop taking the medication, the problem is that most often, the symptoms will return. If you or your child is taking medication, it is very important that you work together with your doctor before making decisions about any changes in your treatment.\\nAnother problem with stopping medication, particularly for stopping it abruptly, is that you may develop withdrawal symptoms that can be very unpleasant. If you and your doctor feel a trial off your medicine is a good idea, it is necessary to slowly decrease the dosage of medications so that these symptoms don’t occur.\\nIt is important that your doctor and pharmacist work together to make sure your medications are working safely and effectively. You 

In [43]:
chat_bow('what do you mean by mental health')

'It is estimated that mental illness affects 1 in 5 adults in America, and that 1 in 24 adults have a serious mental illness. Mental illness does not discriminate; it can affect anyone, regardless of gender, age, income, social status, ethnicity, religion, sexual orientation, or background. Although mental illness can affect anyone, certain conditions may be more common in different populations. For instance, eating disorders tend to occur more often in females, while disorders such as attention deficit/hyperactivity disorder is more prevalent in children. Additionally, all ages are susceptible, but the young and the old are especially vulnerable. Mental illnesses usually strike individuals in the prime of their lives, with 75 percent of mental health conditions developing by the age of 24. This makes identification and treatment of mental disorders particularly difficult, because the normal personality and behavioral changes of adolescence may mask symptoms of a mental health conditio

In [44]:
# defining a function that returns response to query using tf-idf

def chat_tfidf(text):
    lemma = text_normalization(text) # calling the function to perform text normalization
    tf = tfidf.transform([lemma]).toarray() # applying tf-idf
    cos = 1-pairwise_distances(df_tfidf,tf,metric='cosine') # applying cosine similarity
    index_value = cos.argmax() # getting index value 
    return df['Answers'].loc[index_value]

In [45]:
chat_tfidf('how do i see a counsellor')

'Some pharmaceutical companies offer prescription assistance programs to individuals and families with financial needs, while others offer special drug discount cards. These programs typically require a doctor’s consent and proof of your financial status. They may also require that you have either no health insurance or no prescription drug benefit through your health insurance.\\nIn addition, there are prescription programs you might qualify for. Visit Healthfinder.gov to learn more.'

In [46]:
chat_tfidf('how to find a support group')

'Some pharmaceutical companies offer prescription assistance programs to individuals and families with financial needs, while others offer special drug discount cards. These programs typically require a doctor’s consent and proof of your financial status. They may also require that you have either no health insurance or no prescription drug benefit through your health insurance.\\nIn addition, there are prescription programs you might qualify for. Visit Healthfinder.gov to learn more.'