In [1]:
#https://radimrehurek.com/gensim/auto_examples/core/run_topics_and_transformations.html

import gensim
import pandas as pd
import numpy as np
import spacy
import operator
import re
import string
import json
import codecs

df_schemes = pd.read_csv('../df.csv', encoding='cp1252')#Scheme metadata table; search hits are mapped back to its rows (name, description, agency, ...) by row label
dictionary = gensim.corpora.Dictionary.load('dictionary') #Pre-built gensim dictionary; converts the tokenised query into a bag-of-words vector

schemes_tfidf_model = gensim.models.TfidfModel.load("tfidf.model") #Re-weights the query bag-of-words before it is fed into the LSI model

schemes_lsi_model = gensim.models.LsiModel.load("lsi.model") #Final model: projects the TF-IDF query into LSI space
schemes_lsi_corpus = gensim.corpora.MmCorpus('schemes_lsi_model_mm') #Stored corpus the MatrixSimilarity index is built from (presumably the schemes already projected into LSI space - confirm)

#For mentalhealth
import pickle
from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer, TfidfVectorizer
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import PorterStemmer

#Artefacts for the mental-health classifier: the fitted vectoriser (its .transform is applied
#to the stemmed query) and the model whose .predict_proba scores the query.
#NOTE: pickle.load can execute arbitrary code stored in the file - only load artefacts from a trusted source.
#The context managers close the file handles, which the bare open(...) calls left open.
with open('tvec', 'rb') as tvec_file:
    tvec_optimised = pickle.load(tvec_file)
with open('mentalhealth', 'rb') as mhmodel_file:
    mhmodel = pickle.load(mhmodel_file)

In [2]:
%time spacy_nlp = spacy.load('en_core_web_sm')

#create list of punctuations and stopwords
punctuations = string.punctuation
#NOTE: this binds spaCy's own STOP_WORDS set (no copy is made), so the edits below change that shared set in place
stop_words = spacy.lang.en.stop_words.STOP_WORDS
#words removed from the stop-word set so that they survive tokenisation
to_delete = ["alone","themselves"]
for elem in to_delete:
    stop_words.discard(elem)
#"client" is treated as a stop word (presumably too common in the scheme text to be informative - confirm)
stop_words.add("client")

def spacy_tokenizer(sentence):
    """Clean a raw text string and return its list of normalised tokens.

    The text goes through a fixed sequence of regex clean-up passes, is parsed
    with the module-level `spacy_nlp` pipeline, and every token is reduced to
    its lower-cased lemma (pronouns, whose lemma is "-PRON-", keep their
    lower-cased surface form instead). Tokens that are stop words, that occur
    inside the punctuation string, or that are 2 characters or shorter are
    dropped.

    Args:
        sentence: the raw text to tokenise.

    Returns:
        A list of token strings, in document order.
    """
    #Clean-up passes, applied strictly in this order.
    cleanup_passes = (
        ('\'', ''),             #remove distracting single quotes
        (' +', ' '),            #squeeze runs of spaces into one space
        #NOTE: the two patterns containing '' can never match here, because
        #every single quote has already been stripped by the first pass.
        (r'\n: \'\'.*', ''),    #lines starting with : ''
        (r'\n!.*', ''),         #lines starting with !
        (r'^:\'\'.*', ''),      #text starting with :''
        (r'\n', ' '),           #remaining newlines become spaces
        (r'[^\w\s]', ' '),      #punctuation becomes spaces
    )
    cleaned = sentence
    for pattern, replacement in cleanup_passes:
        cleaned = re.sub(pattern, replacement, cleaned)

    kept = []
    for token in spacy_nlp(cleaned):
        #lower, strip and lemmatize (pronouns fall back to their own text)
        if token.lemma_ == "-PRON-":
            normalised = token.lower_
        else:
            normalised = token.lemma_.lower().strip()

        #exclude tokens of 2 characters or fewer
        if len(normalised) <= 2:
            continue
        #exclude stop words and anything found inside the punctuation string
        if normalised in stop_words or normalised in punctuations:
            continue
        kept.append(normalised)
    return kept

#create stemmer for mentalhealth
porter = PorterStemmer()
def stemSentence(sentence):
    """Return `sentence` with every word replaced by its stem.

    The text is split with `word_tokenize`, each token is stemmed with the
    module-level `porter` stemmer, and the stems are glued back together with
    a single space after each one (so a non-empty result ends with a space).

    Args:
        sentence: the text to stem.

    Returns:
        The stemmed text as one string; "" when there are no tokens.
    """
    return "".join(porter.stem(token) + " " for token in word_tokenize(sentence))

Wall time: 1.71 s


In [61]:
from gensim.similarities import MatrixSimilarity

%time schemes_index = MatrixSimilarity(schemes_lsi_corpus, num_features = schemes_lsi_corpus.num_terms)

#Search similarity: helpers and state used by search_similar_schemes below

from operator import itemgetter

#running count of search_similar_schemes calls (reported as "number_requests_till_date"); re-running this cell resets it to 0
counter = 0

#lower-case substrings that mark a scheme as mental-health related
_MH_KEYWORDS = ('mental health', 'counselling', 'emotional care', 'casework')


def _mentions_mental_health(text):
    """Return True when `text` is a string containing any of _MH_KEYWORDS (case-insensitive)."""
    if not isinstance(text, str):
        #missing CSV cells are read as NaN (a float), which has no .lower()
        return False
    lowered = text.lower()
    return any(keyword in lowered for keyword in _MH_KEYWORDS)


def search_similar_schemes(search_term):
    """Rank the schemes in `df_schemes` by similarity to a free-text query.

    The query is tokenised with `spacy_tokenizer`, converted to
    bag-of-words -> TF-IDF -> LSI and matched against `schemes_index`
    (at most 50 hits). Separately, the stemmed query is scored by `mhmodel`;
    when that probability is above 0.55, every hit whose 'Scheme Type' or
    'What it gives' text mentions a mental-health keyword has its relevance
    multiplied by 1.05. Hits whose relevance is not above 20 are dropped.

    Args:
        search_term: the user's query text.

    Returns:
        A dict with three keys:
            "mh": probability of class 1 from `mhmodel` for the query,
            "number_requests_till_date": value of the global `counter`
                after this call,
            "data": list of result records (dicts) sorted by descending
                'Relevance'.

    Side effects:
        Increments the module-level `counter` and sets
        `schemes_index.num_best` to 50.
    """
    global counter

    query_bow = dictionary.doc2bow(spacy_tokenizer(search_term))
    query_lsi = schemes_lsi_model[schemes_tfidf_model[query_bow]]

    schemes_index.num_best = 50
    matches = sorted(schemes_index[query_lsi], key=itemgetter(1), reverse=True)

    records = []
    for doc_id, similarity in matches[:schemes_index.num_best]:
        records.append(
            {
                'Relevance': round((similarity * 100), 2),
                'Scheme': df_schemes.at[doc_id, 'Scheme'],
                'Description': df_schemes.at[doc_id, 'Description'],
                'Agency': df_schemes.at[doc_id, 'Agency'],
                'Image': df_schemes.at[doc_id, 'Image'],
                'Link': df_schemes.at[doc_id, 'Link'],
                'What it gives': df_schemes.at[doc_id, 'What it gives'],
                #NOTE(review): 'Scheme Type' is filled from the 'What it gives'
                #column, which looks like a copy-paste slip; kept as-is because
                #the CSV schema is not visible here - confirm and point it at
                #the intended column.
                'Scheme Type': df_schemes.at[doc_id, 'What it gives'],
            }
        )

    #for MH
    #.toarray() yields a plain ndarray; .todense() yields np.matrix, which
    #recent scikit-learn versions refuse as estimator input.
    stemmed_query = str(stemSentence(search_term))
    query_features = tvec_optimised.transform([stemmed_query]).toarray()
    mhprob = float(mhmodel.predict_proba(query_features)[0][1])

    output = pd.DataFrame(records, columns=['Relevance','Scheme','Description', 'Agency', 'Image', 'Link', 'What it gives', 'Scheme Type'])

    #Boost only the schemes that actually mention a mental-health keyword.
    #(The previous `'a' or 'b' in text` test was always true, so every scheme
    #was boosted whenever mhprob exceeded the threshold.)
    if mhprob > 0.55 and not output.empty:
        is_mh_scheme = (output['Scheme Type'].map(_mentions_mental_health)
                        | output['What it gives'].map(_mentions_mental_health))
        output.loc[is_mh_scheme, 'Relevance'] *= 1.05

    output = output.sort_values(by=['Relevance'], ascending=False)
    output = output[output['Relevance'] > 20]
    data = json.loads(output.to_json(orient="records"))

    counter = counter + 1
    return {
        "mh": mhprob,
        "number_requests_till_date": counter,
        "data": data,
    }

Wall time: 380 ms


In [70]:
search_similar_schemes('trauma')

{'mh': 0.46260387811634357,
 'number_requests_till_date': 9,
 'data': [{'Relevance': 89.96,
   'Scheme': 'HCSA Dayspring Residential Treatment Centre',
   'Description': 'Therapeutic Group Home service model for teenage girls who have suffered the complex trauma of physical, sexual or emotional abuse. The model consists of two evidence-based practices namely Trauma Systems Therapy (TST) and Residential Management System (RMS).',
   'Agency': 'HCSA Community Services',
   'Image': 'https://chidnast.sirv.com/SchemesSG/hcsa.jpg',
   'Link': 'https://www.hcsa.org.sg/programmes/dayspring-rtc/',
   'What it gives': 'Educational programmes,Emotional care',
   'Scheme Type': 'Educational programmes,Emotional care'},
  {'Relevance': 31.87,
   'Scheme': 'Sexual Assualt Care Centre',
   'Description': 'The Care Centre provides safe, free and confidential services for anyone who has faced sexual assault and/or sexual harassment, even if it happened years ago. If you need help, or feel unsure about