In [107]:
import datetime as dt
import functools
import os
from string import punctuation

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import psycopg2
from bs4 import BeautifulSoup
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

# see a 2d representation of the data using PCA
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
# Feature standardizer with its copy / centering / scaling options spelled out explicitly.
# NOTE(review): `scaler` is not referenced by any other cell visible here — confirm it is still needed.
scaler = StandardScaler(copy=True, with_mean=True, with_std=True)

In [144]:
def log_lda_results(log_path='../logs/lda_results_log.txt'):
    """Append a placeholder "LDA Results" entry to a plain-text log file.

    The entry is made of a banner line, a timestamp, a dummy string and a
    dummy dictionary rendered as ``key : value`` lines; each section is
    terminated by a newline. The values are hard-coded test data.

    Parameters
    ----------
    log_path : str, optional
        File to append to. Defaults to the location the notebook has always
        used, so existing calls without arguments behave as before.

    Raises
    ------
    OSError
        If the log file cannot be opened for appending (for example when the
        parent directory does not exist).
    """
    header = '\n\n******************** LDA Results ********************'
    date_and_time = 'Timestamp: ' + str(dt.datetime.now())
    str1 = 'test3'
    d = {1:11, 2:22,3:33}
    d_s = ''.join(f'{k} : {v}\n' for k, v in d.items())

    lines = [section + '\n' for section in (header, date_and_time, str1, d_s)]

    # The context manager closes the handle even if the write raises,
    # which the manual open()/close() pair did not guarantee.
    with open(log_path, 'a') as file_log:
        file_log.writelines(lines)
    
# Smoke-test call: appends one placeholder entry to the log file every time this cell runs.
log_lda_results()

In [2]:
# Connection settings are read from the standard libpq environment variables when they
# are set, so real credentials never have to be typed into (or committed with) the
# notebook. The fallbacks are the original local-development values.
conn = psycopg2.connect(
    dbname=os.environ.get('PGDATABASE', 'therapist_predictor'),
    user=os.environ.get('PGUSER', 'postgres'),
    host=os.environ.get('PGHOST', 'localhost'),
    password=os.environ.get('PGPASSWORD', 'password'),
)

In [3]:
# One SELECT per table that the notebook loads into a DataFrame.
sql = "select * from therapists;"
sql_age = 'SELECT * FROM age_groups;'
sql_issues = 'SELECT  * FROM issues;'
sql_orientations = 'SELECT * FROM orientations'
sql_professions = 'SELECT * FROM professions'
sql_services = 'SELECT * FROM services'

try:
    df = pd.read_sql_query(sql, conn)
    df_age_groups = pd.read_sql_query(sql_age, conn)
    df_issues = pd.read_sql_query(sql_issues, conn)
    df_orientations = pd.read_sql_query(sql_orientations, conn)
    df_professions = pd.read_sql_query(sql_professions, conn)
    df_services = pd.read_sql_query(sql_services, conn)
finally:
    # Rebinding the name to None only drops the reference; close the connection
    # explicitly so the database session is released even if a query fails.
    if conn is not None:
        conn.close()
    conn = None

# NOTE(review): the rendered head contains names, addresses and phone numbers —
# clear this output before sharing the notebook.
df.head()

Unnamed: 0,therapist_id,first_name,last_name,street,primary_credential,license_status,website,info_source,creation_date,verified,...,years_in_practice,school,year_graduated,writing_sample,full_name,html_source_code,phone,state,city,zip_code
0,82,Bernadine,Merker,7000 E Belleview Ave Ste 350,Licensed Clinical Social Worker - CSW00992525,I'm a licensed professional.,http:www.merkercounseling.com,goodtherapy,2020-05-12 19:18:03.962988,True,...,,,,I have a masters degree in both Education and ...,Bernadine Merker,,303-770-0940,Colorado,Greenwood Village,80111
1,8,Jennifer,Adams,950 South Cherry Street,Psychologist - 3123,I'm a licensed professional.,http://www.bloomhealthdenver.com/,goodtherapy,2020-05-12 00:26:43.288912,True,...,,,,~Are you hoping to become a mother but are hav...,Jennifer Harned Adams,,303-325-1633,Colorado,Denver,80206
2,9,Eric,Eichler,1155 Sherman St,LCSW - CSW.09925366,I'm a licensed professional.,http://www.edgepsychotherapy.com,goodtherapy,2020-05-12 00:29:44.939505,True,...,,,,Are you struggling with aligning your life act...,Eric Eichler,,720-753-5801,Colorado,Denver,80203
3,83,Angie,Douglas,"12157 W.Cedar Dr, Ste 200",LPC - 0013676,I'm a licensed professional.,,goodtherapy,2020-05-12 19:18:06.295429,True,...,,,,You are unique and deserve to learn in the sty...,Angie Douglas,,720-419-1693,Colorado,Lakewood,80228
4,84,Jessica,Dolgan,1756 High Street,,I'm a licensed professional.,http:TherapyWithITS.com,goodtherapy,2020-05-12 19:18:09.370975,False,...,,,,PHILOSOPHY:\r We're a practice dedicated to se...,Jessica Dolgan,,303-388-8144,Colorado,Denver,80218


In [4]:
# Column-level overview of the therapists frame: dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 273 entries, 0 to 272
Data columns (total 22 columns):
therapist_id          273 non-null int64
first_name            273 non-null object
last_name             273 non-null object
street                237 non-null object
primary_credential    266 non-null object
license_status        273 non-null object
website               273 non-null object
info_source           273 non-null object
creation_date         273 non-null datetime64[ns]
verified              273 non-null bool
license_num           0 non-null object
license_state         0 non-null object
years_in_practice     0 non-null object
school                0 non-null object
year_graduated        0 non-null object
writing_sample        273 non-null object
full_name             273 non-null object
html_source_code      0 non-null object
phone                 272 non-null object
state                 273 non-null object
city                  273 non-null object
zip_code            

In [137]:
# Tokenizing with NLTK, with optional stemming/lemmatizing.
# Tokenization itself keeps punctuation tokens; remove_stopwords_and_punc can drop them afterwards.
def tokenize_doc(set_of_docs)->list:
    """Lower-case every document and split it into word tokens.

    Parameters
    ----------
    set_of_docs : iterable of str
        The raw documents.

    Returns
    -------
    list
        One list of tokens per document, in input order.
    """
    tokenized = []
    for content in set_of_docs:
        tokenized.append(word_tokenize(content.lower()))
    return tokenized

def remove_stopwords_and_punc(tokenized_set_of_docs:list, stop_words, remove_punc=True)->list:
    """Filter stop words (and optionally non-alphabetic tokens) out of each document.

    Parameters
    ----------
    tokenized_set_of_docs : list
        One list of string tokens per document.
    stop_words : container of str
        Tokens to discard; only membership tests are performed on it.
    remove_punc : bool, optional
        When true, every token for which ``str.isalpha()`` is false is dropped
        as well. Note this removes numbers and tokens such as ``"n't"``, not
        only punctuation.

    Returns
    -------
    list
        The filtered token lists, in the same order as the input documents.
    """
    def keep(token):
        # The alphabetic check comes first, mirroring the order of the filters.
        if remove_punc and not token.isalpha():
            return False
        return token not in stop_words

    return [[token for token in doc if keep(token)] for doc in tokenized_set_of_docs]

def stem_porter(tokenized_set_of_docs:list)->list:
    """Apply the Porter stemmer to every token of every document.

    Parameters
    ----------
    tokenized_set_of_docs : list
        One list of string tokens per document.

    Returns
    -------
    list
        Lists of stemmed tokens, parallel to the input.
    """
    stemmer = PorterStemmer()
    return [[stemmer.stem(token) for token in doc] for doc in tokenized_set_of_docs]

def stem_snowball(tokenized_set_of_docs:list)->list:
    """Apply the English Snowball stemmer to every token of every document.

    Parameters
    ----------
    tokenized_set_of_docs : list
        One list of string tokens per document.

    Returns
    -------
    list
        Lists of stemmed tokens, parallel to the input.
    """
    stemmer = SnowballStemmer('english')
    return [[stemmer.stem(token) for token in doc] for doc in tokenized_set_of_docs]

def lemm_wordnet(tokenized_set_of_docs:list)->list:
    """Lemmatize every token of every document with the WordNet lemmatizer.

    ``lemmatize`` is called without a part-of-speech argument, so the
    lemmatizer's default applies to every token.

    Parameters
    ----------
    tokenized_set_of_docs : list
        One list of string tokens per document.

    Returns
    -------
    list
        Lists of lemmatized tokens, parallel to the input.
    """
    lemmatizer = WordNetLemmatizer()
    return [[lemmatizer.lemmatize(token) for token in doc] for doc in tokenized_set_of_docs]

def text_tokenization_pipeline(list_of_docs : list, stop_words, remove_punc : bool, tokenizer='wordnet') -> list:
    """Tokenize documents, filter them, then optionally stem or lemmatize.

    Steps, in order: ``tokenize_doc`` -> ``remove_stopwords_and_punc`` ->
    the normalizer selected by ``tokenizer``.

    Parameters
    ----------
    list_of_docs : list
        Raw document strings.
    stop_words : container of str
        Tokens to discard after tokenization.
    remove_punc : bool
        Forwarded to ``remove_stopwords_and_punc``.
    tokenizer : str, optional
        Despite the name, this picks the normalization step: ``'wordnet'``
        (lemmatize), ``'porter'`` or ``'snowball'`` (stem). Any other value
        skips normalization and returns the filtered tokens as they are.

    Returns
    -------
    list
        One list of processed tokens per document.
    """
    filtered_docs = remove_stopwords_and_punc(
        tokenize_doc(list_of_docs), stop_words, remove_punc=remove_punc
    )

    if tokenizer == 'wordnet':
        return lemm_wordnet(filtered_docs)
    if tokenizer == 'porter':
        return stem_porter(filtered_docs)
    if tokenizer == 'snowball':
        return stem_snowball(filtered_docs)

    # Unrecognised selector: leave the filtered tokens untouched.
    return filtered_docs

def remove_dupes_decorator(func):
    """Decorate a function returning token lists so each list is de-duplicated.

    Parameters
    ----------
    func : callable
        A function returning an iterable of iterables of hashable items
        (for example one list of tokens per document).

    Returns
    -------
    callable
        A wrapper with the same signature and metadata as ``func`` that
        returns a list of lists in which every item appears once, in order of
        first appearance.
    """
    @functools.wraps(func)  # keep the wrapped function's __name__ / __doc__
    def func_wrapper(*args, **kwargs):
        docs = func(*args, **kwargs)

        # dict.fromkeys drops repeats while keeping first-seen order, so the
        # result is reproducible between runs (set ordering of strings is not).
        return [list(dict.fromkeys(tokens)) for tokens in docs]

    return func_wrapper

def get_stop_words(custom_stop_words):
    """Combine NLTK's English stop words with caller-supplied ones.

    Parameters
    ----------
    custom_stop_words : iterable of str
        Extra words to treat as stop words.

    Returns
    -------
    set
        The union of both word collections.
    """
    english_stop_words = set(stopwords.words('english'))
    return english_stop_words | set(custom_stop_words)

def display_topics(model, feature_names, num_top_words):
    """Print the highest-weighted features of every topic of a fitted model.

    Parameters
    ----------
    model : object
        Anything exposing ``components_``, an iterable with one weight vector
        per topic; each vector must provide ``argsort()``.
    feature_names : sequence of str
        Feature labels, indexable by the positions used in ``components_``.
    num_top_words : int
        How many features to list for each topic.

    Returns
    -------
    None
        Output goes to stdout: a ``Topic N:`` line followed by the features
        joined with `` - ``, heaviest first.
    """
    for topic_number, weights in enumerate(model.components_):
        print("Topic %d:" % (topic_number))
        # Positions of the largest weights first, cut to the requested count.
        heaviest_first = weights.argsort()[::-1][:num_top_words]
        print(" - ".join([feature_names[position] for position in heaviest_first]))
        
def display_topics2(model, feature_names, num_top_words)->None:
    """Print each topic's header followed by its top features.

    Produces the same output as ``display_topics``; the header and the feature
    line are both built before either is printed.

    Parameters
    ----------
    model : object
        Anything exposing ``components_``, an iterable with one weight vector
        per topic; each vector must provide ``argsort()``.
    feature_names : sequence of str
        Feature labels, indexable by the positions used in ``components_``.
    num_top_words : int
        How many features to list for each topic.
    """
    for topic_number, weights in enumerate(model.components_):
        heaviest_first = weights.argsort()[::-1][:num_top_words]
        header_line = "Topic %d:" % (topic_number)
        feature_line = " - ".join([feature_names[position] for position in heaviest_first])
        print(header_line)
        print(feature_line)

In [138]:
# Domain words that are filtered out in addition to the NLTK English stop words.
# (Repeated entries were removed; the list is turned into a set anyway.)
custom_stop_words = ['change',
 'family',
 'find',
 'approach',
 'couples',
 'issues',
 'also',
 'anxiety',
 'working',
 'experience',
 'relationship',
 'relationships',
 'therapist',
 'counseling',
 'people',
 'feel',
 'clients',
 'help',
 'work',
 'life',
 'therapy', 'psychotherapy', 'feeling', 'get', 'warson', 'way', 'practice']
#custom_stop_words = []
final_stop_words = get_stop_words(custom_stop_words)

# tokenizer='none' matches no stemmer/lemmatizer, so tokens are only lower-cased and filtered.
tokens = text_tokenization_pipeline(df['writing_sample'],stop_words=final_stop_words,
                                             remove_punc=True, tokenizer='none')

# CountVectorizer expects strings, so glue each document's tokens back together.
documents = [' '.join(doc) for doc in tokens]

# tfidf_vect = TfidfVectorizer(lowercase=False)
# tfidf_matrix = tfidf_vect.fit_transform(documents)
#print(tfidf_vect.get_feature_names())

# Trigram counts only, restricted to the 1000 features kept by max_features.
count_vect = CountVectorizer(max_features=1000, ngram_range=(3,3))
tf_matrix = count_vect.fit_transform(documents)

# Newer scikit-learn releases only provide get_feature_names_out();
# older ones only provide get_feature_names(). Support both.
if hasattr(count_vect, 'get_feature_names_out'):
    tf_feature_names = count_vect.get_feature_names_out()
else:
    tf_feature_names = count_vect.get_feature_names()

num_topics = 5
lda = LatentDirichletAllocation(n_components=num_topics, learning_offset = 50., verbose=1,
                                doc_topic_prior=1/num_topics, topic_word_prior= 1/num_topics,
                                n_jobs=-1, learning_method = 'online',
                                random_state=0)

lda.fit(tf_matrix)

num_top_n_grams = 10
# display_topics prints its report itself and returns None, so it must not be wrapped
# in print() (that emitted a stray "None" line after the topics).
display_topics(lda, tf_feature_names, num_top_n_grams)
print("\nModel perplexity: {0:0.3f}".format(lda.perplexity(tf_matrix)))

iteration: 1 of max_iter: 10
iteration: 2 of max_iter: 10
iteration: 3 of max_iter: 10
iteration: 4 of max_iter: 10
iteration: 5 of max_iter: 10
iteration: 6 of max_iter: 10
iteration: 7 of max_iter: 10
iteration: 8 of max_iter: 10
iteration: 9 of max_iter: 10
iteration: 10 of max_iter: 10
Topic 0:
free phone consultation - many different approaches - make necessary changes - takes lot guts - someone telling giving - children adolescents families - much compassion understanding - probably even make - power trail steps - private office located
Topic 1:
national certified counselor - american association marriage - call today discuss - loveland fort collins - day within hours - locations loveland fort - cultural backgrounds sexual - heal past trauma - listen without judgment - fort collins windsor
Topic 2:
licensed professional counselor - call today schedule - today schedule appointment - offers free initial - offer free consultation - professional counselor lpc - reimbursement insuranc

In [126]:
# Check which container CountVectorizer.fit_transform returned.
type(tf_matrix)

scipy.sparse.csr.csr_matrix

In [122]:
# Shape of the fitted model's components_ array (iterated topic by topic in display_topics).
lda.components_.shape

(5, 1000)

In [125]:
# Dimensions of the count matrix produced by count_vect.fit_transform(documents).
tf_matrix.shape

(273, 1000)

In [90]:
def get_most_freq_words(count_vectorizer, tf_matrix, num_words, print_dict_view=False):
    """Return the ``num_words`` most frequent vocabulary terms and their counts.

    Parameters
    ----------
    count_vectorizer : object
        A fitted vectorizer exposing ``get_feature_names_out()`` (newer
        scikit-learn) or ``get_feature_names()`` (older releases).
    tf_matrix : sparse matrix or array, shape (n_documents, n_terms)
        Term counts produced by ``count_vectorizer``; only ``sum(axis=0)`` is
        required of it.
    num_words : int
        How many terms to return. Values ``<= 0`` yield two empty lists.
    print_dict_view : bool, optional
        When true, also print one ``term : count`` line per returned term.

    Returns
    -------
    tuple of (list, list)
        ``(top_words, word_freqs)``, both in ascending order of count, i.e.
        the most frequent term comes last.
    """
    if num_words <= 0:
        # A slice of [-0:] would select every term, which is never what is asked for.
        return [], []

    # Use the vectorizer that was passed in (not a notebook-level global) and
    # support both the current and the legacy scikit-learn accessor.
    get_names = getattr(count_vectorizer, 'get_feature_names_out', None)
    if get_names is None:
        get_names = count_vectorizer.get_feature_names
    word_list = get_names()

    # Column totals; summing before flattening avoids densifying the whole matrix.
    count_list = np.asarray(tf_matrix.sum(axis=0)).ravel()

    top_words = []
    word_freqs = []
    for word_index in np.argsort(count_list)[-num_words:]:
        top_words.append(word_list[word_index])
        word_freqs.append(count_list[word_index])
        if print_dict_view:
            print(f'{word_list[word_index]} : {count_list[word_index]}')

    return top_words, word_freqs

In [91]:
# Twenty most frequent vocabulary terms, least frequent of the twenty first.
# NOTE(review): the saved output below lists single words although the modelling cell
# configures count_vect with ngram_range=(3,3) — it appears to come from an earlier
# kernel state; re-run to refresh.
words, counts = get_most_freq_words(count_vect, tf_matrix, 20)
words

['change',
 'family',
 'find',
 'approach',
 'couples',
 'issues',
 'also',
 'anxiety',
 'working',
 'experience',
 'relationship',
 'therapist',
 'counseling',
 'people',
 'feel',
 'clients',
 'help',
 'work',
 'life',
 'therapy']

In [70]:
# Scratch version of the counting step used in get_most_freq_words:
# vocabulary terms and their column totals over all documents.
word_list = count_vect.get_feature_names();
count_list = tf_matrix.toarray().sum(axis=0)

# Pair each term with its total count.
word_freq_dict = dict(zip(word_list, count_list))

# dict.values() returns a view object, so cast it to a list before slicing.
list(word_freq_dict.values())[:10]

[45, 86, 44, 34, 31, 16, 15, 14, 10, 57]

In [78]:
# Toy check: a negative slice on the array of dict values keeps the last two entries.
d = {1:11, 2:22, 3:33, 4:44}
np.array(list(d.values()))[-2:]

array([33, 44])

In [71]:
# Sort the dict: np.argsort on the values returns the indices that would sort them
# (ascending), not the values themselves.
np.argsort(list(word_freq_dict.values()))

array([910, 928, 752, 592, 303, 716, 918, 177, 856, 472, 181, 837, 767,
       331, 479,  70, 662, 805,  61, 844, 736, 621, 100, 971, 590, 470,
       846, 458, 559, 563, 816, 273, 567, 925, 249, 280, 660, 126, 920,
       854, 121, 288, 962, 443, 468, 830, 581, 469, 921, 749, 157,  42,
        15, 493,  22, 791, 711, 504,  39, 896, 384, 396,  30, 366, 673,
       870, 869, 510, 991, 381,  44, 996, 798, 644, 348, 779, 389, 119,
       876, 114, 753, 939, 466, 113, 110, 490, 926,  12, 545, 150, 265,
       763, 820, 160, 825, 143,  13, 383, 138, 956, 959,   8, 922, 506,
        11, 573, 679, 889, 742, 476, 799, 602, 862, 740, 108, 325,  64,
       236, 772,  72, 350, 376, 806, 407, 709,  36, 600, 599, 810,  49,
       640, 940, 201,  29, 240, 727,  32, 307, 976, 310, 701, 190, 259,
       481, 924, 513, 524, 464, 672, 518, 949, 828, 491, 713, 174, 229,
       840, 297, 409,  16, 786, 687, 688,  20, 260, 776,  69, 804, 789,
       394, 616,  58, 406, 336, 317,  10,  87, 302, 720, 269, 45

In [51]:
# Dump the full vocabulary and the dense document-term matrix.
# NOTE(review): both outputs are very large; consider slicing before printing.
print(count_vect.get_feature_names())
print(tf_matrix.toarray())

['ability', 'able', 'abuse', 'accept', 'acceptance', 'accepted', 'accepting', 'access', 'accomplish', 'achieve', 'achieving', 'act', 'action', 'actions', 'active', 'actively', 'actually', 'addiction', 'addictions', 'addition', 'additional', 'address', 'addressing', 'adolescent', 'adolescents', 'adult', 'adults', 'affect', 'ages', 'aim', 'alcohol', 'alive', 'alliance', 'allow', 'allowing', 'allows', 'almost', 'alone', 'along', 'alongside', 'already', 'also', 'although', 'always', 'american', 'andor', 'anger', 'another', 'answer', 'answers', 'anxiety', 'anxious', 'anyone', 'anything', 'appointment', 'appointments', 'approach', 'approaches', 'appropriate', 'area', 'areas', 'arise', 'around', 'art', 'arts', 'ask', 'asking', 'aspects', 'assist', 'association', 'atmosphere', 'attachment', 'attacks', 'attention', 'authentic', 'availability', 'available', 'avoid', 'aware', 'awareness', 'away', 'back', 'background', 'backgrounds', 'bad', 'balance', 'based', 'beautiful', 'become', 'becoming', 'b

In [76]:
def sort_coo(coo_matrix):
    """Return ``(column, value)`` pairs of a COO matrix, largest value first.

    Pairs are built from the matrix's ``col`` and ``data`` attributes and
    ordered by value, then by column index, both descending.
    """
    pairs = list(zip(coo_matrix.col, coo_matrix.data))
    pairs.sort(key=lambda pair: (pair[1], pair[0]), reverse=True)
    return pairs
 
def extract_topn_from_vector(feature_names, sorted_items, topn=10):
    """Translate the first ``topn`` ``(index, count)`` pairs into ``(name, count)`` pairs.

    Parameters
    ----------
    feature_names : sequence of str
        Names of the words/n-grams, indexable by the indices in ``sorted_items``.
    sorted_items : list of (int, number)
        ``(feature index, count)`` pairs, already in the desired order
        (e.g. as returned by ``sort_coo``).
    topn : int, optional
        Number of leading pairs to keep.

    Returns
    -------
    list of (str, number)
        ``(n-gram, count)`` tuples in the order given by ``sorted_items``.
    """
    return [(feature_names[idx], count) for idx, count in sorted_items[:topn]]
# Rank the terms of the first document (row 0 of tf_matrix) by descending count.
sorted_items=sort_coo(tf_matrix[0].tocoo())
 
# Feature names (words/n-grams), indexed by column position in the sparse matrix.
feature_names=count_vect.get_feature_names()
n_grams=extract_topn_from_vector(feature_names,sorted_items,10)

n_grams

[('clients', 6),
 ('work', 4),
 ('people', 4),
 ('build', 3),
 ('anxiety', 3),
 ('tools', 2),
 ('strength', 2),
 ('start', 2),
 ('social', 2),
 ('self', 2)]

In [7]:
# tf_matrix is sparse, so displaying a slice of it directly only shows a summary repr,
# not the counts. Slice the 10x10 corner first and densify just that part,
# instead of converting the whole matrix.
tf_matrix[0:10, 0:10].todense()

matrix([[0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
        [1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 1, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 1, 0, 0, 2, 0, 0, 1],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [1, 0, 1, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

In [9]:
# Remember: vocabulary_ gives the column INDEX of each term, not a count.
count_vect.vocabulary_

{'master': 548,
 'degree': 235,
 'education': 279,
 'social': 800,
 'work': 987,
 'solution': 802,
 'provide': 691,
 'tool': 893,
 'enhance': 304,
 'client': 153,
 'life': 513,
 'believe': 94,
 'one': 616,
 'technique': 871,
 'fit': 358,
 'people': 648,
 'approach': 56,
 'everyone': 320,
 'unique': 935,
 'need': 592,
 'taken': 864,
 'exploring': 333,
 'bring': 112,
 'self': 772,
 'acceptance': 4,
 'growth': 398,
 'change': 140,
 'include': 450,
 'anything': 54,
 'music': 585,
 'making': 541,
 'homework': 433,
 'grow': 397,
 'looking': 529,
 'family': 344,
 'hold': 429,
 'let': 510,
 'go': 385,
 'establish': 312,
 'reach': 711,
 'goal': 386,
 'support': 853,
 'guidance': 399,
 'together': 892,
 'effective': 281,
 'also': 42,
 'many': 544,
 'needed': 593,
 'resource': 739,
 'community': 174,
 'help': 422,
 'worked': 988,
 'various': 949,
 'problem': 684,
 'supported': 854,
 'process': 685,
 'insurance': 467,
 'blue': 104,
 'mountain': 580,
 'health': 415,
 'plan': 662,
 'behavioral': 92,