In [37]:
import functools
import json

import numpy as np
import pandas as pd
import psycopg2
from bs4 import BeautifulSoup
from pymongo import MongoClient, errors

from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize

from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

In [38]:
# Open the PostgreSQL connection used to load the therapists table.
# The password is deliberately not stored in the notebook: when no `password`
# argument is given, libpq resolves it from the PGPASSWORD environment variable
# or from ~/.pgpass, so set one of those before running this cell.
conn = psycopg2.connect(dbname='therapist_predictor', user='postgres', host='localhost')

In [39]:
# Load the whole therapists table into a DataFrame, then release the connection.
sql = "select * from therapists;"
try:
    df = pd.read_sql_query(sql, conn)
finally:
    # Rebinding the name to None does not close the connection by itself;
    # close it explicitly so the server session is released right away.
    if conn is not None:
        conn.close()
    conn = None
df.head()

Unnamed: 0,therapist_id,first_name,last_name,street,primary_credential,license_status,website,info_source,creation_date,verified,...,years_in_practice,school,year_graduated,writing_sample,full_name,html_source_code,phone,state,city,zip_code
0,82,Bernadine,Merker,7000 E Belleview Ave Ste 350,Licensed Clinical Social Worker - CSW00992525,I'm a licensed professional.,http:www.merkercounseling.com,goodtherapy,2020-05-12 19:18:03.962988,True,...,,,,I have a masters degree in both Education and ...,Bernadine Merker,,303-770-0940,Colorado,Greenwood Village,80111
1,8,Jennifer,Adams,950 South Cherry Street,Psychologist - 3123,I'm a licensed professional.,http://www.bloomhealthdenver.com/,goodtherapy,2020-05-12 00:26:43.288912,True,...,,,,~Are you hoping to become a mother but are hav...,Jennifer Harned Adams,,303-325-1633,Colorado,Denver,80206
2,9,Eric,Eichler,1155 Sherman St,LCSW - CSW.09925366,I'm a licensed professional.,http://www.edgepsychotherapy.com,goodtherapy,2020-05-12 00:29:44.939505,True,...,,,,Are you struggling with aligning your life act...,Eric Eichler,,720-753-5801,Colorado,Denver,80203
3,83,Angie,Douglas,"12157 W.Cedar Dr, Ste 200",LPC - 0013676,I'm a licensed professional.,,goodtherapy,2020-05-12 19:18:06.295429,True,...,,,,You are unique and deserve to learn in the sty...,Angie Douglas,,720-419-1693,Colorado,Lakewood,80228
4,84,Jessica,Dolgan,1756 High Street,,I'm a licensed professional.,http:TherapyWithITS.com,goodtherapy,2020-05-12 19:18:09.370975,False,...,,,,PHILOSOPHY:\r We're a practice dedicated to se...,Jessica Dolgan,,303-388-8144,Colorado,Denver,80218


In [17]:
# Summarise the frame: column names, non-null counts, dtypes and memory usage.
# NOTE(review): the recorded output below lists only two columns, unlike the
# `select *` frame shown earlier — presumably it was captured from an earlier
# run with a narrower query; re-run the notebook top to bottom to refresh it.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 184 entries, 0 to 183
Data columns (total 2 columns):
therapist_id      184 non-null int64
writing_sample    184 non-null object
dtypes: int64(1), object(1)
memory usage: 3.0+ KB


In [24]:
def tokenize_doc(set_of_docs)->list:
    """Lowercase every document and split it into NLTK word tokens.

    Punctuation tokens are kept at this stage; stop-word removal and
    stemming/lemmatizing are separate steps.

    Parameters
    ----------
    set_of_docs : iterable
        Documents to tokenize; each item must support ``.lower()``.

    Returns
    -------
    list
        One token list per document, in input order.
    """
    token_lists = []
    for raw_text in set_of_docs:
        token_lists.append(word_tokenize(raw_text.lower()))
    return token_lists

def remove_stopwords_and_punc(tokenized_set_of_docs:list, stop_words, remove_punc=True)->list:
    """Filter stop words (and optionally non-alphabetic tokens) out of each document.

    Parameters
    ----------
    tokenized_set_of_docs : list
        One list of string tokens per document.
    stop_words : container
        Tokens to drop; membership is checked with ``in``.
    remove_punc : bool, default True
        When true, every token for which ``str.isalpha()`` is false is also
        dropped. That covers punctuation, but also numbers and tokens that
        contain apostrophes or hyphens.

    Returns
    -------
    list
        One filtered token list per input document, order preserved.
    """
    def _keep(token):
        # The alphabetic check runs first, as a cheap filter before the
        # stop-word membership lookup.
        if remove_punc and not token.isalpha():
            return False
        return token not in stop_words

    return [[token for token in tokens if _keep(token)]
            for tokens in tokenized_set_of_docs]

def stem_porter(tokenized_set_of_docs:list)->list:
    """Stem every token of every document with NLTK's ``PorterStemmer``.

    Parameters
    ----------
    tokenized_set_of_docs : list
        One list of string tokens per document.

    Returns
    -------
    list
        One list of stemmed tokens per document, order preserved.
    """
    stemmer = PorterStemmer()
    return [[stemmer.stem(token) for token in tokens]
            for tokens in tokenized_set_of_docs]

def stem_snowball(tokenized_set_of_docs:list)->list:
    """Stem every token of every document with NLTK's English ``SnowballStemmer``.

    Parameters
    ----------
    tokenized_set_of_docs : list
        One list of string tokens per document.

    Returns
    -------
    list
        One list of stemmed tokens per document, order preserved.
    """
    stemmer = SnowballStemmer('english')
    return [[stemmer.stem(token) for token in tokens]
            for tokens in tokenized_set_of_docs]

def lemm_wordnet(tokenized_set_of_docs:list)->list:
    """Lemmatize every token of every document with NLTK's ``WordNetLemmatizer``.

    ``lemmatize`` is called without a part-of-speech argument, so the
    lemmatizer's default is used for every token.

    Parameters
    ----------
    tokenized_set_of_docs : list
        One list of string tokens per document.

    Returns
    -------
    list
        One list of lemmatized tokens per document, order preserved.
    """
    lemmatizer = WordNetLemmatizer()
    return [[lemmatizer.lemmatize(token) for token in tokens]
            for tokens in tokenized_set_of_docs]

def text_tokenization_pipeline(list_of_docs : list, stop_words, remove_punc : bool, tokenizer='wordnet') -> list:
    """Tokenize, filter and normalise a collection of documents.

    Steps, in order: lowercase + word-tokenize, drop stop words (and
    non-alphabetic tokens when ``remove_punc`` is true), then stem or
    lemmatize with the selected normaliser.

    Parameters
    ----------
    list_of_docs : list
        Raw document texts.
    stop_words : container
        Tokens to remove.
    remove_punc : bool
        Forwarded to ``remove_stopwords_and_punc``.
    tokenizer : str, default 'wordnet'
        ``'wordnet'`` selects WordNet lemmatizing and ``'porter'`` selects
        Porter stemming. Any other value falls through to Snowball stemming.

    Returns
    -------
    list
        One list of normalised tokens per document.
    """
    filtered_docs = remove_stopwords_and_punc(
        tokenize_doc(list_of_docs), stop_words, remove_punc=remove_punc
    )

    if tokenizer == 'wordnet':
        return lemm_wordnet(filtered_docs)
    if tokenizer == 'porter':
        return stem_porter(filtered_docs)
    # Every unrecognised name ends up here, not only 'snowball'.
    return stem_snowball(filtered_docs)

def remove_dupes_decorator(func):
    """Decorate a function returning a list of token lists so each list is de-duplicated.

    The wrapped function is called unchanged; every inner iterable of its
    result is then reduced to its distinct items, keeping the order in which
    each item first appears.

    Parameters
    ----------
    func : callable
        Function returning an iterable of iterables of hashable items.

    Returns
    -------
    callable
        Wrapper with the same call signature and metadata as ``func`` that
        returns a list of lists without repeated items.
    """
    @functools.wraps(func)  # keep __name__/__doc__ of the decorated function
    def func_wrapper(*args, **kwargs):
        docs = func(*args, **kwargs)

        # dict.fromkeys keeps first-seen order; set() iteration order for
        # strings can differ between interpreter runs, which made the result
        # non-reproducible.
        return [list(dict.fromkeys(tokens)) for tokens in docs]

    return func_wrapper

def get_stop_words(custom_stop_words):
    """Return NLTK's English stop words combined with caller-supplied extras.

    Parameters
    ----------
    custom_stop_words : iterable
        Additional tokens to treat as stop words; may be empty.

    Returns
    -------
    set
        Union of the NLTK English stop-word list and ``custom_stop_words``.
    """
    combined = set(stopwords.words('english'))
    combined.update(custom_stop_words)
    return combined

def display_topics(model, feature_names, num_top_words):
    """Print the highest-weighted words of every topic in a fitted model.

    For each row of ``model.components_`` a ``Topic <n>:`` header is printed,
    followed by one line with the ``num_top_words`` features of largest
    weight, from heaviest to lightest, separated by spaces.

    Parameters
    ----------
    model : object
        Anything exposing ``components_`` as an iterable of arrays that
        support ``argsort()``.
    feature_names : sequence of str
        Feature name for each column index of ``components_``.
    num_top_words : int
        How many words to print per topic.

    Returns
    -------
    None
        Output goes to stdout only.
    """
    for topic_idx, weights in enumerate(model.components_):
        print("Topic %d:" % (topic_idx))
        # argsort is ascending, so reverse it and keep the leading entries.
        heaviest_first = weights.argsort()[::-1][:num_top_words]
        top_words = [feature_names[i] for i in heaviest_first]
        print(" ".join(top_words))

In [25]:
# Build the stop-word set and run every writing sample through the
# tokenize -> stop-word/punctuation filter -> WordNet lemmatize pipeline.
custom_stop_words = []
final_stop_words = get_stop_words(custom_stop_words)
observed_tokens = text_tokenization_pipeline(df['writing_sample'],stop_words=final_stop_words, remove_punc=True, tokenizer='wordnet')

# CountVectorizer takes one string per document, so re-join the tokens.
documents = [' '.join(doc) for doc in observed_tokens]

# Raw term counts (LDA is fitted on counts), vocabulary capped at 1000 terms.
count_vect = CountVectorizer(max_features=1000)
tf_matrix = count_vect.fit_transform(documents)

# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); use whichever the installed version provides and
# keep the result a plain list either way.
if hasattr(count_vect, 'get_feature_names_out'):
    tf_feature_names = list(count_vect.get_feature_names_out())
else:
    tf_feature_names = count_vect.get_feature_names()

num_topics = 5
lda = LatentDirichletAllocation(n_components=num_topics, learning_offset =50., verbose=1,
                                doc_topic_prior=0.9, topic_word_prior= 0.9,
                                n_jobs=-1, learning_method = 'online',
                                random_state=0)  # fixed seed so topics are reproducible

lda.fit(tf_matrix)

num_top_words = 10
# display_topics prints the topics itself and returns None, so it must not be
# wrapped in print() (that emitted a stray "None" after the last topic).
display_topics(lda, tf_feature_names, num_top_words)

iteration: 1 of max_iter: 10
iteration: 2 of max_iter: 10
iteration: 3 of max_iter: 10
iteration: 4 of max_iter: 10
iteration: 5 of max_iter: 10
iteration: 6 of max_iter: 10
iteration: 7 of max_iter: 10
iteration: 8 of max_iter: 10
iteration: 9 of max_iter: 10
iteration: 10 of max_iter: 10
Topic 0:
therapy life help client relationship work experience family people together
Topic 1:
therapy life work help relationship client experience therapist family people
Topic 2:
life feel work want feeling help like emotion together people
Topic 3:
life help therapy work client change need experience people relationship
Topic 4:
life experience therapy help work relationship feel client feeling one
None


In [48]:
# tf_matrix is a SciPy sparse matrix. Slicing it works, but the sliced result
# still displays only as a sparse-matrix summary, so convert the slice to a
# dense matrix to see the actual counts. Slice first so that only the 10x10
# corner is densified rather than the whole document-term matrix.
tf_matrix[0:10, 0:10].todense()

matrix([[0, 0, 0, 0, 0, 1, 0, 0, 0, 0],
        [1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 1, 0, 0, 2, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [1, 0, 1, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

In [36]:
# Reminder: vocabulary_ maps each term to its column INDEX in the
# document-term matrix; the values are not occurrence counts.
count_vect.vocabulary_

{'master': 560,
 'degree': 229,
 'education': 276,
 'social': 803,
 'work': 987,
 'solution': 806,
 'provide': 695,
 'tool': 898,
 'enhance': 301,
 'client': 147,
 'life': 525,
 'believe': 93,
 'one': 623,
 'technique': 874,
 'fit': 358,
 'people': 655,
 'approach': 58,
 'everyone': 319,
 'unique': 938,
 'need': 604,
 'exploring': 333,
 'bring': 110,
 'self': 774,
 'acceptance': 5,
 'growth': 401,
 'change': 135,
 'include': 463,
 'anything': 56,
 'imagery': 456,
 'music': 596,
 'making': 554,
 'homework': 440,
 'grow': 400,
 'looking': 542,
 'family': 345,
 'hold': 436,
 'let': 522,
 'go': 389,
 'establish': 310,
 'reach': 716,
 'goal': 390,
 'support': 857,
 'guidance': 402,
 'together': 897,
 'effective': 278,
 'also': 42,
 'many': 557,
 'needed': 605,
 'resource': 744,
 'community': 169,
 'help': 428,
 'worked': 988,
 'various': 951,
 'problem': 688,
 'supported': 858,
 'process': 689,
 'insurance': 479,
 'blue': 100,
 'mountain': 591,
 'health': 420,
 'plan': 670,
 'behavioral': 9

In [50]:
# List the column names available on the therapists frame.
df.columns

Index(['therapist_id', 'first_name', 'last_name', 'street',
       'primary_credential', 'license_status', 'website', 'info_source',
       'creation_date', 'verified', 'license_num', 'license_state',
       'years_in_practice', 'school', 'year_graduated', 'writing_sample',
       'full_name', 'html_source_code', 'phone', 'state', 'city', 'zip_code'],
      dtype='object')

In [58]:
# Number of distinct credential strings; dropna=False keeps a missing value
# counted as its own entry, matching len(Series.unique()).
df['primary_credential'].nunique(dropna=False)

138

In [64]:
# Print every distinct value of the primary_credential column, one per line.
creds = df['primary_credential'].unique()
# NOTE(review): sorting is left disabled — presumably because the values mix
# None with strings (see the printed list), which cannot be ordered; confirm.
#creds.sort()
for c in creds:
    print(c)

Licensed Clinical Social Worker - CSW00992525
Psychologist - 3123 
LCSW - CSW.09925366 
LPC - 0013676
None
LMFT
Licensed Professional Counselor Candidate - 08122023
Licensed Professional Counselor
Licensed Clinical Social Worker - 992133
Licensed Professional Counselor - 0015347
Licensed Professional Counselor - LPC-4858
Registered NLC CO
Registered Psychotherapist
Marriage and Family Therapist Candidate
Licensed Professional Counselor - LPC.0015594
LMFT - 0001420
Licensed Social Worker - LSW.0009922613
Clinical Psychologist
LPCC - LPCC.0014990
Licensed Professional Counselor - LPC.0016002
LPCC
LPC - LPC.0012737
LCSW - 1276
Licensed Professional Counselor Candidate - LPCC.0015516
Licensed Professional Counselor - 4518
MAMFT and Certified Play Therapist - #103286
LPC
Licensed Marriage & Family Therapist - 0001295
Professional Counselor - 0014629
Psychologist - PSY.0002890
LPC - 5592
Licensed Professional Counselor (LPC) - 0013388
Psychologist
Psychologist - 3279
Licensed Marriage and Fa

In [54]:
# Character length of every writing sample, sorted ascending, to eyeball the
# spread of document sizes.
writing_lengths = sorted(map(len, df['writing_sample']))
writing_lengths

[205,
 241,
 248,
 273,
 322,
 440,
 460,
 460,
 491,
 518,
 536,
 577,
 593,
 619,
 653,
 681,
 682,
 682,
 703,
 794,
 807,
 809,
 885,
 898,
 905,
 911,
 958,
 987,
 1015,
 1041,
 1059,
 1100,
 1123,
 1124,
 1144,
 1176,
 1191,
 1205,
 1206,
 1216,
 1224,
 1228,
 1242,
 1250,
 1330,
 1340,
 1350,
 1358,
 1385,
 1398,
 1401,
 1405,
 1406,
 1431,
 1440,
 1448,
 1452,
 1468,
 1482,
 1489,
 1489,
 1490,
 1493,
 1498,
 1527,
 1533,
 1562,
 1563,
 1572,
 1576,
 1577,
 1613,
 1616,
 1619,
 1627,
 1636,
 1641,
 1652,
 1654,
 1681,
 1691,
 1691,
 1699,
 1699,
 1723,
 1726,
 1733,
 1738,
 1754,
 1768,
 1769,
 1794,
 1802,
 1815,
 1863,
 1895,
 1951,
 1982,
 2011,
 2014,
 2017,
 2026,
 2037,
 2052,
 2066,
 2072,
 2073,
 2078,
 2088,
 2119,
 2138,
 2147,
 2147,
 2198,
 2235,
 2247,
 2259,
 2299,
 2299,
 2310,
 2352,
 2383,
 2387,
 2402,
 2421,
 2422,
 2435,
 2490,
 2499,
 2530,
 2531,
 2553,
 2556,
 2590,
 2613,
 2702,
 2708,
 2713,
 2730,
 2786,
 2789,
 2816,
 2854,
 2862,
 2866,
 2885,
 2927,