In [3]:
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

#nltk.download('averaged_perceptron_tagger', 'punkt', 'stopwords')

import pandas as pd 
import os 
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score 
import string
from nltk.corpus import stopwords
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer,PorterStemmer
import re 
from tqdm import tqdm # for progress bar
import matplotlib.pyplot as plt
import random
plt.style.use('fivethirtyeight')

In [4]:
# Remember the starting directory so the last cell can switch back to it.
old_dir = os.getcwd()
# Folder holding the input workbook. The original location remains the default;
# set the QUESTION_CLUSTER_DATA_DIR environment variable to run the notebook
# on another machine without editing this cell.
data_dir = os.environ.get("QUESTION_CLUSTER_DATA_DIR", "S:/asanchez/Edward Jones Stuff")
os.chdir(data_dir)

In [5]:
# read in data into dataframe
# (relative path: resolved against the working directory chosen by the os.chdir cell above)
df = pd.read_excel('InsightExchange_AllQuestionText_misc_removed.xlsx')
# Keep only the 'QuestionText' column and drop repeated question texts.
# Despite its name, df_deduped is a pandas Series (single column), not a DataFrame.
df_deduped = df['QuestionText'].drop_duplicates()

In [6]:
# Preview the first rows of the de-duplicated question texts.
df_deduped.head()

0    Approximately how much time do you spend on th...
1    What is the primary reason that you have <u>no...
2    To what extent do you find these communication...
3    Please indicate how much you agree or disagree...
4    When searching for <b>your Edward Jones' Finan...
Name: QuestionText, dtype: object

In [7]:
# Register tqdm's progress_* methods on pandas objects.
tqdm.pandas()

# Shared NLP helpers used by preprocess().
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

# Extra noise tokens to drop on top of NLTK's English stop words.
# NOTE(review): preprocess() compares one \w+ token at a time and only keeps
# tokens longer than 2 characters, so the multi-word entry
# 'pfo flag previous pfo' and the 1-character '_' can never match — confirm
# whether the individual words were meant to be listed instead.
sw_list = ['nptrust', 'org', 'www', 'com', 'hellip', 'ndash', 'target_blank',
           'hrep', 'rsquo', 'ldquo', 'rdquo', 'pfo flag previous pfo',
           'toolplease', 'selectedtext', 'pageplease', 'esg', 'click', 'arrow',
           'please', 'select', 'apply', '_', 'blank']

# NLTK's list first, then the project-specific additions.
all_stopwords = stopwords.words('english') + sw_list

In [8]:
def preprocess(sentence, normalize=False):
    '''
    Clean one question text and return the surviving words joined by spaces.

    Steps, in order: cast to str and lowercase; drop the literal '{html}'
    marker; strip HTML tags, 'http...' URLs and digits; strip '!'-prefixed
    survey markup; tokenize on runs of word characters; drop tokens of
    2 characters or fewer and tokens listed in all_stopwords.

    Parameters
    ----------
    sentence : any
        Value to clean; it is converted with str() first.
    normalize : bool, default False
        False returns the filtered words unchanged, which is what this
        function has always returned. True additionally stems each word
        (PorterStemmer) and then lemmatizes the stem (WordNetLemmatizer).

    Returns
    -------
    str
        Space-separated tokens; '' when nothing survives the filters.
    '''
    text = str(sentence).lower().replace('{html}', "")
    text = re.sub(r'<.*?>', '', text)      # HTML tags
    text = re.sub(r'http\S+', '', text)    # URLs
    text = re.sub(r'[0-9]+', '', text)     # digits
    # Survey markup pattern, kept exactly as before but as a raw string so the
    # backslash classes are not treated as (invalid) string escapes.
    # NOTE(review): the text is already lowercase, so the '\W[A-Z]+\W'
    # alternative cannot match ASCII input — confirm whether it is still wanted.
    text = re.sub(r'![A-Za-z]+\W[A-Za-z]+\d*\w\w+\S[A-Za-z0-9]+!?|\W[A-Z]+\W', '', text)

    # Tokenizing on \w+ already discards punctuation, and it keeps underscores
    # so entries such as 'target_blank' can still be matched in all_stopwords;
    # no separate punctuation-stripping pass is needed.
    tokens = RegexpTokenizer(r'\w+').tokenize(text)
    filtered_words = [w for w in tokens if len(w) > 2 and w not in all_stopwords]

    if normalize:
        # Opt-in only: stemming/lemmatizing every token is comparatively slow
        # and changes the words that show up in downstream reports.
        filtered_words = [lemmatizer.lemmatize(stemmer.stem(w)) for w in filtered_words]
    #pos_words= [nltk.pos_tag(w) for w in lemma_words]
    return " ".join(filtered_words)

In [9]:
# Run every unique question text through preprocess(); the result is a Series
# that keeps df_deduped's index.
cleaned_docs = df_deduped.map(preprocess)

In [10]:
# Wrap the cleaned Series in a DataFrame (column 'QuestionText').
cleaned_docs_df = pd.DataFrame(cleaned_docs)

# Save only the rows that still contain text after cleaning.
cleaned_docs_df.loc[cleaned_docs_df['QuestionText'] != ''].to_csv('clean_test.csv')

# NOTE(review): this list is built from the *unfiltered* frame, so documents
# that became empty strings are still included and the list stays row-aligned
# with cleaned_docs_df — confirm that is intended before changing it.
cleaned_docs = cleaned_docs_df['QuestionText'].to_list()

In [11]:
# TF-IDF configuration for the cleaned question texts.
vectorizer = TfidfVectorizer(
    stop_words="english",  # scikit-learn's built-in English stop-word list
    ngram_range=(1, 3),    # unigrams, bigrams and trigrams
    min_df=5,              # ignore terms found in fewer than 5 documents
    max_df=0.8,            # ignore terms found in more than 80% of documents
    max_features=100,      # keep at most 100 terms in the vocabulary
    lowercase=True,        # lowercase the text before tokenizing
)

In [12]:
# pass a list of strings to the vectorizer
vectors = vectorizer.fit_transform(cleaned_docs)
# get_feature_names() was removed in scikit-learn 1.2. Use its replacement
# when available (converted to a plain list of str) and fall back to the old
# method on installations that predate get_feature_names_out().
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out().tolist()
else:
    feature_names = vectorizer.get_feature_names()

In [18]:
# Dense copy of the TF-IDF matrix, then a plain list of rows.
dense = vectors.todense()
denselist = dense.tolist()

# For each document, collect the feature names whose TF-IDF weight is positive.
all_keywords = [
    [feature_names[col] for col, weight in enumerate(row) if weight > 0]
    for row in denselist
]

# Clusters

In [19]:
# Disabled code: the elbow-method search below is wrapped in a string literal,
# so nothing in it runs; the cell only evaluates (and echoes) the string.
'''
# checking for optimal number of clusters 
Sum_of_squared_distances = []

K = range(1, 150)
# Calculate the inertia for the range of K values
for k in K:
    km = KMeans(n_clusters = k)
    km = km.fit(vectors)
    Sum_of_squared_distances.append(km.inertia_)

# plotting optimal number of clusters 
# total within-cluster sum of squares
plt.plot(K, Sum_of_squared_distances, 'bx')
plt.xlabel('k')
plt.ylabel('Sum of squared distances')
plt.title('Elbow Method for Optimal K')
plt.savefig('optimal_number_clusters.png')
plt.show()
'''

"\n# checking for optimal number of clusters \nSum_of_squared_distances = []\n\nK = range(1, 150)\n# Calculate the inertia for the range of K values\nfor k in K:\n    km = KMeans(n_clusters = k)\n    km = km.fit(vectors)\n    Sum_of_squared_distances.append(km.inertia_)\n\n# plotting optimal number of clusters \n# total within-cluster sum of squares\nplt.plot(K, Sum_of_squared_distances, 'bx')\nplt.xlabel('k')\nplt.ylabel('Sum of squared distances')\nplt.title('Elbow Method for Optimal K')\nplt.savefig('optimal_number_clusters.png')\nplt.show()\n"

In [23]:
# Number of clusters to fit.
true_k = 25
# random_state pins the k-means++ initialisation. With n_init = 1 there is a
# single initialisation, so without a seed every run (and every re-fit of this
# model on the same vectors) could produce a different clustering.
model = KMeans(n_clusters = true_k, init = "k-means++", max_iter = 100, n_init = 1, random_state = 42)
model.fit(vectors)

KMeans(max_iter=100, n_clusters=25, n_init=1)

In [24]:
# For every cluster centre, feature indices sorted from highest to lowest weight.
order_centroids = model.cluster_centers_.argsort()[:, ::-1]
# get_feature_names() was removed in scikit-learn 1.2. Use its replacement
# when available (converted to a plain list of str) and fall back to the old
# method on installations that predate get_feature_names_out().
if hasattr(vectorizer, "get_feature_names_out"):
    terms = vectorizer.get_feature_names_out().tolist()
else:
    terms = vectorizer.get_feature_names()

In [25]:
# Output folder named after the cluster count, e.g. "Results_25_Clusters".
folder = "Results_" + str(true_k) + '_Clusters'
# exist_ok avoids the FileExistsError that os.mkdir raised whenever the
# notebook was run again with the same cluster count.
os.makedirs(folder, exist_ok=True)
# Subsequent relative output paths are written inside this folder.
os.chdir(folder)

# Save results to text file 

In [26]:
results_name = "cluster_analysis_results_" + str(true_k)

# One section per cluster: a "Cluster <n>" header line, the first ten terms of
# that cluster's row in order_centroids (one per line, each with a leading
# space), then two blank lines.
with open(results_name + ".txt", "w", encoding = "utf-8") as f:
    for i in range(true_k):
        top_terms = [' %s' % terms[ind] for ind in order_centroids[i, :10]]
        section = [f"Cluster {i}"] + top_terms + ["", ""]
        f.write("\n".join(section) + "\n")

# Plot Results 

In [27]:
import matplotlib.pyplot as plt 
from sklearn.decomposition import PCA
import random 

In [28]:
def rand_color_list(k):
    """Return a list of k random colours as '#RRGGBB' strings (uppercase hex).

    Each colour draws three values from random.randint(0, 255), in
    red, green, blue order, using the module-level random generator.
    """
    def channel():
        return random.randint(0, 255)

    return ['#%02X%02X%02X' % (channel(), channel(), channel()) for _ in range(k)]

In [29]:
# Reuse the labels of the model that is already fitted. fit_predict() would
# run k-means again, and a fresh fit can yield different clusters than the
# centres already used for order_centroids and the written report.
kmean_indices = model.labels_
# Project the TF-IDF vectors onto two principal components for plotting.
pca = PCA(n_components = 2)
# One random colour per cluster.
colors = rand_color_list(true_k)
scatter_plot_points = pca.fit_transform(vectors.toarray())

In [None]:
# 2-D coordinates of every document.
x_axis = [o[0] for o in scatter_plot_points]
y_axis = [o[1] for o in scatter_plot_points]

# cen_x / cen_y were never defined, so the centroid scatter raised NameError.
# Project the fitted cluster centres with the same PCA used for the documents
# so they land in the same 2-D space.
centroid_points = pca.transform(model.cluster_centers_)
cen_x = centroid_points[:, 0]
cen_y = centroid_points[:, 1]

fig, ax = plt.subplots(figsize=(50,50))

# Documents, coloured by their cluster label.
ax.scatter(x_axis,
           y_axis,
           c = [colors[d] for d in kmean_indices],
           s=500)

# Cluster centres, one triangle per cluster in that cluster's colour.
ax.scatter(cen_x, cen_y, marker='^', c=colors, s=70)


for i , txt in enumerate(cleaned_docs):
    ax.annotate(txt[0:10], # number of characters to print in graph for each dot
                (x_axis[i], y_axis[i]))

ax.set_title(str(true_k) + ' Cluster')
plot_name = 'cluster_plot_' + str(true_k)
plt.savefig(plot_name+'.png')
plt.show()



# Create csv with questions and clusters 

In [None]:
predictions = model.predict(vectors)
# The model was fitted on one vector per row of cleaned_docs_df (blank rows
# included), so model.labels_ lines up with that full frame. Attach the labels
# first and drop the blank rows afterwards; slicing labels_[1:] onto the
# already-filtered rows shifted the labels relative to their questions.
assert len(model.labels_) == len(cleaned_docs_df), \
    "cluster labels and cleaned_docs_df have different lengths"
# assign() returns a new frame, avoiding assignment on a filtered slice.
labelled_docs = cleaned_docs_df.assign(**{'class': model.labels_})
cleaned_docs = labelled_docs[labelled_docs['QuestionText'] != '']


In [None]:
# Display the cleaned questions together with their 'class' (cluster) column.
cleaned_docs

In [None]:
# Write the labelled questions to 'Questions_<true_k>_clusters.csv' in the
# current working directory (the results folder, if the os.chdir(folder) cell ran).
cleaned_docs.to_csv('Questions_'+ str(true_k)+'_clusters.csv')

In [None]:
# Switch back to the directory the notebook was started from.
os.chdir(old_dir)