In [25]:
from collections import Counter
import itertools as it
import pandas as pd
import os, re

# Directory that contains the source CSV.
# NOTE(review): this is an absolute, machine-specific Windows path; anyone else
# running the notebook has to edit it before the first cell will work.
data_dir = "C:\\Users\\Zjaffen1\\Desktop\\Projects\\data\\rasa messages"

# File name of the CSV to analyse. It is later opened by bare name, so it is
# resolved against the working directory set by os.chdir below.
csv = "525_30day_activate_phone_trimmed.csv"

# Name of the results sub-directory created inside data_dir.
dir_name = "activate_phone_secondary_intents"
# Make data_dir the working directory so the relative names above resolve there.
os.chdir(data_dir)
# exist_ok=True keeps a re-run from failing when the directory already exists.
os.makedirs(dir_name, exist_ok=True)

In [26]:
def get_second_intent(intent_list):
    """Return the name portion of the second entry in ``intent_list``.

    The name is taken as the text between character offset 6 and the first
    comma of the entry (presumably entries look like
    ``(name=<intent>, confidence=<value>)`` -- TODO confirm against the CSV).

    Returns ``None`` when the list has fewer than two entries.
    """
    if len(intent_list) <= 1:
        return None

    runner_up = intent_list[1]
    comma_at = runner_up.find(",")
    return runner_up[6:comma_at]

def get_second_confidence(intent_list):
    """Return the confidence of the second entry in ``intent_list`` as a float.

    The number is read from 13 characters past the first comma up to the first
    closing parenthesis (presumably skipping ``, confidence=`` -- TODO confirm
    against the CSV).

    Returns ``None`` when the list has fewer than two entries.
    """
    if len(intent_list) <= 1:
        return None

    runner_up = intent_list[1]
    value_from = runner_up.find(",") + 13
    value_to = runner_up.find(")")
    return float(runner_up[value_from:value_to])



In [27]:
# dataframe for CSV
messages_df = pd.read_csv(csv)

# Turn the intents string into a list with one "(...)" chunk per ranked intent.
# The pattern must be a raw string: in a plain literal "\(" is an invalid escape
# sequence, which Python reports with a warning.
messages_df['intent_ranking'] = messages_df['intent_ranking'].apply(lambda x: re.findall(r'\(.*?\)', x))

# new df columns for secondary intent and confidence
# (both helpers return None for rows that have fewer than two ranked intents)
messages_df['secondary_intent'] = messages_df['intent_ranking'].apply(get_second_intent)
messages_df['secondary_confidence'] = messages_df['intent_ranking'].apply(get_second_confidence)

# check number of messages for intent
# print(len(messages_df))

In [28]:
# Distinct secondary intents. Rows without a runner-up intent hold None/NaN;
# they are dropped here because they cannot be turned into a file name below.
intents = messages_df['secondary_intent'].dropna().unique()

# Anchor on the absolute results directory so that re-running this cell does
# not try to descend into a nested copy of dir_name.
os.chdir(os.path.join(data_dir, dir_name))
os.makedirs('intent_texts', exist_ok=True)

# For each intent: record (intent, count, mean confidence, median confidence)
# and export its messages, highest confidence first.
sec_intents_ranking = []
for intent in intents:
    intent_rows = messages_df[messages_df['secondary_intent'] == intent]
    confidences = intent_rows['secondary_confidence']
    sec_intents_ranking.append((intent, len(intent_rows), confidences.mean(), confidences.median()))

    # for csvs with messages
    messages_loc = os.path.join('intent_texts', intent + "_messages.csv")
    intent_messages = intent_rows[['input_text', 'secondary_confidence']]
    intent_messages = intent_messages.sort_values(by='secondary_confidence', ascending=False)
    intent_messages.to_csv(messages_loc, index=False, encoding='utf-8')

print("finished exporting secondary intents text")

finished exporting secondary intents text


## Rank the top secondary intents for the given intent CSV

In [29]:
# Order intents from most to least frequent (tuple slot 1 is the occurrence count).
sec_intents_ranking.sort(key=lambda entry: entry[1], reverse=True)

# Tabulate the ranking and write it to the current working directory.
ranking_columns = ['Intent', 'Num Occurences', 'Avg Confidence', 'Median Confidence']
s_intents_df = pd.DataFrame(sec_intents_ranking, columns=ranking_columns)

ranked_name = 'ranked_' + dir_name + ".csv"
s_intents_df.to_csv(ranked_name, index=False, encoding='utf-8')

print("finished secondary intents rankings for " + csv)

finished secondary intents rankings for 525_30day_activate_phone_trimmed.csv


## Make lists of the top words for each secondary intent to find the vocabulary that causes confusion

In [33]:
# Words to ignore when counting. Kept as a set, so duplicates collapse and
# membership checks are cheap.
stopwords = {
    # greetings and chat filler
    "i", "im", "hey", "hello", "hi",
    # pronouns
    "me", "my", "myself", "we", "our", "ours", "ourselves",
    "you", "your", "yours", "yourself", "yourselves",
    "he", "him", "his", "himself", "she", "her", "hers", "herself",
    "it", "its", "itself", "they", "them", "their", "theirs", "themselves",
    # question words and demonstratives
    "what", "which", "who", "whom", "this", "that", "these", "those",
    # auxiliary verbs
    "am", "is", "are", "was", "were", "be", "been", "being",
    "have", "has", "had", "having", "do", "u", "does", "did", "doing",
    # articles, conjunctions, prepositions
    "a", "an", "the", "and", "but", "if", "or", "because", "as", "until", "while",
    "of", "at", "by", "for", "with", "about", "against", "between", "into",
    "through", "during", "before", "after", "above", "below", "to", "from",
    "up", "down", "in", "out", "on", "off", "over", "under",
    # adverbs and quantifiers
    "again", "further", "then", "once", "here", "there", "when", "where",
    "why", "how", "all", "any", "both", "each", "few", "more", "most", "other",
    "some", "such", "no", "nor", "not", "only", "own", "same", "so", "than",
    "too", "very",
    # contraction leftovers and common chat words
    "s", "t", "can", "will", "just", "dont", "should", "now", "tmobile",
    "need", "like", "would", "get", "trying", "new", "know", "got", "use",
    "one", "go", "g", "cant", "please",
}

# maybe want more words to ignore? add them to this list
stopwords.update([])

In [34]:
# Folder holding the per-intent "<intent>_messages.csv" exports for word review.
texts_dir = data_dir + "\\" + dir_name + "\\intent_texts"

# Collect only the per-intent exports. The cell that consumes text_paths strips
# the 13-character "_messages.csv" suffix to recover the intent name and parses
# each file as CSV, so any other file found by os.walk would be mislabeled or
# fail to load.
text_paths = [
    os.path.join(dirpath, file)
    for dirpath, subdirs, files in os.walk(texts_dir)
    for file in files
    if file.endswith("_messages.csv")
]

In [35]:

messages_suffix = "_messages.csv"   # naming convention of the per-intent exports
max_word_len = 15                   # longer tokens are ignored
top_n_words = 15                    # how many words to keep per intent

# Build (intent label, [(word, count), ...]) for every per-intent export.
word_results = []
for filepath in text_paths:
    # File names are "<intent>_messages.csv"; strip the suffix to get the intent.
    intent = os.path.basename(filepath)[:-len(messages_suffix)] + ":"

    # Use a dedicated name so the main messages_df built earlier is not overwritten.
    intent_messages_df = pd.read_csv(filepath)

    # Empty messages come back from read_csv as NaN, which re.sub cannot process.
    input_texts = intent_messages_df['input_text'].dropna().astype(str).tolist()

    # Keep letters and spaces only, lowercase, then split into words.
    text_words = [re.sub('[^A-Z a-z]+', '', text).lower().split() for text in input_texts]
    words = [
        word
        for word in it.chain.from_iterable(text_words)
        if word not in stopwords and len(word) <= max_word_len
    ]

    # Skip intents with no remaining words here, instead of popping them from
    # the list afterwards: popping by ascending index shifts later positions
    # and removes the wrong entries.
    top_words = Counter(words).most_common(top_n_words)
    if top_words:
        word_results.append((intent, top_words))

# Intents whose most common word is the most frequent come first.
word_results.sort(key=lambda result: result[1][0][1], reverse=True)

# The with-block closes the file; no explicit close() is needed.
with open("intents_top_words.txt", "w", encoding="utf-8") as words_file:
    for intent, top_words in word_results:
        words_file.write(intent + '\n' + str(top_words) + '\n\n')
print("Done counting top words for " + csv)

Done counting top words for 525_30day_activate_phone_trimmed.csv
