# Unsupervised Clustering of Emails Dataset

On the first run, you will need to generate embeddings with a GPU instance. On later runs, set the `make_embeddings` variable in the cell below to `False`; you can then use a CPU instance that loads the pre-existing embeddings.

When creating embeddings:
- Instance: g4dn.xlarge
- Image: pytorch 2.0.0 python 3.10 GPU optimized

When loading embeddings:
- Instance: ml.t3.large
- Image: pytorch 2.0.0 python 3.10 CPU optimized

In [None]:
# True: compute the embeddings and save them; False: load the saved pickle files.
make_embeddings = False
# The GPU is only requested when embeddings are being generated.
gpu_available = make_embeddings
# True: run the clustering and save the results; False: load saved clusters and models.
make_clusters = False

In [None]:
%%capture
!pip install notebook ipykernel pandas python-dotenv sentence_transformers accelerate==0.20.3 scikit-learn seaborn bertopic ipywidgets

### Load documents / embeddings

In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook
import os
import pickle
import pathlib
import configparser

In [None]:
# Constants
config = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open, which would only
# surface later as a confusing KeyError on the first section lookup.
if not config.read('config.ini'):
    raise FileNotFoundError("Could not read 'config.ini' from the working directory")

# Encoding used for every CSV read and written by this notebook
ENCODING = config['global']['ENCODING']
# Input file with the question/answer pairs
in_path = config['make_pairs']['OUT_FILE']
# Output locations for the clustered emails, summaries, models and embeddings
out_path_emails = config['cluster_emails']['OUT_FILE_EMAILS']
out_path_question_clusters = config['cluster_emails']['OUT_FILE_QUESTION_CLUSTERS']
out_path_answer_clusters = config['cluster_emails']['OUT_FILE_ANSWER_CLUSTERS']
out_path_model = config['cluster_emails']['OUT_PATH_MODEL']
out_path_embeddings = config['cluster_emails']['OUT_PATH_EMBEDDINGS']

In [None]:
def from_csv(in_path):
    """
    Read the question/answer pairs from the CSV file at `in_path`.

    Returns a tuple of three numpy arrays: the questions, the answers, and
    a combined text per row that joins the question and its answer.
    """
    wanted_columns = ['conversation', 'turn', 'question', 'answer']
    frame = pd.read_csv(in_path, encoding=ENCODING, index_col=False,
                        usecols=wanted_columns)

    questions = np.array(frame['question'].tolist())
    answers = np.array(frame['answer'].tolist())

    # One combined document per row, pairing each question with its answer
    combined = np.array([
        f'Question: {question}\n\nAnswer: {answer}'
        for question, answer in zip(questions, answers)
    ])

    return questions, answers, combined
        
        
# Load the emails once; `texts` gives access to each collection by name
questions, answers, combined = from_csv(in_path)
texts = {"questions": questions, "answers": answers, "combined": combined}

In [None]:
# Calculate the embeddings
from sentence_transformers import SentenceTransformer

# Lists of embeddings
# Names and texts are paired by position when the embeddings are computed
embedding_names = ['questions', 'answers', 'combined']
embedding_texts = [questions, answers, combined]
# Name of the sentence-transformers model used to encode the texts
model_name = "all-mpnet-base-v2"

def create_embeddings():
    """
    Encode every text collection and save each result as a pickle file.

    Returns a dict mapping each embedding name to its encoded vectors.
    """
    # Load the base sentence embedding model on the requested device
    if gpu_available:
        device = 'cuda'
    else:
        device = 'cpu'
    encoder = SentenceTransformer(model_name, device=device)

    os.makedirs(out_path_embeddings, exist_ok=True)

    results = {}

    ### Create dense vectors
    for name, content in zip(embedding_names, embedding_texts):
        print(f'Computing {name} embeddings')
        vectors = encoder.encode(content)
        results[name] = vectors

        print(f'Saving {name} embeddings to directory')
        target = os.path.join(out_path_embeddings, f'{name}.pkl')
        with open(target, "wb") as f:
            pickle.dump({'embeddings': vectors}, f, protocol=pickle.HIGHEST_PROTOCOL)

    return results
            
def load_embeddings():
    """
    Read every '*.pkl' file in the embeddings directory.

    Returns a dict keyed by file stem, holding the value stored under the
    'embeddings' key of each pickle.
    """
    loaded = {}

    for pkl_path in pathlib.Path(out_path_embeddings).glob('*.pkl'):
        # NOTE(review): pickle.load can execute arbitrary code; only point
        # this at files produced by this notebook.
        with open(pkl_path, "rb") as f:
            payload = pickle.load(f)

        name = pkl_path.stem
        loaded[name] = payload['embeddings']
        print(f'Loaded embeddings {name}')

    return loaded

In [None]:
# Creates or loads embeddings, depending on the options
embeddings = None
embeddings = create_embeddings() if make_embeddings else load_embeddings()

### Cluster documents

In [None]:
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired
from bertopic.vectorizers import ClassTfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import KMeans
from hdbscan import HDBSCAN
import umap.umap_ as umap

def cluster_bertopic(texts, embeds):
    """
    Cluster `texts` into topics with BERTopic using precomputed embeddings.

    Fits the model, reduces the number of topics, reassigns outlier
    documents to topics, and refreshes the topic representations.

    Returns a (df, topic_model) tuple, where df has one row per text with
    a 'text' column and a 'label' column holding the assigned topic.
    """
    # Use vectorizer for topic representations that ignores stop words and includes 2-grams
    vectorizer_model = CountVectorizer(stop_words="english", min_df=2, ngram_range=(1, 2))
    ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)

    # NOTE(review): no custom clustering model is passed, so BERTopic uses its default one
    topic_model = BERTopic(vectorizer_model=vectorizer_model, ctfidf_model=ctfidf_model)
    topic_model.fit_transform(texts, embeds)

    # Reduce the number of topics to at most 120
    topic_model.reduce_topics(texts, nr_topics=120)
    topics = topic_model.get_document_info(texts)["Topic"]
    topic_model.update_topics(texts, topics=topics)

    # Reduce outliers from HDBSCAN
    topics = topic_model.reduce_outliers(texts, topics, strategy="embeddings", embeddings=embeds)
    # Same private attribute that load_bertopic_model and clusters_bar_chart
    # reset; the previous spelling `outliers_` only created an unused attribute
    topic_model._outliers = 0

    # Update topics after reduction
    topic_model.update_topics(texts, topics=topics)

    df = pd.DataFrame(texts, columns=['text'])
    df["label"] = topics
    return df, topic_model

cluster = cluster_bertopic

In [None]:
# Placeholders that the clustering / loading branch further down fills in
combine_clusters = None
q_clusters = a_clusters = q_model = a_model = c_model = None

def save_bertopic_model(model, suffix):
    """
    Save a BERTopic model in safetensors format under out_path_model/suffix.
    """
    target_dir = os.path.join(out_path_model, suffix)
    model.save(
        target_dir,
        serialization="safetensors",
        save_ctfidf=True,
        save_embedding_model=f"sentence-transformers/{model_name}",
    )

def load_bertopic_model(suffix):
    """
    Load the BERTopic model saved under out_path_model/suffix.
    """
    loaded = BERTopic.load(os.path.join(out_path_model, suffix))
    # Zero BERTopic's private `_outliers` attribute on the loaded model
    loaded._outliers = 0
    return loaded

if make_clusters:
    print("Clustering questions")
    q_clusters, q_model = cluster(questions, embeddings["questions"])
    print("Clustering answers")
    a_clusters, a_model = cluster(answers, embeddings["answers"])
    print("Clustering combined")
    # The combined embeddings must be paired with the combined texts;
    # passing the answers here mismatched documents and embeddings
    c_clusters, c_model = cluster(combined, embeddings["combined"])
    combine_clusters = pd.merge(q_clusters, a_clusters, left_index=True, right_index=True, suffixes=('_q', '_a'))
    combine_clusters["label_c"] = c_clusters["label"]
    combine_clusters.to_csv(out_path_emails, encoding=ENCODING)

    # Save the models
    save_bertopic_model(q_model, "q_model_base")
    save_bertopic_model(a_model, "a_model_base")
    save_bertopic_model(c_model, "c_model_base")
else:
    print("Loading emails with precomputed clusters")
    combine_clusters = pd.read_csv(out_path_emails, encoding=ENCODING, index_col = False)
    q_model = load_bertopic_model("q_model_base")
    a_model = load_bertopic_model("a_model_base")
    c_model = load_bertopic_model("c_model_base")

combine_clusters.head()

### Define some utility functions for the clusters

In [None]:
import matplotlib.pyplot as plt
q_grouped = a_grouped = None

def update_topics():
    """
    Updates the df of topics with new topics after a merge

    For each of the question, answer and combined models, the per-document
    topics are written to the matching label column of combine_clusters and
    passed back to the model's update_topics. The question and answer
    groupings are then rebuilt.
    """
    global combine_clusters, q_grouped, a_grouped

    model_inputs = (
        (q_model, questions, 'label_q'),
        (a_model, answers, 'label_a'),
        (c_model, combined, 'label_c'),
    )

    for model, docs, column in model_inputs:
        # Fetch the document info once and reuse it for both steps
        doc_topics = model.get_document_info(docs)["Topic"]
        combine_clusters[column] = doc_topics
        model.update_topics(docs, topics=list(doc_topics))

    q_grouped = combine_clusters.groupby('label_q')
    a_grouped = combine_clusters.groupby('label_a')

def display_qs(q_cat):
    """
    Print every question in question category `q_cat`, each preceded by a
    separator line
    """
    separator = "\n------------------------------------------------------------\n"

    for question in q_grouped.get_group(q_cat)["text_q"]:
        print(separator)
        print(question.strip())
        
def display_as(a_cat):
    """
    Print every answer in answer category `a_cat`, each preceded by a
    separator line
    """
    separator = "\n------------------------------------------------------------\n"

    for answer in a_grouped.get_group(a_cat)["text_a"]:
        print(separator)
        print(answer.strip())
        
        
def recalc_representative_docs(model, texts):
    """
    Rebuild the model's representative docs from `texts` and the model's
    current topic assignments
    """
    n_docs = len(texts)
    doc_columns = {
        "Document": texts,
        "Topic": model.topics_,
        "Image": None,
        "ID": range(n_docs),
    }
    model._save_representative_docs(pd.DataFrame(doc_columns))
        
def clusters_bar_chart(model, fontsize = 12):
    """
    Plot, save and show a bar chart of the number of emails per topic.

    Bars are labelled with the model's custom topic names, and the figure
    is written to 'emails_by_category.png' before it is shown.
    """
    model._outliers = 0
    info = model.get_topic_info()
    topic_ids = info["Topic"]

    plt.figure(figsize=(14, 8))
    plt.bar(topic_ids, info["Count"])
    plt.title('advising@ Emails by Category')
    plt.xlabel('Category')
    plt.ylabel('Number of Emails')
    plt.xticks(topic_ids, info["CustomName"], rotation=45, ha='right', fontsize=fontsize)
    plt.savefig('emails_by_category.png', dpi=300, bbox_inches="tight")
    plt.show()

def hierarchical_chart(model, docs, custom_labels=False):
    """
    Compute the topic hierarchy of `model` over `docs` and display it.

    `custom_labels` is forwarded to the model's visualize_hierarchy call.
    """
    hierarchy = model.hierarchical_topics(docs)
    figure = model.visualize_hierarchy(hierarchical_topics=hierarchy, custom_labels=custom_labels)
    display(figure)
    
update_topics()

### Visualize Question Categories

In [None]:
q_model.get_topic_info()

In [None]:
# Show a dimension-reduced plot of topic similarity
q_model.visualize_topics()

In [None]:
hierarchical_chart(q_model,questions)

#### Merge question topics
Manually merge topics that appear to be the same

In [None]:
topics_to_merge = [
    [49, 46], # internal from other depts
    [94, 18], # graduation check, missing requirements for graduation
    [56, 26, 86, 36], # grab bag of degree requirements clarifications
    [71, 57], # applying for graduation
    [63, 81, 34, 72], # degree navigator issues
    [114, 15, 45], # credit/d/fail
    [48, 88], # EOSC
    [76, 80, 115, 87, 107, 29, 110, 51, 42, 100, 14, 65, 4, 73], # requesting help with registration / waitlist
    [96], # MATH prerequisites / retake / registration
    [1], # failed course / retaking failed course
    [102, 84, 93, 5, 117], # transfer credits
    [13, 61], # transferring AP / IB credits
    [38, 31], # BIOL course prerequisites / requirements
    [113, 47, 104], # changing major / specialization
    [85, 22, 116], # help with specialization application / options
    [82], # honours programs
    [33, 10], # full-time status / course load / credit limit
    [66, 8], # transferring to faculty of science
    [79, 91], # general inquiries about program / degree offerings
    [21, 99], # high school / first year physics requirements
    [95, 111], # calculus 12 requirement
    [41, 44, 60], # requesting appointment / questions about zoom advising
    [6, 59, 9], # registration is blocked
    [55], # other requests regarding registration
    [62, 52, 17], # dropping a course
    [77, 24], # academic concession for illness
    [118], # other difficult situations requiring advising
    [50], # follow-ups and form submissions
    [69, 68], # requesting letter / permission / approval
    [35], # appeals
    [101, 37, 25, 7, 67, 28], # deferred exams
    [16, 58], # other academic concessions
]

q_model.merge_topics(questions, topics_to_merge)
update_topics()

In [None]:
# Custom labels for the question topics, keyed by topic id after the first merge
topic_labels = {
    61: "first year registration webinar",
    60: "official transcript",
    53: "GPA rankings / awards",
    63: "Dean's honour list",
    62: "diploma 'with distinction'",
    19: "graduation check / missing graduation requirements",
    30: "applying for graduation",
    59: "withdrawing graduation application",
    49: "minors / credit counting with minors",
    24: "applying for a minor",
    46: "dropping a minor",
    58: "minor course change",
    57: "academic calendar / degree requirements year version",
    9: "degree navigator issues",
    15: "credit / d / fail",
    55: "checking if course has science credit",
    37: "arts requirement",
    13: "communication requirement",
    48: "breadth requirement",
    32: "EOSC course / degree issues",
    51: "MATH prerequisites / retake / registration",
    0: "requesting help with registration / waitlist",
    6: "failed a course / retaking a failed course",
    2: "transfer credits",
    23: "transferring AP / IB credits",
    17: "BIOL course prerequisites / requirements",
    36: "BIOL major requirements",
    22: "changing major or specialization",
    14: "help with specialization application / options",
    43: "honours programs",
    44: "co-op program",
    56: "registration date / time",
    1: "year promotion requirements",
    12: "full-time status / course load / credit limit",
    33: "academic leave / time off",
    39: "readmission",
    29: "double major / dual degree",
    11: "major in computer science",
    21: "general inquiries about program / degree offerings",
    7: "transferring to faculty of science",
    45: "second degree program",
    54: "neuroscience program",
    25: "high school / first year physics requirements",
    47: "foundational requirement",
    52: "calculus 12 requirement",
    16: "requesting appointment / questions about zoom advising",
    38: "various registration issues",
    4: "registration is blocked",
    64: "distance education",
    34: "CPSC course requirements",
    8: "dropping a course",
    10: "study permits / unable to return to campus",
    31: "requesting an online exam",
    18: "academic concessions for illness",
    35: "withdrawing from UBC",
    26: "withdrawing from a course",
    40: "follow-ups / submitting forms",
    28: "requesting letter / signature / approval",
    41: "appeals",
    3: "deferred exams",
    20: "other academic concessions",
}

q_model.set_topic_labels(topic_labels)

#### Visualize question topics after merge

In [None]:
# Save the merged model as well
if make_clusters:
    save_bertopic_model(q_model, "q_model_merged")

In [None]:
hierarchical_chart(q_model, questions, True)

In [None]:
clusters_bar_chart(q_model, fontsize = 8)

#### Apply another merge step
There are many small categories above. To get a clearer overview of the categories, apply a second merging step.

In [None]:
topics_to_merge = [
    [60, 53, 63, 62], # "transcripts, scholarships, rankings"
    [1, 52, 47, 13, 48, 55, 37, 57, 25], # "faculty of science requirements"
    [34, 6, 51, 17, 36, 32, 11], # "degree-specific requirements"
    [59, 30, 19], # "graduation"
    [39, 33, 35], # "academic leave, withdrawal, and readmission"
    [64, 10], # "degree logistics, study permits, distance education",
    [31, 3, 20, 18], # "academic concessions",
    [26, 8, 0, 38, 4, 56], # "course registration and withdrawal"
    [61, 16, 28, 40, 41], # "advising appointments and forms"
    [12, 45, 21, 14, 29, 43, 54], # degree options and planning
    [49, 24, 46, 58], # minors
    [42, 50, 5, 27], # "uncategorized"
    [44], # "co-op"
    [23, 7, 2, 22], # "transferring and transfer credits"
    [9], # "issues with degree navigator"
    [15], # "credit/d/fail"
]

q_model.merge_topics(questions, topics_to_merge)
update_topics()

In [None]:
topic_labels = {
    14: "transcripts, scholarships, rankings",
    1: "faculty of science requirements",
    3: "degree-specific requirements",
    8: "graduation",
    12: "academic leave, withdrawal, and readmission",
    9: "degree logistics, study permits, distance education",
    4: "academic concessions",
    0: "course registration and withdrawal",
    7: "advising appointments, forms, and appeals",
    5: "degree options and planning",
    11: "minors",
    6: "uncategorized",
    15: "co-op",
    2: "transferring and transfer credits",
    10: "issues with degree navigator",
    13: "credit/d/fail",
}

q_model.set_topic_labels(topic_labels)

In [None]:
if make_clusters:
    save_bertopic_model(q_model, "q_model_merged_2")

In [None]:
hierarchical_chart(q_model, questions, True)

In [None]:
clusters_bar_chart(q_model)

## Evaluate answer categories

In [None]:
a_model.visualize_topics()

In [None]:
hierarchical_chart(a_model, answers)

#### Merge answer categories

In [None]:
# Groups of answer topic ids to merge; each topic id appears in at most one group
topics_to_merge = [
    [14, 15], # drop-in / resolved in drop-in
    [81, 20], # applying for a minor
    [6, 4], # specialization application
    [74, 59, 7], # specific to degree-navigator
    [93, 61], # alternate format assessment
    [5, 10], # deferred exams
    [53, 30, 29, 32], # course withdrawal requests
    [84, 97], # course mode of delivery
    [50, 43], # credit limits
    [45, 18, 25], # graduation application / eligibility
    [57], # off-cycle promotion
    [49, 35, 33, 51, 41], # administrative, undergoing review, acknowledging receipt, etc.
    [88, 63, 77, 36], # eligibilities, sessional eval, year standing
    [54, 19], # registration dates and blocked registration
    [62, 8], # concessions and registration issues for CHEM courses
    [55, 28, 11, 13, 24, 67, 26, 69, 39, 38], # help with registration
    [37, 66], # credit specifics and requirement replacements
    [89, 98], # calculus 12 requirement
    [85, 82, 87], # refer to enrolment services
    [95, 90, 96], # refer to another faculty
    [71, 46], # registration webinar
]

a_model.merge_topics(answers, topics_to_merge)
update_topics()

In [None]:
# Custom labels for the answer topics, keyed by topic id after the merge.
# Trailing comments are kept from the original; they appear to record
# pre-merge topic ids (TODO confirm).
topic_labels = {
    34 : "removed from minor", # 47
    16 : "applying for a minor", # 81
    40 : "minor course change form", # 58
    49 : "not permitted to repeat course for higher standing", # 73
    22 : "first year phys requirements", # 23
    18 : "upper-level credit requirements", # 16
    5 : "questions about BIOL courses", #2
    2 : "specialization application", # 6
    4 : "promotion requirements", # 1
    14 : "communication requirement", # 12
    6 : "transfer credit", # 3
    1 : "transferring faculty / admission", # 0
    26 : "second degree", # 31
    36 : "neuroscience program", # 52
    35 : "arts requirement", # 48
    24 : "credit specifics and requirement replacements", # 37
    11 : "degree navigator specifics",
    33 : "alternate format assessment",
    3 : "deferred exam request",
    25 : "academic concessions", # 27
    8 : "course drop/withdrawal request", # 53
    43 : "course conflict form", # 65
    45 : "switching course section", # 68
    44 : "course mode of delivery", # 84
    23 : "credit limits",
    42 : "appeal for third attempt", # 64
    58 : "appeal received", # 91
    50 : "honours appeal", # 76
    32 : "honours requirements", # 44
    56 : "rankings, honour list, distinction", # 86
    7 : "graduation application / eligibility", #
    38 : "off-cycle promotion",
    9 : "administrative, undergoing review, acknowledging receipt, etc.",
    21 : "credit/d/fail", # 22
    59 : "CMS packages / requirements", # 94
    17 : "eligibilities, sessional eval, year standing",
    15 : "registration dates and blocked registration",
    0 : "help with registration",
    46 : "calculus 12 requirement",
    30 : "refer to enrolment services",
    37 : "refer to another faculty", # 95, 90, 96
    48 : "advice for failed course", # 72
    28 : "changing specialization", # 34
    39 : "admission averages", # 56
    31 : "readmission", # 42
    13 : "academic leave / inability to return to campus", # 9
    19 : "refer to counselling services", # 17
    20 : "appointments / virtual lines", # 21
    27 : "distillation emails / registration webinars", # 71
    55 : "go global", # 83
    52 : "co-op", # 80
    57 : "course load for full-time status", # 92
    54 : "requesting a letter", # 79
    47 : "requesting student number", # 70
    41 : "refer to another department / faculty", # 60
    51 : "refer to arts advising", # 75
    53 : "refer to UBC-O advising", # 78
    10 : "resolved in drop-in",
    12 : "concessions and registration issues for CHEM courses",
    29 : "double / dual major"
}

a_model.set_topic_labels(topic_labels)

#### Visualize answer topics after merge

In [None]:
if make_clusters:
    # Save the merged *answer* model; saving q_model here stored the
    # question model under the answer model's name
    save_bertopic_model(a_model, "a_model_merged")

In [None]:
hierarchical_chart(a_model, answers, True)

In [None]:
clusters_bar_chart(a_model, fontsize=9)

## Calculate Correlation
Now that we have created the question and answer categories, we want to see how well they line up. What is the correlation between a question category and an answer category?

In [None]:
# Load the questions model from the first merge step
q_model = load_bertopic_model("q_model_merged")
update_topics()

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

"""
Convert the categorical labels label_q and label_a in the df to onehot features,
then make a correlation matrix between all label_q and label_a classes
"""
def make_onehot_corr_df(df):
    """
    Build a correlation matrix between question and answer categories.

    The label_q and label_a columns of df are one-hot encoded, and the
    correlations between the encoded columns are computed. The returned
    frame keeps the rows whose name contains 'label_q' and the columns
    whose name contains 'label_a'.
    """
    # onehot-encode the labels
    encoder = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
    preprocessor = ColumnTransformer(
        transformers=[("onehot", encoder, ["label_q", "label_a"])],
        remainder='passthrough',
    )
    preprocessor.set_output(transform="pandas")

    encoded = preprocessor.fit_transform(df)

    # compute correlation, then keep only question rows and answer columns
    full_corr = encoded.corr(numeric_only=True)
    answer_cols = [name for name in full_corr.columns if 'label_a' in name]
    question_rows = [name for name in full_corr.columns if 'label_q' in name]

    return full_corr.loc[question_rows, answer_cols]

onehot_corr = make_onehot_corr_df(combine_clusters)

Plot the correlation matrix for any correlations over a threshold

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

threshold = 0.3
# Correlations below the threshold become NaN so the heatmap leaves them blank
onehot_corr_filtered = onehot_corr.where(onehot_corr >= threshold)
plt.figure(figsize=(12, 8))
sns.heatmap(onehot_corr_filtered, cmap="Greens")

### Create a summary of categories
We will create some summary files to give more insight into the categories

In [None]:
import regex as re

pairs_df = None

"""
Convert a row name from the onehot df to class id
"""
def rn_to_id(row_name):
    """
    Convert a row or column name from the onehot df to its integer class id.

    The id is the first (optionally negative) integer found in the name.
    Raises ValueError if the name contains no integer.
    """
    # Raw string: "\-" and "\d" are invalid escapes in a normal string literal
    match = re.search(r"-?\d+", row_name)
    if match is None:
        raise ValueError(f"No class id found in {row_name!r}")
    return int(match.group())

"""
Pair each question category with the best answer category
"""
def find_best_corr_pairs(corr):
    """
    Pair each row of `corr` with the column holding its highest correlation.

    Returns a list of (row id, column id, correlation) tuples, one per row.
    """
    best_columns = corr.idxmax(axis=1)

    return [
        (rn_to_id(row_name), rn_to_id(col_name), corr.loc[row_name, col_name])
        for row_name, col_name in zip(corr.index, best_columns)
    ]

"""
Initialize a df summarizing question categories
"""
def init_qs_df():
    """
    Build a df with one row per question category.

    Each row holds the category id and label, the best-correlated answer
    category and its label, the number of questions, the fraction of those
    questions whose answer is in that answer category, the correlation,
    and the representative docs of the question topic.
    """
    # Look topics up by id: a positional lookup only matches the topic id
    # while the rows happen to be ordered 0, 1, 2, ... with no gaps
    q_topic_df = q_model.get_topic_info().set_index("Topic")

    cat_pairs = []

    for q_cat, a_cat, corr in find_best_corr_pairs(onehot_corr):
        q_df = q_grouped.get_group(q_cat)
        fraction_matching_a = len(q_df[q_df["label_a"] == a_cat]) / len(q_df)
        q_info = q_topic_df.loc[q_cat]
        cat_pairs.append({"q_cat": q_cat,
                          "q_label": q_info["CustomName"],
                          "a_cat": a_cat,
                          "a_label": a_model.custom_labels_[a_cat],
                          "n_q": len(q_df),
                          "fraction_q_with_a": fraction_matching_a,
                          "corr": corr,
                          "sample_docs": q_info["Representative_Docs"]})

    return pd.DataFrame(cat_pairs)
    
"""
Initialize a df summarizing answer categories
"""
def init_as_df():
    """
    Build a df with one row per answer category.

    Each row holds the category id, its custom label, the number of answers
    in the category, and the representative docs of the topic.
    """
    a_topic_df = a_model.get_topic_info()

    rows = []

    # Read label and docs from the same row as the topic id, instead of
    # using the topic id as a row position
    for a_cat, a_label, sample_docs in zip(a_topic_df["Topic"],
                                           a_topic_df["CustomName"],
                                           a_topic_df["Representative_Docs"]):
        rows.append({"a_cat": a_cat,
                     "a_label": a_label,
                     "n_a": len(a_grouped.get_group(a_cat)),
                     "sample_docs": sample_docs})

    return pd.DataFrame(rows)
    
"""
Scores the correlations by weighted average of best correlation by number of questions
"""
def score_corr(qs_df):
    """
    Return the average of the 'corr' column weighted by the 'n_q' column.
    """
    best_corrs = qs_df["corr"]
    question_counts = qs_df["n_q"]
    return np.average(best_corrs, weights=question_counts)
    
def save_df(df, name):
    """
    Write `df` as a CSV file at path `name` using the configured ENCODING.
    """
    df.to_csv(path_or_buf=name, encoding=ENCODING)

In [None]:
qs_df = init_qs_df()
as_df = init_as_df()

In [None]:
as_df = pd.read_csv("answer_categories.csv", encoding=ENCODING, index_col = 0)

In [None]:
qs_df = pd.read_csv("question_categories.csv", encoding=ENCODING, index_col = 0)

In [None]:
qs_df.head()

We can calculate the weighted average of correlations per category for an overall metric

In [None]:
score_corr(qs_df)

## Category analysis

Collect the most common urls from each answer category, and add them to the summary file

In [None]:
from collections import Counter
import requests

# url regex from https://www.geeksforgeeks.org/python-check-url-string/
URL_REGEX = r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))"

def find_most_common_links(cats_df, by_q = True, timeout = 10):
    """
    Finds the most common links in the answers for a given category
    Can group by a question category or an answer category

    For every category in cats_df, the 20 most frequent urls found in the
    answers are requested to resolve redirects, and the 10 most common
    resolved urls (with counts) are stored in the "common_urls" column.

    timeout is the number of seconds to wait for each url before it is
    skipped; urls that are invalid or cannot be fetched are ignored.
    """
    cat_column = "q_cat" if by_q else "a_cat"
    grouped = q_grouped if by_q else a_grouped

    for i, cat in enumerate(cats_df[cat_column]):
        # regex match every url mentioned in the answers of this category
        group_df = grouped.get_group(cat)
        urls = list(group_df["text_a"].str.extractall(URL_REGEX)[0])

        # resolve redirects and find most common urls
        final_counts = Counter()

        for url, count in Counter(urls).most_common(20):
            url = url if url.startswith('http') else ('http://' + url)
            try:
                # Without a timeout one unresponsive host would stall the whole run
                r = requests.get(url, timeout=timeout)
            except (requests.RequestException, ValueError):
                # ignore urls that are invalid or unreachable; catching only
                # these keeps Ctrl-C (KeyboardInterrupt) working
                continue

            final_counts[r.url] += count

        cats_df.loc[cats_df.index[i], "common_urls"] = [final_counts.most_common(10)]

In [None]:
find_most_common_links(as_df, False)

In [None]:
find_most_common_links(qs_df, True)

In [None]:
# Save the summary files
save_df(as_df, out_path_answer_clusters)
save_df(qs_df, out_path_question_clusters)

### Use an LLM to generate an FAQ
This section is experimental: it does not produce very promising results at the moment, but it could be improved. The idea is to pass the representative documents for a topic through an LLM to generate an FAQ for each question category.

In [None]:
%%capture
!pip install langchain boto3 tiktoken

In [None]:
from langchain.llms import SagemakerEndpoint
from langchain.llms.sagemaker_endpoint import LLMContentHandler
from typing import Dict
import json

endpoint_name = "meta-textgeneration-llama-2-7b-f-2023-10-29-21-47-29-374"

class ContentHandler(LLMContentHandler):
    """
    Converts prompts to the JSON request body sent to the endpoint and
    extracts the generated text from the JSON response.
    """
    content_type = "application/json"
    accepts = "application/json"

    def transform_input(self, prompt: str, model_kwargs: dict) -> bytes:
        """
        Wrap the prompt as a single user message and encode it as UTF-8 JSON.

        model_kwargs is not read; the generation parameters are fixed here.
        """
        user_message = {"role": "user", "content": prompt}
        generation_params = {"max_new_tokens": 1000, "top_p": 0.6, "temperature": 0.1}
        payload = {"inputs": [[user_message]], "parameters": generation_params}

        return json.dumps(payload).encode("utf-8")

    def transform_output(self, output: bytes) -> str:
        """
        Read the response body, parse it as JSON and return the generated
        content of its first entry.
        """
        # `output` must provide .read(), despite the bytes annotation
        raw_body = output.read()
        parsed = json.loads(raw_body.decode("utf-8"))
        return parsed[0]["generation"]["content"]


content_handler = ContentHandler()

# NOTE(review): ContentHandler.transform_input does not read model_kwargs, so
# the temperature given here does not appear in the request payload it builds.
llm=SagemakerEndpoint(
    endpoint_name=endpoint_name,
    #credentials_profile_name="credentials-profile-name",
    region_name="us-west-2",
    model_kwargs={"temperature": 1e-10},
    endpoint_kwargs={"CustomAttributes": 'accept_eula=true'},
    content_handler=content_handler,
)

In [None]:
from langchain.docstore.document import Document
from langchain.chains.summarize import load_summarize_chain
from langchain.chains.mapreduce import MapReduceChain
from langchain.prompts import PromptTemplate
from langchain.text_splitter import CharacterTextSplitter
from langchain.chains import ReduceDocumentsChain, MapReduceDocumentsChain, LLMChain, StuffDocumentsChain

# https://python.langchain.com/docs/use_cases/summarization
# Number of question/answer pairs sampled per category for summarization
max_samples = 10
# NOTE(review): this default chain is not referenced again in the visible
# code; summarize_category runs map_reduce_chain instead.
chain = load_summarize_chain(llm, chain_type="map_reduce")

map_prompt_template = """
                      Summarize the following text contains one or more questions and answers. Write the key questions and their corresponding answers as bullet points in the following format:
                      - Q: ...
                        A: ...
                      - Q: ...
                        A: ...
                      Do not include any other text.
                      TEXT:
                      {text}
                      """

map_prompt = PromptTemplate(template=map_prompt_template, input_variables=["text"])

combine_prompt_template = """
                      Write a concise FAQ from the following list of questions and answers.
                      Combine similar questions into one bullet point, and ignore questions about a particular student / course / year.
                      Return your response in bullet points which covers the most common general questions and answers in the following format:
                      - Q: ...
                        A: ...
                      - Q: ...
                        A: ...
                      ```{text}```
                      BULLET POINT SUMMARY:
                      """

combine_prompt = PromptTemplate(
    template=combine_prompt_template, input_variables=["text"]
)

map_reduce_chain = load_summarize_chain(
    llm,
    chain_type="map_reduce",
    map_prompt=map_prompt,
    combine_prompt=combine_prompt,
    return_intermediate_steps=False,
)

def summarize_category(q_cat, a_cat):
    """
    Summarize a sample of question/answer pairs with the map-reduce chain.

    The pairs are drawn from the emails whose question is in category
    `q_cat` and whose answer is in category `a_cat`. At most `max_samples`
    pairs are used. Returns the output of the chain.
    """
    q_df = q_grouped.get_group(q_cat)
    qa_df = q_df.groupby("label_a").get_group(a_cat)

    # DataFrame.sample raises if asked for more rows than exist, so cap the
    # sample size at the number of available pairs
    sample = qa_df.sample(min(max_samples, len(qa_df)))

    docs = [
        Document(page_content=f"Question: {q}\n\nAnswer: {a}")
        for q, a in zip(sample["text_q"], sample["text_a"])
    ]

    return map_reduce_chain.run(docs)

def make_summaries(qs_df):
    """
    Generate a summary for every row of `qs_df`, print it, and store it in
    the "summary" column of that row.
    """
    # Use qs_df's own index labels; the previous code indexed through
    # pairs_df, which is None at module level
    for row_label, q_cat, a_cat in zip(qs_df.index, qs_df["q_cat"], qs_df["a_cat"]):
        summary = summarize_category(q_cat, a_cat)
        print(summary)

        qs_df.loc[row_label, "summary"] = summary

In [None]:
make_summaries(qs_df[0:1])