In [1]:
# Pretrained sentence-embedding model; later cells call `model.encode` on it.
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('paraphrase-distilroberta-base-v1')

# Data handling.
import pandas as pd
import numpy as np

# Plotting (matplotlib/seaborn) and dimensionality reduction (umap) for the
# scatter plots near the end of the notebook.
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()  # for plot styling
import umap


ModuleNotFoundError: No module named 'sentence_transformers'

In [11]:
# Smoke test: encode three sample sentences and print the resulting vectors.
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('paraphrase-distilroberta-base-v1')

# encode() receives the sentences as a list of strings.
sentences = [
    'This framework generates embeddings for each input sentence, and I love badeed who needs weed. Badeed loves weed.',
    'Sentences are passed as a list of string.',
    'The quick brown fox jumps over the lazy dog.',
]
sentence_embeddings = model.encode(sentences)

# Header line followed by the array on the next line.
print("Sentence embeddings:", sentence_embeddings, sep="\n")

Sentence embeddings:
[[-0.06622078  0.64393383 -0.07833593 ...  0.25022     0.34524366
  -0.15415968]
 [-0.25747904  0.24629407  0.09469662 ... -0.14210282  0.14410551
   0.51566   ]
 [ 0.05094916  0.24235822  0.15774004 ... -0.20625325 -0.08491809
   0.06364132]]


In [12]:
# Inspect the dimensions of the array returned by model.encode for the sample sentences.
sentence_embeddings.shape

(3, 768)

## Import Data

In [None]:
# Load the survey export. The path is relative, so the CSV must sit in the
# notebook's working directory.
df = pd.read_csv("Module 7 Reflection Survey Student Analysis Report.csv")
# Preview the first five rows.
df.head(5)

## Generate embeddings
First, combine all the student responses into a single list, then pass that list to the sentence-embedding model.

In [None]:
# Collect every non-missing answer to the "biggest challenge" question as plain strings.
# Non-breaking spaces (\xa0) are replaced by a regular space rather than deleted:
# deleting them glues neighbouring words together ("time\xa0management" ->
# "timemanagement"), which changes what the embedding model sees.
chal = [
    str(challenge).replace("\xa0", " ")
    for challenge in df['4134312: What was your biggest challenge this past week? This can include in-class activities, assignments, prep work, studying, time management, motivation, and so on.']
    if str(challenge) != "nan"  # missing answers are NaN, whose str() is "nan"
]

# Show the size and a short preview instead of dumping every response.
print(len(chal), "responses")
chal[:5]

In [None]:
# Provide the collected responses (`chal`) to the sentence-embedding model.
# NOTE(review): presumably one embedding row per response, in the same order —
# the zip() over chal and sentence_embeddings in the next cell relies on that; confirm.

sentence_embeddings = model.encode(chal)

In [None]:
# Print every response next to its embedding vector, separated by a blank line.
for idx in range(min(len(chal), len(sentence_embeddings))):
    print("Sentence: ", chal[idx])
    print("Embedding: ", sentence_embeddings[idx])
    print("")

## K-Means Clustering

In [None]:
from sklearn.cluster import KMeans

In [None]:
#Computes how far each response in one cluster lies from a given centre point
def k_mean_distance(data, cx, cy, i_centroid, cluster_labels):
    """Return the Euclidean distances from (cx, cy) to the points of one cluster.

    Parameters
    ----------
    data : array indexed with a boolean mask; each selected row must unpack
        into exactly two coordinates (x, y).
    cx, cy : coordinates of the reference point (the cluster centre).
    i_centroid : label of the cluster whose points are measured.
    cluster_labels : array of labels compared element-wise with ``i_centroid``.

    Returns
    -------
    list
        One distance per selected row, in row order; empty when no row
        carries the label ``i_centroid``.
    """
    in_cluster = cluster_labels == i_centroid
    distances = []
    for point_x, point_y in data[in_cluster]:
        dx = point_x - cx
        dy = point_y - cy
        distances.append(np.sqrt(dx**2 + dy**2))
    return distances


In [None]:
#Silhouette score is used to calculate the optimal number of clusters a corpus should have
from sklearn.metrics import silhouette_score
def silhouette(corpus, kmax=10, random_state=42):
    """Pick the number of k-means clusters with the highest silhouette score.

    Fits one KMeans model for every k from 2 up to ``kmax`` and scores the
    resulting labels with the Euclidean silhouette score.

    Parameters
    ----------
    corpus : array-like of shape (n_samples, n_features)
        The vectors to cluster (here: sentence embeddings).
    kmax : int, default 10
        Largest k to try. It is capped at ``n_samples - 1`` because the
        silhouette score is only defined for 2 <= n_clusters <= n_samples - 1.
    random_state : int or None, default 42
        Seed passed to KMeans so repeated runs select the same k.
        Pass ``None`` for an unseeded run.

    Returns
    -------
    int
        The k with the best silhouette score (the smallest such k on ties).

    Raises
    ------
    ValueError
        If fewer than 3 samples are given or ``kmax`` is below 2, so that no
        valid k exists.
    """
    n_samples = len(corpus)
    largest_k = min(kmax, n_samples - 1)
    if largest_k < 2:
        raise ValueError(
            "silhouette() needs at least 3 samples and kmax >= 2 "
            "(got %d samples, kmax=%d)" % (n_samples, kmax)
        )

    sil = []
    for k in range(2, largest_k + 1):
        labels = KMeans(n_clusters=k, random_state=random_state).fit_predict(corpus)
        sil.append(silhouette_score(corpus, labels, metric='euclidean'))
    print(sil)
    # sil[0] belongs to k == 2, hence the offset.
    return sil.index(max(sil)) + 2

In [None]:
# Choose the number of clusters with the best silhouette score for these embeddings.
optimal_k = silhouette(sentence_embeddings)
print(optimal_k)

In [None]:
#Method for generating the k-means clusters.
def kmean_cluster(num_clusters, corpus, embeddings,
                  output_path="kmeanscluster_withdistance_mod7.csv",
                  random_state=42):
    """Cluster ``embeddings`` with k-means, print each cluster and save a CSV report.

    The model is fitted exactly once, so the returned labels, the printed
    clusters and the CSV all describe the same clustering. Each CSV row pairs
    a response with its own cluster and its own distance to the nearest
    centroid.

    Parameters
    ----------
    num_clusters : int
        Number of k-means clusters.
    corpus : sequence of str
        The responses; ``corpus[i]`` must correspond to ``embeddings[i]``.
    embeddings : array-like of shape (n_samples, n_features)
        One vector per response.
    output_path : str, default "kmeanscluster_withdistance_mod7.csv"
        Where the CSV report is written.
    random_state : int or None, default 42
        Seed passed to KMeans for repeatable clusters; ``None`` for an
        unseeded run.

    Returns
    -------
    numpy.ndarray
        Cluster label of every response, in the order of ``corpus``.

    Raises
    ------
    ValueError
        If ``corpus`` and ``embeddings`` differ in length.
    """
    if len(corpus) != len(embeddings):
        raise ValueError(
            "corpus and embeddings must have the same length "
            "(got %d and %d)" % (len(corpus), len(embeddings))
        )

    clustering_model = KMeans(n_clusters=num_clusters, random_state=random_state)

    # Single fit: fit_transform returns each sample's distance to every
    # centroid, and labels_ belongs to that same fit.
    centroid_distances = clustering_model.fit_transform(embeddings)
    y_pred = clustering_model.labels_
    nearest_distance = np.min(centroid_distances, axis=1)

    # Group sample indices (not the sentences themselves) by cluster so the
    # matching distance can be looked up when the report rows are built.
    members = [[] for _ in range(num_clusters)]
    for sentence_id, cluster_id in enumerate(y_pred):
        members[cluster_id].append(sentence_id)

    reflection = []
    cluster2 = []
    distance = []
    for i, sentence_ids in enumerate(members):
        print("_______________Cluster ", i, "___________________")
        for sentence_id in sentence_ids:
            sent = corpus[sentence_id]
            print(sent, "\n")
            reflection.append(sent)
            cluster2.append("cluster " + str(i))
            distance.append(nearest_distance[sentence_id])
        print("")

    #Download a spreadsheet of the generated clusters
    report = pd.DataFrame(
        {"student_reflection": reflection, "cluster #": cluster2, "distance": distance}
    )
    report.to_csv(output_path)

    return y_pred

In [None]:
# Cluster the responses with the k selected above. kmean_cluster also prints
# every cluster and writes kmeanscluster_withdistance_mod7.csv as a side effect.
y_pred = kmean_cluster(optimal_k, chal, sentence_embeddings)

## Code after this point is for generating a plot (Work in progress, feel free to ignore)
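This section uses UMAP to reduce the dimensionality of the sentence embeddings so that the k-means clusters can be drawn as a scatter plot of the first two UMAP components.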

In [None]:
# Show the value returned by kmean_cluster (the k-means predictions for the embeddings).
print(y_pred)

In [None]:
# Project the sentence embeddings with UMAP for plotting.
# UMAP is stochastic; a fixed random_state keeps the layout the same between
# runs, so the scatter plot below is reproducible after Restart & Run All.
umap_reducer = umap.UMAP(random_state=42)
reduced_dim = umap_reducer.fit_transform(sentence_embeddings)
reduced_dim.shape


In [None]:
# One colour per cluster actually present in y_pred. A palette list whose
# length differs from the number of hue levels makes seaborn raise or warn,
# depending on the version, so the size is derived rather than hard-coded.
n_clusters = len(set(y_pred))
palette = sns.color_palette(n_colors=n_clusters)
sns.set(rc={'figure.figsize':(16,12)})
# x and y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
sns.scatterplot(x=reduced_dim[:,0], y=reduced_dim[:,1],
                hue=y_pred, palette=palette, s=80)
# Axis limits are left to autoscaling: UMAP coordinates change from fit to fit,
# so hand-tuned limits can crop some or all of the points.
# NOTE(review): the title says "Module 6" but the CSV loaded above is the
# Module 7 report — confirm which is intended.
plt.title('Module 6 - Reflections K-Means Clustering')
plt.show()


## Module 7

In [None]:
# Load the Module 7 survey export and collect every non-missing answer to the
# "biggest challenge" question as plain strings.
df = pd.read_csv("Module 7 Reflection Survey Student Analysis Report.csv")

# Non-breaking spaces (\xa0) are replaced by a regular space rather than deleted:
# deleting them glues neighbouring words together, which changes what the
# embedding model sees.
chal = [
    str(challenge).replace("\xa0", " ")
    for challenge in df['4134312: What was your biggest challenge this past week? This can include in-class activities, assignments, prep work, studying, time management, motivation, and so on.']
    if str(challenge) != "nan"  # missing answers are NaN, whose str() is "nan"
]

In [None]:
# Encode the responses in `chal`; this rebinds `sentence_embeddings`, replacing any earlier value.
sentence_embeddings = model.encode(chal)

In [None]:
# Print every response next to its embedding vector, separated by a blank line.
for idx in range(min(len(chal), len(sentence_embeddings))):
    print("Sentence: ", chal[idx])
    print("Embedding: ", sentence_embeddings[idx])
    print("")

In [None]:
# Choose the number of clusters with the best silhouette score for these embeddings.
optimal_k = silhouette(sentence_embeddings)
print(optimal_k)

In [None]:
# Cluster the responses with the selected k. This writes to the same
# kmeanscluster_withdistance_mod7.csv file as the earlier kmean_cluster call.
y_pred = kmean_cluster(optimal_k, chal, sentence_embeddings)

In [None]:
# Project the sentence embeddings with UMAP for plotting.
# UMAP is stochastic; a fixed random_state keeps the layout the same between
# runs, so the scatter plot below is reproducible after Restart & Run All.
umap_reducer = umap.UMAP(random_state=42)
reduced_dim = umap_reducer.fit_transform(sentence_embeddings)
reduced_dim.shape


In [None]:
# One colour per cluster actually present in y_pred. A palette list whose
# length differs from the number of hue levels makes seaborn raise or warn,
# depending on the version, so the size is derived rather than hard-coded.
n_clusters = len(set(y_pred))
palette = sns.color_palette(n_colors=n_clusters)
sns.set(rc={'figure.figsize':(16,12)})
# x and y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
sns.scatterplot(x=reduced_dim[:,0], y=reduced_dim[:,1],
                hue=y_pred, palette=palette, s=80)
# Axis limits are left to autoscaling: UMAP coordinates change from fit to fit,
# so hand-tuned limits can crop some or all of the points.
plt.title('Module 7 - Reflections K-Means Clustering')
plt.show()
