In [1]:
import pandas as pd

# Lift pandas' display truncation so frames are rendered with every row and column
pd.options.display.max_rows = None  # None removes the row limit
pd.options.display.max_columns = None  # None removes the column limit

In [2]:
import pickle
import json
from collections import Counter
import numpy as np
import pandas as pd

In [3]:
# Course metadata, looked up by course code (e.g. 'CS224V').
# The encoding is passed explicitly: JSON is UTF-8, and course titles contain
# non-ASCII characters, so relying on the platform's default encoding could
# garble or fail to decode them.
with open('new_course_metadata.json', encoding='utf-8') as f:
    course_metadata = json.load(f)

# Course embeddings, looked up by course code.
# NOTE(review): pickle.load can execute arbitrary code while unpickling --
# only load embedding files that come from a trusted source.
with open('embeddings64_2023-12-09.pkl', 'rb') as f:
    course_embeddings = pickle.load(f)

# The course list is taken from the embeddings' keys and not from course_metadata,
# because course_metadata has some "phantom courses" (courses that don't have metadata).
courses = list(course_embeddings.keys())
print(len(courses))
print(course_metadata['CS224V'].keys())

4154
dict_keys(['description', 'subject', 'gers', 'units', 'code', 'days', 'titleSearch', 'title', 'quartersOffered'])


In [4]:
# Spot-check one metadata record by displaying the title stored for a single course
course_metadata['CS224V']['title']

'Conversational Virtual Assistants with Deep Learning'

**Course Subject Distribution - FULL DIMENSION SIZE**

In [None]:
#labels = [course_metadata[course]['subject'] for course in courses]


# Walk the courses once and collect, in the same order, each course's subject,
# title and embedding vector.
subjects, embeddings_list, titles = [], [], []
for course in courses:
    meta = course_metadata[course]
    subject, title = meta['subject'], meta['title']
    # In this layout the vector is nested under ['embedding'][0]['embedding']
    vector = course_embeddings[course]['embedding'][0]['embedding']
    embeddings_list.append(vector)
    subjects.append(subject)
    titles.append(title)

# Tally how many courses belong to each subject
subjects_counter = Counter(subjects)
print("Number of unique subjects:", len(subjects_counter))
subjects_counter

**Course Subject Distribution - REDUCED DIMENSION SIZE**

In [4]:
# Walk the courses once and collect, in the same order, each course's subject,
# title and embedding.
subjects, embeddings_list, titles = [], [], []
for course in courses:
    meta = course_metadata[course]
    subject, title = meta['subject'], meta['title']
    # Here the pickle maps each course code directly to its embedding
    embeddings_list.append(course_embeddings[course])
    subjects.append(subject)
    titles.append(title)

# Tally how many courses belong to each subject
subjects_counter = Counter(subjects)
print("Number of unique subjects:", len(subjects_counter))
subjects_counter

Number of unique subjects: 179


Counter({'CS': 227,
         'PWR': 145,
         'HISTORY': 145,
         'MUSIC': 128,
         'ENGLISH': 110,
         'CEE': 103,
         'EE': 95,
         'POLISCI': 90,
         'MS&E': 83,
         'BIO': 83,
         'PHIL': 80,
         'MATH': 79,
         'EARTHSYS': 75,
         'CLASSICS': 74,
         'PSYCH': 70,
         'EDUC': 68,
         'ME': 65,
         'ECON': 62,
         'SOC': 57,
         'PHYSICS': 57,
         'HUMBIO': 53,
         'ENGR': 53,
         'SPECLANG': 52,
         'CHINLANG': 49,
         'ARTSTUDI': 48,
         'TAPS': 46,
         'CSRE': 45,
         'PHYSWELL': 45,
         'FEMGEN': 44,
         'INTNLREL': 43,
         'BIOE': 42,
         'STATS': 41,
         'COMM': 40,
         'PUBLPOL': 39,
         'ANTHRO': 36,
         'AA': 35,
         'CHEM': 35,
         'ARTHIST': 35,
         'COMPLIT': 34,
         'UAR': 34,
         'AMSTUD': 34,
         'PSYC': 33,
         'URBANST': 31,
         'INTLPOL': 31,
         'DESIGN'

# Clustering

In [5]:
from sklearn.cluster import AgglomerativeClustering
from sklearn.preprocessing import StandardScaler
import numpy as np

def get_clustering_model(embeddings_list, distance_threshold=0.1, metric='cosine', linkage='average', standardize=False):
    """
    Build an AgglomerativeClustering model from the given settings and fit it to the embeddings.

    The number of clusters is not fixed in advance (n_clusters=None); it follows
    from `distance_threshold`.

    Parameters:
    - embeddings_list (list of arrays): Embeddings to cluster; they are stacked row-wise into one matrix.
    - distance_threshold (float, optional): Linkage distance threshold passed to the model. Defaults to 0.1.
    - metric (str, optional): Distance metric passed to the model, e.g. 'euclidean' or 'cosine'. Defaults to 'cosine'.
    - linkage (str, optional): Linkage criterion passed to the model, e.g. 'ward', 'complete', 'average' or 'single'. Defaults to 'average'.
    - standardize (bool, optional): When True, the stacked matrix is passed through StandardScaler before clustering. Defaults to False.

    Returns:
    - model (AgglomerativeClustering): The fitted clustering model.
    - embeddings (numpy array): The matrix that was clustered (the standardized one when standardize=True).
    """
    # Stack the individual embeddings into a single 2-D matrix
    matrix = np.vstack(embeddings_list)

    # Optionally rescale the features before clustering
    if standardize:
        matrix = StandardScaler().fit_transform(matrix)

    # n_clusters=None lets distance_threshold decide where merging stops
    clusterer = AgglomerativeClustering(
        n_clusters=None,
        distance_threshold=distance_threshold,
        metric=metric,
        linkage=linkage,
    )
    clusterer.fit(matrix)

    return clusterer, matrix


**Clustering Settings**

In [6]:
# Settings for the first, hand-picked clustering run
metric = 'euclidean'
linkage = "average"
standardize = False
distance_threshold = 3  # passed to the model as its linkage distance threshold

In [169]:
# Fit one model with the settings chosen above; `embeddings` receives the
# matrix that was actually clustered.
model, embeddings = get_clustering_model(
    embeddings_list=embeddings_list,
    distance_threshold=distance_threshold,
    metric=metric,
    linkage=linkage,
    standardize=standardize,
)

In [170]:
# Cluster labels: one row per course together with the cluster ("Group") the
# fitted model assigned to it.
# The subject column is spelled "Subject" so that it agrees with the frame
# built later in the "Making the dataframe" section.
df = pd.DataFrame({
    "Course": courses,
    "Subject": subjects,
    "Group": model.labels_
})

# Sanity check: there should be one label per course
print(len(model.labels_))

4154


In [171]:
# Show how many distinct cluster labels the fitted model produced
print("Number of", len(set(model.labels_)), sep="\n")

Number of
1


**Let's do a binary search to find a good distance threshold**

In [7]:
# Shared state read by the threshold search defined below
distance_dict = {}  # cache of fitted models, keyed by the distance threshold tried
standardize = False
linkage = "ward"  # alternative: "average"
metric = 'euclidean'  # alternative: 'cosine'

In [8]:
def binary_search_for_optimal_distance(low, high, target_clusters, tolerance=3, max_iter=20, similar_distance_tolerance=1e-5):
    """
    Performs a binary search over the distance threshold of the agglomerative
    clustering until the number of clusters is close enough to the target.

    Reads the module-level `embeddings_list`, `metric`, `linkage` and `standardize`
    and fits models through `get_clustering_model`. Every fitted model is stored in
    the module-level `distance_dict` under the distance that produced it, together
    with its number of clusters and the metric/linkage/standardize settings used.
    A stored model is reused only when its distance is within
    `similar_distance_tolerance` of the candidate AND it was fitted with the
    current settings, so changing the settings between calls never returns a
    stale model.

    The search assumes that a larger distance threshold yields fewer clusters.

    :param low: Lower bound of the distance value.
    :param high: Upper bound of the distance value.
    :param target_clusters: Target number of clusters to find.
    :param tolerance: Accepted absolute difference between the number of clusters found and the target.
    :param max_iter: Maximum number of iterations to prevent infinite loops.
    :param similar_distance_tolerance: Maximum gap between a candidate distance and a stored one for the stored model to be reused.
    :return: Tuple (model, distance) for the first suitable distance, or (None, None) if none is found.
    """
    # Snapshot of the settings the models in this search are fitted with
    config = {'metric': metric, 'linkage': linkage, 'standardize': standardize}

    iterations = 0
    while low <= high and iterations < max_iter:
        mid = round((low + high) / 2.0, 6)
        print(f"{iterations} - Trying:", mid)

        # Look for a stored model fitted at (nearly) this distance with the same settings
        close_distance = None
        for d, entry in distance_dict.items():
            same_config = all(entry.get(key) == value for key, value in config.items())
            if same_config and abs(d - mid) < similar_distance_tolerance:
                close_distance = d
                break

        if close_distance is not None:
            print("Using stored model for distance:", close_distance)
            model = distance_dict[close_distance]['model']
            num_clusters = distance_dict[close_distance]['num_clusters']
        else:
            print("Calculating model for distance!")
            model, _ = get_clustering_model(embeddings_list=embeddings_list,
                                            distance_threshold=mid,
                                            metric=metric,
                                            linkage=linkage,
                                            standardize=standardize)

            num_clusters = len(set(model.labels_))

            # Store the model, its cluster count and the settings it was fitted with
            distance_dict[mid] = {'model': model, 'num_clusters': num_clusters, **config}

        print("Num of clusters:", num_clusters)
        if abs(num_clusters - target_clusters) <= tolerance:
            print("Found good distance! Num of clusters:", num_clusters, "\nDistance:", mid, "\n")
            return model, mid  # Found a suitable distance

        if mid == low or mid == high:
            # After rounding, the midpoint coincides with a bound: the interval
            # cannot shrink any further, so more iterations would repeat this one.
            print("Search interval collapsed without reaching the target.")
            break

        if num_clusters < target_clusters:
            # Too few clusters: the threshold is too large
            high = mid
        else:
            # Too many clusters: the threshold is too small
            low = mid
        iterations += 1
    return None, None  # Return None if no suitable distance is found within the max iterations

# We can now call this function with the user's specified bounds.
#model, optimal_distance = binary_search_for_optimal_distance(0.01, 0.3, 30)

### For Metric = Cosine Similarity

In [20]:
# Search thresholds between 0.01 and 0.3 for a clustering of about 30 groups.
# NOTE(review): with linkage "ward" this run fails, because ward only accepts
# euclidean distances (see the ValueError recorded below).
model, optimal_distance = binary_search_for_optimal_distance(low=0.01, high=0.3, target_clusters=30)

0 - Trying: 0.155
Calculating model for distance!


ValueError: cosine was provided as metric. Ward can only work with euclidean distances.

### For Metric = Euclidean Distance

In [9]:
# Search thresholds between 0.001 and 3 for a clustering of about 20 groups (within 2 of the target)
model, optimal_distance = binary_search_for_optimal_distance(
    low=0.001,
    high=3,
    target_clusters=20,
    tolerance=2,
)

0 - Trying: 1.5005
Calculating model for distance!
Num of clusters: 35
1 - Trying: 2.25025
Calculating model for distance!
Num of clusters: 16
2 - Trying: 1.875375
Calculating model for distance!
Num of clusters: 24
3 - Trying: 2.062812
Calculating model for distance!
Num of clusters: 20
Found good distance! Num of clusters: 20 
Distance: 2.062812 



## Making the dataframe

In [10]:
# One row per course: its code, subject, title and the cluster ("Group")
# assigned by the model returned from the threshold search.
df = pd.DataFrame(
    data={"Course": courses, "Subject": subjects, "Title": titles, "Group": model.labels_},
)

# Summarise the run, then show how many courses landed in each cluster
print(f"Distance: {optimal_distance}; \tLinkage: {linkage}; \tMetric: {metric}; \nClusters: {model.n_clusters_}")
df.groupby("Group")["Course"].count()

Distance: 2.062812; 	Linkage: ward; 	Metric: euclidean; 
Clusters: 20


Group
0     281
1     536
2     407
3     174
4     165
5     281
6     243
7     238
8     269
9      78
10    128
11    195
12    191
13    185
14     25
15    171
16    184
17    187
18    156
19     60
Name: Course, dtype: int64

In [14]:
# Inspect the courses that were assigned to cluster 14
df.loc[df["Group"] == 14]

Unnamed: 0,Course,Subject,Title,Group
299,UAR101P,UAR,Frosh 101: Ujamaa (Redwood),14
421,UAR194A,UAR,Frosh 101 and Transfer 101: Leader Training,14
464,UAR101U,UAR,Frosh 101: Castaño (Wisteria),14
578,UAR101A,UAR,Frosh 101: Burbank (Aspen),14
689,UAR101D,UAR,Frosh 101: Larkin (Aspen),14
1001,UAR101L,UAR,Frosh 101: Schiff (Magnolia),14
1248,UAR101H,UAR,Frosh 101: Rinconada (Hyperion),14
1342,UAR101T,UAR,Frosh 101: Okada (Sequoia),14
1428,UAR101S,UAR,Frosh 101: Junipero (Sequoia),14
1726,UAR101N,UAR,Frosh 101: Muwekma-Tah-Ruk (Olive),14


BAD ATTEMPT:

In [None]:
# Disabled first attempt at the Excel export, kept as a string literal so it never runs.
# NOTE(review): it relies on DataFrame.append, which is not available in pandas 2.x;
# the working version in the next section uses pd.concat instead. Presumably this
# cell can be deleted -- confirm.
'''# Now, let's export each group to a separate sheet in an Excel workbook
with pd.ExcelWriter('groups.xlsx') as writer:
    for group in sorted(df['Group'].unique()):
        group_df = df[df['Group'] == group]
        group_df.to_excel(writer, sheet_name=f'Group_{group}', index=False)

    # Additionally, create a single sheet with all entries, separated by groups with 4 empty rows apart
    all_groups_df = pd.DataFrame()
    for group in sorted(df['Group'].unique()):
        group_df = df[df['Group'] == group]
        all_groups_df = all_groups_df.append(group_df, ignore_index=True)
        # Add 4 empty rows after each group
        all_groups_df = all_groups_df.append(pd.DataFrame([['']*len(df.columns)] * 4, columns=df.columns), ignore_index=True)

    all_groups_df.to_excel(writer, sheet_name='All_Groups', index=False)'''

## Exporting an Excel file containing each course by group

In [15]:
import pandas as pd

# Ensure you have your DataFrame 'df' loaded correctly here
# For example:
# df = pd.read_csv('your_data.csv')  # or however you are creating your DataFrame

# Export the clustering to 'groups.xlsx':
#   * one sheet per group ('Group_<label>'), and
#   * an 'All_Groups' sheet with every group stacked, each followed by 4 empty rows.
# The frame is split once with groupby (groups come out in ascending label order)
# and the stacked sheet is assembled with a single concat, instead of re-filtering
# `df` for every group and growing a DataFrame inside the loop.
empty_rows = pd.DataFrame([[''] * len(df.columns)] * 4, columns=df.columns)

with pd.ExcelWriter('groups.xlsx') as writer:
    stacked_parts = []
    for group, group_df in df.groupby('Group'):
        group_df.to_excel(writer, sheet_name=f'Group_{group}', index=False)
        # Queue the group followed by its 4 separator rows for the combined sheet
        stacked_parts.extend([group_df, empty_rows])

    if stacked_parts:
        all_groups_df = pd.concat(stacked_parts, ignore_index=True)
    else:
        # pd.concat rejects an empty list; fall back to an empty sheet with the same columns
        all_groups_df = pd.DataFrame(columns=df.columns)

    all_groups_df.to_excel(writer, sheet_name='All_Groups', index=False)


: 

In [46]:
# NOTE(review): repr(df) is a str, so this attribute lookup raises AttributeError
# (see the recorded output below). Looks like a leftover scratch cell -- confirm and remove.
repr(df).groupby

AttributeError: 'str' object has no attribute 'groupby'

In [45]:
# Count how many courses were assigned to each cluster
print("Frequency by Cluster:")
df.groupby("Group")["Course"].count()

Frequency by Cluster:


Group
0     134
1      95
2     120
3     103
4      85
5      86
6     135
7      63
8      83
9      91
10    128
11     98
12    107
13    117
14     65
15     77
16     73
17     88
18     65
19     78
20     29
21     57
22     76
23     93
24     52
25     70
26     87
27     45
28    113
29     75
30     75
31     72
32     46
33     83
34     40
35     96
36     28
37     65
38     73
39     37
40     60
41     60
42     47
43     34
44     61
45     68
46     44
47     38
48     39
49     10
50     67
51     59
52     27
53     67
54     22
55     36
56     34
57     55
58     48
59     25
60     50
Name: Course, dtype: int64

In [None]:
# Display the distance threshold returned by the most recent binary search
optimal_distance

In [None]:
# NOTE(review): unfinished cell -- the looked-up value is discarded.
# `embeddings` is the second value returned by get_clustering_model earlier in this
# notebook (the stacked matrix), so indexing it with a course code presumably fails;
# this may have been meant to read `course_embeddings[course]` -- confirm.
for course in courses:
    embeddings[course]