## Import Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import sqlite3
import spacy
from scipy.spatial.distance import euclidean
from sklearn.cluster import MiniBatchKMeans 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from collections import Counter
from dotenv import load_dotenv
load_dotenv()

# Load the SpaCy German language model
# nlp = spacy.load("de_core_news_sm")
nlp = spacy.load("de_core_news_lg")

In [None]:
def german_stopwords_plain():
    """Return the stop words listed in ``./german_stopwords_plain.txt``.

    Lines whose first character is ``;`` are treated as comments and
    skipped. Every remaining line is split on whitespace, so a line may
    contribute zero, one or several words.

    Returns:
        list[str]: the words in file order (duplicates are kept).

    Raises:
        OSError: if the file cannot be opened.
        UnicodeDecodeError: if the file is not valid UTF-8.
    """
    file_path = './german_stopwords_plain.txt'

    words = []
    # Pin the encoding: the platform default codec (e.g. cp1252 on Windows)
    # would garble umlauts, and the garbled words would never match a token.
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            if line.startswith(';'):
                continue
            words.extend(line.split())

    return words

In [None]:
def german_stopwords_full():
    """Return the stop words listed in ``./german_stopwords_full.txt``.

    Lines whose first character is ``;`` are treated as comments and
    skipped. Every remaining line is split on whitespace, so a line may
    contribute zero, one or several words.

    Returns:
        list[str]: the words in file order (duplicates are kept).

    Raises:
        OSError: if the file cannot be opened.
        UnicodeDecodeError: if the file is not valid UTF-8.
    """
    file_path = './german_stopwords_full.txt'

    words = []
    # Pin the encoding: the platform default codec (e.g. cp1252 on Windows)
    # would garble umlauts, and the garbled words would never match a token.
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            if line.startswith(';'):
                continue
            words.extend(line.split())

    return words

## Functions - Regulated

### Optimal Clusters - Regulated

In [None]:
def find_optimal_clusters_regulated(data, max_k):
    """Fit MiniBatchKMeans for k = 2, 4, ..., max_k and plot the elbow curve.

    Each model is fitted on ``data`` with ``init_size=512``,
    ``batch_size=1024`` and ``random_state=500``; its ``inertia_`` is
    recorded and a progress line is printed. The recorded values are then
    drawn against k on a new figure with the y axis labelled 'SSE'.

    Args:
        data: feature matrix passed straight to ``MiniBatchKMeans.fit``.
        max_k: largest cluster count to try (only even counts are tried).

    Returns:
        None. The figure is created but not explicitly shown.
    """
    candidate_ks = range(2, max_k + 1, 2)

    inertias = []
    for n_clusters in candidate_ks:
        model = MiniBatchKMeans(
            n_clusters=n_clusters,
            init_size=512,
            batch_size=1024,
            random_state=500,
        )
        model.fit(data)
        inertias.append(model.inertia_)
        print('Fit {} clusters'.format(n_clusters))

    _, axis = plt.subplots(1, 1)
    axis.plot(candidate_ks, inertias, marker='o')
    axis.set_xlabel('Cluster Centers')
    axis.set_xticks(candidate_ks)
    axis.set_xticklabels(candidate_ks)
    axis.set_ylabel('SSE')
    axis.set_title('SSE by Cluster Center Plot')

### PCA - TSNE - Regulated

In [None]:
def plot_tsne_pca_regulated(data, labels):
    """Plot a random sample of clustered documents in PCA and t-SNE space.

    Up to 10000 rows of ``data`` are sampled without replacement, densified
    and reduced to two PCA components; t-SNE is then run on that PCA output.
    Up to 100 of the sampled rows are drawn on two side-by-side scatter
    plots, one colour and legend entry per cluster label.

    Args:
        data: sparse matrix supporting row indexing and ``.todense()``.
        labels: integer cluster label for every row of ``data``.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    labels = np.asarray(labels)
    n_rows = data.shape[0]

    # np.random.choice(..., replace=False) raises ValueError when asked for
    # more items than exist, so cap both sample sizes at what is available.
    sample_size = min(10000, n_rows)
    plot_size = min(100, sample_size)

    sampled_rows = np.random.choice(n_rows, size=sample_size, replace=False)
    dense_sample = np.asarray(data[sampled_rows, :].todense())

    pca = PCA(n_components=2).fit_transform(dense_sample)
    # scikit-learn requires perplexity < n_samples; only matters for tiny inputs.
    tsne = TSNE(perplexity=min(50, sample_size - 1)).fit_transform(pca)

    shown = np.random.choice(sample_size, size=plot_size, replace=False)
    shown_labels = labels[sampled_rows][shown]

    # Divide by the number of clusters rather than by the largest label:
    # hsv is cyclic, so i/max_label would give the first and last cluster the
    # same red, and it divides by zero when there is a single cluster.
    n_colors = int(labels.max()) + 1
    colors = [cm.hsv(i / n_colors) for i in range(n_colors)]

    _, axes = plt.subplots(1, 2, figsize=(14, 6))
    panels = (
        (axes[0], pca[shown], 'PCA Cluster Plot'),
        (axes[1], tsne[shown], 'TSNE Cluster Plot'),
    )
    for axis, points, title in panels:
        for cluster in np.unique(shown_labels):
            members = shown_labels == cluster
            axis.scatter(points[members, 0], points[members, 1],
                         c=[colors[cluster]], label=cluster)
        axis.set_title(title)
        axis.legend()

    plt.show()


## Functions - Unregulated

### Optimal Clusters - Unregulated

In [None]:
def find_optimal_clusters_unregulated(data, max_k):
    """Fit MiniBatchKMeans for k = 2, 4, ..., max_k and plot the elbow curve.

    Each model is fitted on ``data`` with ``init_size=1024``,
    ``batch_size=2048`` and ``random_state=1000``; its ``inertia_`` is
    recorded and a progress line is printed. The recorded values are then
    drawn against k on a new figure with the y axis labelled 'SSE'.

    Args:
        data: feature matrix passed straight to ``MiniBatchKMeans.fit``.
        max_k: largest cluster count to try (only even counts are tried).

    Returns:
        None. The figure is created but not explicitly shown.
    """
    cluster_counts = range(2, max_k + 1, 2)
    kmeans_options = dict(init_size=1024, batch_size=2048, random_state=1000)

    sse_per_k = []
    for k in cluster_counts:
        fitted = MiniBatchKMeans(n_clusters=k, **kmeans_options).fit(data)
        sse_per_k.append(fitted.inertia_)
        print('Fit {} clusters'.format(k))

    _, elbow_ax = plt.subplots(1, 1)
    elbow_ax.plot(cluster_counts, sse_per_k, marker='o')
    elbow_ax.set_xlabel('Cluster Centers')
    elbow_ax.set_xticks(cluster_counts)
    elbow_ax.set_xticklabels(cluster_counts)
    elbow_ax.set_ylabel('SSE')
    elbow_ax.set_title('SSE by Cluster Center Plot')


### PCA - TSNE - Unregulated

In [None]:
def plot_tsne_pca_unregulated(data, labels):
    """Plot a random sample of clustered documents in PCA and t-SNE space.

    Up to 60000 rows of ``data`` are sampled without replacement, densified
    and reduced to two PCA components; t-SNE is then run on that PCA output.
    Up to 10000 of the sampled rows are drawn on two side-by-side scatter
    plots, one colour and legend entry per cluster label.

    Args:
        data: sparse matrix supporting row indexing and ``.todense()``.
        labels: integer cluster label for every row of ``data``.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    labels = np.asarray(labels)
    n_rows = data.shape[0]

    # np.random.choice(..., replace=False) raises ValueError when asked for
    # more items than exist, so cap both sample sizes at what is available.
    sample_size = min(60000, n_rows)
    plot_size = min(10000, sample_size)

    sampled_rows = np.random.choice(n_rows, size=sample_size, replace=False)
    dense_sample = np.asarray(data[sampled_rows, :].todense())

    pca = PCA(n_components=2).fit_transform(dense_sample)
    # scikit-learn requires perplexity < n_samples; only matters for tiny inputs.
    tsne = TSNE(perplexity=min(50, sample_size - 1)).fit_transform(pca)

    shown = np.random.choice(sample_size, size=plot_size, replace=False)
    shown_labels = labels[sampled_rows][shown]

    # Divide by the number of clusters rather than by the largest label:
    # hsv is cyclic, so i/max_label would give the first and last cluster the
    # same red, and it divides by zero when there is a single cluster.
    n_colors = int(labels.max()) + 1
    colors = [cm.hsv(i / n_colors) for i in range(n_colors)]

    _, axes = plt.subplots(1, 2, figsize=(14, 6))
    panels = (
        (axes[0], pca[shown], 'PCA Cluster Plot'),
        (axes[1], tsne[shown], 'TSNE Cluster Plot'),
    )
    for axis, points, title in panels:
        for cluster in np.unique(shown_labels):
            members = shown_labels == cluster
            axis.scatter(points[members, 0], points[members, 1],
                         c=[colors[cluster]], label=cluster)
        axis.set_title(title)
        axis.legend()

    plt.show()
    


## DB Connection and Retrieval

In [None]:
# Open the SQLite database file (path is relative to the working directory).
# NOTE(review): the connection is not closed in this cell — confirm whether
# later cells still need it before adding a close().
sqlite_conn = sqlite3.connect('weiterbildung_new_data.db')

# Load the four columns named in the query from weiterbildung_data.
df = pd.read_sql_query("SELECT angebot_id, angebot_titel, angebot_inhalt, bildungsart_bezeichnung FROM weiterbildung_data", sqlite_conn)
# Keep a single row per angebot_id (the first occurrence is retained).
df = df.drop_duplicates(subset=['angebot_id'])
# Bare expression: displays the frame when run as a notebook cell.
df


### Types of Education | bildungsart_bezeichnung

In [None]:
# Count how many rows fall under each bildungsart_bezeichnung value and draw
# the counts as a horizontal bar chart.
educationTypes = df.groupby(['bildungsart_bezeichnung']).bildungsart_bezeichnung.value_counts()
educationTypes.plot.barh()

### Filter Regulated DF

In [None]:
# Keep only the offers whose education type contains the regulated category.
# NOTE(review): the "√§" sequence in the literal looks like a mis-decoded "ä"
# — confirm it matches the values actually stored in the database.
#   regex=False: the category is a literal string, not a pattern.
#   na=False:    rows with a missing education type count as non-matches
#                instead of putting NaN into the boolean mask.
#   .copy():     a 'lemmatized_text' column is assigned to this frame later;
#                an independent copy keeps that from targeting a slice of df.
df_regulated = df[
    df["bildungsart_bezeichnung"].str.contains(
        "Gesetzlich/gesetzes√§hnlich geregelte Fortbildung/Qualifizierung",
        regex=False,
        na=False,
    )
].copy()
df_regulated

### Filter Unregulated DF

In [None]:
# Keep only the offers whose education type is exactly the unregulated
# category. A 'lemmatized_text' column is assigned to this frame later, so
# take an independent copy instead of working on a slice of df.
df_unregulated = df[df["bildungsart_bezeichnung"] == "Fortbildung/Qualifizierung"].copy()
# df_unregulated = df_unregulated.iloc[:8000]
df_unregulated

## Lemmatizer

In [None]:
# Function to lemmatize text using SpaCy
def lemmatize_text(text):
    """Return ``text`` with every token replaced by its spaCy lemma.

    The text is run through the module-level ``nlp`` pipeline and the
    lemmas of all tokens are joined with single spaces.

    Args:
        text: the document to lemmatize. Anything that is not a ``str``
            (e.g. ``None`` or ``NaN`` from a missing database value) is
            treated as an empty document.

    Returns:
        str: the space-joined lemmas, or ``""`` for non-string input.
    """
    # The pipeline only accepts strings; a missing value would otherwise
    # abort the whole DataFrame.apply() call this function is used in.
    if not isinstance(text, str):
        return ""
    doc = nlp(text)
    return " ".join(token.lemma_ for token in doc)

In [None]:
# Apply lemmatization to the regulated dataset: every 'angebot_inhalt' value
# is passed through lemmatize_text and stored in a new 'lemmatized_text' column.
df_regulated['lemmatized_text'] = df_regulated['angebot_inhalt'].apply(lemmatize_text)
df_regulated

In [None]:
# Apply lemmatization to the unregulated dataset: every 'angebot_inhalt' value
# is passed through lemmatize_text and stored in a new 'lemmatized_text' column.
df_unregulated['lemmatized_text'] = df_unregulated['angebot_inhalt'].apply(lemmatize_text)
df_unregulated

## Get Top Keywords

In [None]:
def get_top_keywords(data, clusters, labels, n_terms):
    """Print the ``n_terms`` highest-weighted terms of every cluster.

    ``data`` is densified, its rows are grouped by ``clusters`` and averaged
    column-wise. For each cluster a header line is printed, followed by the
    comma-joined terms whose mean weight is largest, listed from the lowest
    to the highest of those weights.

    Args:
        data: sparse document-term matrix (must support ``.todense()``).
        clusters: cluster label for every row of ``data``.
        labels: term for every column of ``data``, indexable by position.
        n_terms: how many terms to print per cluster.

    Returns:
        None. The result is written to standard output.
    """
    dense_rows = pd.DataFrame(data.todense())
    cluster_means = dense_rows.groupby(clusters).mean()

    for cluster_id, mean_weights in cluster_means.iterrows():
        print('\nCluster {}'.format(cluster_id))
        # argsort is ascending, so the tail holds the heaviest columns.
        top_columns = np.argsort(mean_weights)[-n_terms:]
        top_terms = [labels[column] for column in top_columns]
        print(','.join(top_terms))

## Execution - Regulated

### TF-IDF Vectorizer - Regulated

In [None]:
# TF-IDF vectorizer for the regulated offers. With an integer min_df a term
# must occur in at least that many documents; with a float max_df, terms
# occurring in more than that share of documents are dropped. The stop-word
# list comes from german_stopwords_full().
tfidf = TfidfVectorizer(
    min_df = 500,
    max_df = 0.95,
    max_features = 3000,
    stop_words=german_stopwords_full()
)

# Learn the vocabulary from the lemmatized regulated texts and build the
# document-term matrix.
text_regulated = tfidf.fit_transform(df_regulated.lemmatized_text)

# Elbow plot for k = 2, 4, ..., 30.
find_optimal_clusters_regulated(text_regulated, 30)

### PCA and TSNE  - Regulated

In [None]:
# Fit a 10-cluster model on the regulated TF-IDF matrix.
clusters_regulated = MiniBatchKMeans(n_clusters=10, n_init='auto', random_state=0).fit(text_regulated)

# Cluster label for every regulated document.
test_regulated = clusters_regulated.predict(text_regulated)

# Visualise a sample of the clustering and print the top 10 terms per cluster.
plot_tsne_pca_regulated(text_regulated, test_regulated)
get_top_keywords(text_regulated, test_regulated, tfidf.get_feature_names_out(), 10)

## Execution - Unregulated

### TF-IDF Vectorizer - Unregulated

In [None]:
# TF-IDF vectorizer for the unregulated offers. With an integer min_df a term
# must occur in at least that many documents; with a float max_df, terms
# occurring in more than that share of documents are dropped. The stop-word
# list comes from german_stopwords_full().
tfidf_unregulated = TfidfVectorizer(
    min_df = 500,
    max_df = 0.95,
    max_features = 201,
    stop_words=german_stopwords_full()
)

# Learn the vocabulary from the lemmatized unregulated texts and build the
# document-term matrix.
text_unregulated = tfidf_unregulated.fit_transform(df_unregulated.lemmatized_text)

# Elbow plot for k = 2, 4, ..., 30. Use the unregulated helper (init_size=1024,
# batch_size=2048): it was defined for this dataset, but the regulated variant
# was being called here.
find_optimal_clusters_unregulated(text_unregulated, 30)

### PCA and TSNE - Unregulated

In [None]:
# Fit an 8-cluster model on the unregulated TF-IDF matrix and get the cluster
# label of every document in one step.
clusters_unregulated = MiniBatchKMeans(n_clusters=8, n_init='auto', random_state=0).fit_predict(text_unregulated)

# Visualise a sample of the clustering.
plot_tsne_pca_unregulated(text_unregulated, clusters_unregulated)

# Print the top 10 terms per cluster.
get_top_keywords(text_unregulated, clusters_unregulated, tfidf_unregulated.get_feature_names_out(), 10)

# Using Distancing 

### Generating MiniBatchKMeans

In [None]:
# Regulated offers: fit 10 clusters and keep both the fitted model (for its
# cluster centers) and the per-document labels.
reg_clusters = MiniBatchKMeans(n_clusters=10, n_init='auto', random_state=0).fit(text_regulated)
reg_clusters_words = reg_clusters.predict(text_regulated)

# Unregulated offers: same procedure with 8 clusters.
unreg_clusters = MiniBatchKMeans(n_clusters=8,n_init='auto', random_state=0).fit(text_unregulated)
unreg_clusters_words = unreg_clusters.predict(text_unregulated)

### Acquiring Cluster Centers

In [None]:
# Centroids of the fitted models: one row per cluster, one column per TF-IDF
# feature of the matrix the model was fitted on.
reg_cluster_center = reg_clusters.cluster_centers_
unreg_cluster_center = unreg_clusters.cluster_centers_

### Calculating Euclidean Distance

In [None]:
# For every unregulated cluster, find the regulated cluster whose centroid is
# closest. The two models were fitted on matrices produced by separately
# fitted vectorizers (tfidf / tfidf_unregulated), so column k of one centroid
# matrix is generally not the same term as column k of the other, and the
# number of columns can differ. The centroids are therefore first placed in a
# shared space with one column per term known to either vectorizer; terms a
# vectorizer does not know get weight 0.
# NOTE(review): the IDF weights still come from two separate fits, so the
# distances are only approximately comparable — confirm this is acceptable.
def _align_centers(centers, terms, column_of):
    """Embed ``centers`` (one column per entry of ``terms``) into the shared term space."""
    aligned = np.zeros((centers.shape[0], len(column_of)))
    aligned[:, [column_of[term] for term in terms]] = centers
    return aligned


reg_terms = tfidf.get_feature_names_out()
unreg_terms = tfidf_unregulated.get_feature_names_out()
shared_column = {
    term: column
    for column, term in enumerate(sorted(set(reg_terms) | set(unreg_terms)))
}

reg_centers_aligned = _align_centers(reg_cluster_center, reg_terms, shared_column)
unreg_centers_aligned = _align_centers(unreg_cluster_center, unreg_terms, shared_column)

# One (unregulated cluster, nearest regulated cluster, distance) tuple per
# unregulated cluster.
final_stats = []
for i, unreg_center in enumerate(unreg_centers_aligned):
    cluster_to_cluster_centroid_distance = [
        (i, j, euclidean(unreg_center, reg_center))
        for j, reg_center in enumerate(reg_centers_aligned)
    ]
    final_stats.append(min(cluster_to_cluster_centroid_distance, key=lambda entry: entry[2]))
final_stats

### Getting Top Keywords

In [None]:
# Print the top 10 terms of every regulated cluster, then of every
# unregulated cluster, each with the feature names of its own vectorizer.
get_top_keywords(text_regulated, reg_clusters_words, tfidf.get_feature_names_out(), 10)
get_top_keywords(text_unregulated, unreg_clusters_words, tfidf_unregulated.get_feature_names_out(), 10)

### PCA - TSNE Plotting (Regulated and Unregulated)

In [None]:
# Largest cluster label of each dataset (labels start at 0).
max_label_reg = max(reg_clusters_words)
max_label_unreg = max(unreg_clusters_words)

# Draw a random sample of documents from each dataset. The sizes are capped at
# the number of available rows because np.random.choice(..., replace=False)
# raises ValueError when asked for more items than exist.
n_sample_reg = min(10000, text_regulated.shape[0])
n_sample_unreg = min(1000, text_unregulated.shape[0])
max_itemsReg = np.random.choice(range(text_regulated.shape[0]), size=n_sample_reg, replace=False)
max_itemsUnreg = np.random.choice(range(text_unregulated.shape[0]), size=n_sample_unreg, replace=False)

data_arrayReg = np.asarray(text_regulated[max_itemsReg, :].todense())
data_arrayUnreg = np.asarray(text_unregulated[max_itemsUnreg, :].todense())

# Two-dimensional embeddings of the sampled documents; row k of every
# embedding belongs to the document at max_items*[k].
Reg_pca = PCA(n_components=2).fit_transform(data_arrayReg)
Unreg_pca = PCA(n_components=2).fit_transform(data_arrayUnreg)
Reg_tsne = TSNE(n_components=2, perplexity=50).fit_transform(data_arrayReg)
Unreg_tsne = TSNE(n_components=2, perplexity=50).fit_transform(data_arrayUnreg)

# One colour per sampled document, in the same row order as the embeddings.
# The index arrays are the identity on purpose: all sampled points are
# plotted, and a random permutation here would attach each colour to the
# wrong point. Dividing by the number of clusters (max label + 1) matches the
# cm.hsv(i/num_clusters) mapping used for the legend entries and avoids a
# division by zero when there is a single cluster.
idx_reg = np.arange(Reg_pca.shape[0])
label_subset_reg = reg_clusters_words[max_itemsReg]
label_subset_reg = [cm.hsv(i/(max_label_reg + 1)) for i in label_subset_reg[idx_reg]]

idx_unreg = np.arange(Unreg_pca.shape[0])
label_subset_unreg = unreg_clusters_words[max_itemsUnreg]
label_subset_unreg = [cm.hsv(i/(max_label_unreg + 1)) for i in label_subset_unreg[idx_unreg]]


In [None]:
# Plotting
plt.figure(figsize=(16, 8))

# Function to generate legend items for a selection of clusters
def generate_legend_items(num_clusters):
    """Return one round legend marker per cluster, coloured cm.hsv(i/num_clusters)."""
    return [plt.Line2D([0], [0], marker='o', color='w', markerfacecolor=cm.hsv(i/num_clusters), markersize=10, label=f'Cluster {i}') for i in range(num_clusters)]


def _plot_embedding(position, embedding, point_labels, num_clusters, title, xlabel, ylabel):
    """Draw one panel of the 2x2 grid: coloured points, cluster centroids and legend.

    ``embedding`` holds one 2-D point per row and ``point_labels`` the cluster
    label of the same row.
    """
    plt.subplot(2, 2, position)

    # Same colour mapping as generate_legend_items so points and legend agree.
    point_colors = [cm.hsv(label/num_clusters) for label in point_labels]
    plt.scatter(embedding[:, 0], embedding[:, 1], c=point_colors, alpha=0.5)

    # Mark each cluster at the mean of its points *in this embedding*. The
    # model's cluster_centers_ live in TF-IDF feature space, so their first
    # two columns are not coordinates on these axes.
    centroids = np.array([
        embedding[point_labels == cluster].mean(axis=0)
        for cluster in np.unique(point_labels)
    ])
    plt.scatter(centroids[:, 0], centroids[:, 1], marker='x', color='red', s=100)

    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.legend(handles=generate_legend_items(num_clusters))


# Cluster labels of the sampled documents, in the row order of the embeddings
# (the embeddings were computed from the rows selected by max_items*).
reg_point_labels = reg_clusters_words[max_itemsReg]
unreg_point_labels = unreg_clusters_words[max_itemsUnreg]

# Each dataset gets a legend sized to its own number of clusters.
num_clusters_reg = max_label_reg + 1
num_clusters_unreg = max_label_unreg + 1

_plot_embedding(1, Reg_pca, reg_point_labels, num_clusters_reg,
                'PCA for Regulated CVET', 'Principal Component 1', 'Principal Component 2')
_plot_embedding(2, Reg_tsne, reg_point_labels, num_clusters_reg,
                't-SNE for Regulated CVET', 't-SNE Dimension 1', 't-SNE Dimension 2')
_plot_embedding(3, Unreg_pca, unreg_point_labels, num_clusters_unreg,
                'PCA for Unegulated CVET', 'Principal Component 1', 'Principal Component 2')
_plot_embedding(4, Unreg_tsne, unreg_point_labels, num_clusters_unreg,
                't-SNE for Unegulated CVET', 't-SNE Dimension 1', 't-SNE Dimension 2')

plt.tight_layout()
plt.show()