In [6]:
import pandas as pd
import numpy as np
import os
from google.colab import drive
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.decomposition import TruncatedSVD,PCA

In [7]:
# Mount Google Drive inside the Colab runtime, then switch the working
# directory to the project folder so that the relative paths used in later
# cells (the .mtx input, the saved figures) resolve against it.
drive.mount('/content/drive/')
os.chdir('/content/drive/MyDrive/')
%cd ./EECS545_fina_project/

Mounted at /content/drive/
/content/drive/.shortcut-targets-by-id/1hUsUSZXUcZhVIpSNrQgfnI4K-ViXmA90/EECS545_fina_project


In [9]:
# Read the saved feature matrix from a Matrix Market (.mtx) file and convert
# it to a dense array for the embedding step below.
# NOTE(review): the file and variable names suggest TF-IDF vectors for 5000
# body texts — confirm against the notebook that produced the file.
from scipy import io
newm = io.mmread("vectorized_5000_X.mtx")
tfidf_matrix=newm.toarray()

In [96]:
def word_embed(tfidf_matrix,method='tsne'):
  """Reduce a feature matrix to a lower-dimensional embedding.

  Parameters
  ----------
  tfidf_matrix : array-like of shape (n_samples, n_features)
      Matrix handed to the reducer's ``fit_transform``.
  method : {'tsne', 'umap', 'svd', 'pca'}, default 'tsne'
      Which reducer to use:
      * 'tsne' - 2-component t-SNE (perplexity 100).
      * 'umap' - 2-component UMAP (15 neighbours).
      * 'svd'  - TruncatedSVD with 100 components.
      * 'pca'  - PCA with ``n_components=0.95``.

  Returns
  -------
  The array returned by the selected reducer's ``fit_transform``.

  Raises
  ------
  ValueError
      If ``method`` is not one of the supported names. (Previously this fell
      through to ``return final_em`` and failed with UnboundLocalError.)
  """
  supported_methods = ('tsne', 'umap', 'svd', 'pca')
  # Validate first so a typo fails fast with a clear message.
  if method not in supported_methods:
    raise ValueError(
        "Unknown embedding method %r; expected one of %s"
        % (method, ', '.join(supported_methods)))

  if method=='tsne':
    from sklearn.manifold import TSNE
    reducer = TSNE(n_components=2, verbose=1, perplexity=100, random_state=42)
  elif method=='umap':
    try:
      import umap.umap_ as umap
    except ImportError:
      # Install on demand (the original always shelled out to pip here).
      # Using the kernel's own interpreter keeps the package in the right
      # environment and keeps this function plain Python.
      import importlib
      import subprocess
      import sys
      subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'umap-learn'])
      importlib.invalidate_caches()
      import umap.umap_ as umap
    reducer = umap.UMAP(n_neighbors=15,n_components=2,random_state=42)
  elif method=='svd':
    # Seeded like the other reducers so repeated runs give the same embedding.
    reducer = TruncatedSVD(n_components=100, random_state=42)
  else:  # method=='pca'
    reducer = PCA(n_components=0.95, random_state=42)

  return reducer.fit_transform(tfidf_matrix)

def draw_t_SNE_plot(df,cluster_column,title="HDBSCAN TSNE SVD",save_path='DBSCAN_HDBSCAN_all_labeled_visulization/TSNE_HDBSCAN_SVD.png'):
  """Scatter-plot a 2-D embedding coloured by cluster label and save it.

  Rows whose label is -1 (noise) are drawn with seaborn's default hue
  mapping; all other rows are drawn on the same axes with a "Spectral"
  palette containing one colour per distinct label.

  Parameters
  ----------
  df : pandas.DataFrame
      Must contain the columns 'x', 'y' and ``cluster_column``.
  cluster_column : str
      Name of the column holding the cluster labels; -1 marks noise.
  title : str, default "HDBSCAN TSNE SVD"
      Figure title. The default is the previously hard-coded title.
  save_path : str or None, default is the previously hard-coded path
      Where to write the figure (300 dpi). Pass a different path per call
      to avoid overwriting earlier plots, or None to skip saving.

  Returns
  -------
  None
  """
  import os
  import matplotlib.pyplot as plt
  import seaborn as sns
  sns.set(rc={'figure.figsize':(15,15)})
  noise=df[df[cluster_column]==-1]
  not_noise=df[df[cluster_column]!=-1]

  ax = plt.gca()
  # x/y are passed by keyword (via data=) because seaborn >= 0.12 no longer
  # accepts them positionally. Empty subsets are skipped.
  if not noise.empty:
    sns.scatterplot(data=noise, x='x', y='y', hue=cluster_column, legend=False, ax=ax)
  if not not_noise.empty:
    not_noise_palette = sns.color_palette("Spectral", len(not_noise[cluster_column].unique()))
    sns.scatterplot(data=not_noise, x='x', y='y', hue=cluster_column, legend=False,
                    palette=not_noise_palette, ax=ax)
  ax.set_title(title)

  if save_path is not None:
    out_dir = os.path.dirname(save_path)
    if out_dir:
      os.makedirs(out_dir, exist_ok=True)
    # The original passed `depi=300`, a typo, so the resolution was never applied.
    ax.figure.savefig(save_path, dpi=300)

In [86]:
# Embed the feature matrix with 2-component t-SNE; the clustering cells below
# all operate on this result.
word_embed_result=word_embed(tfidf_matrix,'tsne')


In [None]:
 # Quick look at the raw embedding before any clustering is applied.
 import seaborn as sns
 sns.set(rc={'figure.figsize':(15,15)})
 # x/y are passed by keyword: seaborn >= 0.12 rejects positional x/y.
 sns.scatterplot(x=word_embed_result[:,0], y=word_embed_result[:,1])

In [92]:
#best parameters result
!pip install hdbscan
import hdbscan
from sklearn import metrics
import matplotlib.pyplot as plt
from sklearn.cluster import DBSCAN
import matplotlib.pyplot as plt
from sklearn import metrics
DBS = DBSCAN(eps = 0.5,min_samples=10).fit(word_embed_result)
labels = DBS.labels_
hdbscan_labels = hdbscan.HDBSCAN(
        min_samples=10,min_cluster_size=8
    ).fit_predict(word_embed_result)




all_result_df=pd.DataFrame(np.concatenate((word_embed_result_temp,labels[:,np.newaxis],hdbscan_labels[:,np.newaxis]),axis=1) ,columns = ['x','y','labels','hdbscan_labels'])



In [None]:
# Plot the embedding coloured by the DBSCAN labels ('labels' column).
# NOTE(review): draw_t_SNE_plot is called with its default title and output
# path here, so a later call with the same defaults overwrites this file.
draw_t_SNE_plot(all_result_df,'labels')

In [None]:
# Plot the embedding coloured by the HDBSCAN labels ('hdbscan_labels' column).
# NOTE(review): draw_t_SNE_plot is called with its default output path here,
# so this writes to the same file as any other call using the defaults.
draw_t_SNE_plot(all_result_df,'hdbscan_labels')

In [None]:
#tune hyperparameters DBSCAN
# Grid search over (eps, min_samples). For each pair, fit DBSCAN on the
# embedding, drop the noise points (label -1) and score the remaining
# clustering with the silhouette coefficient. `sil_list` gets one entry per
# pair, in loop order; -1 is recorded when the score cannot be computed.
from sklearn.cluster import DBSCAN
import matplotlib.pyplot as plt
from sklearn import metrics
plt.style.use('ggplot')
eps_list = [i/20 for i in range(1, 80, 1)]
min_sample_size_list=[i for i in range(2, 30, 1)]

#eps_list=[0.1,0.2,0.3,0.5,0.6,0.7,0.8,0.9,0.99,1,1.6,3,5,10]
# leaf_size_list=[10,30,50,100,1000]
DBSCAN_model=[]
sil_list=[]

# dissimilarity would not be defined for a single cluster, thus, minimum number of clusters should be 2
for eps in eps_list:
  for min_sample_size in min_sample_size_list:
    DBS = DBSCAN(eps = eps,min_samples=min_sample_size).fit(word_embed_result)
    DBSCAN_model.append(DBS)
    labels = DBS.labels_
    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
    n_noise_ = list(labels).count(-1)
    print(eps,n_clusters_,n_noise_)
    index=np.where(labels!=-1)
    labels_new=labels[index]
    word_embed_new=word_embed_result[index]
    try:
      # Compute the score once; the original evaluated it twice per grid point.
      sil_score = metrics.silhouette_score(word_embed_new, labels_new)
    except ValueError:
      # silhouette_score rejects inputs with fewer than 2 clusters or no
      # samples. Only that case is caught, so interrupts and real bugs
      # still surface.
      print("cluster failed")
      sil_list.append(-1)
    else:
      print("Silhouette Coefficient: %0.3f" % sil_score)
      sil_list.append(sil_score)
# NOTE: sil_list has len(eps_list) * len(min_sample_size_list) entries, so it
# must be reshaped or sliced before it can be plotted against eps_list.
# plt.figure(figsize=(6,6))
# plt.plot(eps_list, sil_list, '-o')
# plt.title('Body text')
# plt.xlabel(r'min_sample_size')
# plt.ylabel('Sihouette Value')
# plt.savefig('DBSCAN_sihouettte_min_sample_size.png',dpi=300)

In [None]:
##tune hdbscan hyperparameters
!pip install hdbscan
import hdbscan
from sklearn import metrics
import matplotlib.pyplot as plt
plt.style.use('ggplot')
#test h_db_scan
min_cluster_size_list=[i for i in range(3,50)]
min_sample_size_list=[i for i in range(2,30)]
label_list=[]
sil_list=[]
for min_cluster_size in min_cluster_size_list:
  for min_sample in min_sample_size_list:
      hdbscan_labels = hdbscan.HDBSCAN(
            algorithm='best', min_cluster_size=min_cluster_size,min_samples=min_sample
        ).fit_predict(word_embed_result)
      n_clusters_ = len(set(hdbscan_labels)) - (1 if -1 in hdbscan_labels else 0)
      n_noise_ = list(hdbscan_labels).count(-1)
      print(min_cluster_size,n_clusters_,n_noise_)
      index=np.where(hdbscan_labels!=-1)
      labels_new=hdbscan_labels[index]
      word_embed_new=word_embed_result[index]
      try:
        print("Silhouette Coefficient: %0.3f"% metrics.silhouette_score(word_embed_new, labels_new))
        sil_list.append(metrics.silhouette_score(word_embed_new, labels_new))
      except:
        print("cluster failed")
        sil_list.append(-1)

  
# plt.figure(figsize=(6,6))
# plt.plot(eps_list, sil_list, '-o')
# plt.title('Body text')
# plt.xlabel(r'min_sample_size')
# plt.ylabel('Sihouette Value')
# plt.savefig('HDBSCAN_sihouettte_min_sample_size.png',depi=300)