In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
import random
import time
%matplotlib inline
import matplotlib.animation as animation
import csv
# The bundled seaborn styles were renamed to 'seaborn-v0_8*' in matplotlib 3.6 and the
# old 'seaborn' alias was removed in 3.8; pick whichever name this installation provides.
_seaborn_style = 'seaborn-v0_8' if 'seaborn-v0_8' in matplotlib.style.available else 'seaborn'
matplotlib.style.use(_seaborn_style)

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn import manifold
from sklearn.cluster import AgglomerativeClustering
from scipy.cluster import hierarchy
from sklearn.cluster import DBSCAN
from sklearn.metrics.pairwise import pairwise_distances
# from wordcloud import WordCloud

from mpl_toolkits.mplot3d import Axes3D

In [2]:
# Read the per-author metadata table from the working directory and preview its first rows.
author_data = pd.read_csv(filepath_or_buffer="author_data.csv")
author_data.head(n=5)

Unnamed: 0,id,gender,age,topic,sign,grupo_edad
0,2059027,male,15,Student,Leo,A
1,3581210,male,33,InvestmentBanking,Aquarius,C
2,3539003,female,14,indUnk,Aries,A
3,4172416,female,25,indUnk,Capricorn,B
4,3668238,female,17,Student,Gemini,A


# Clustering

In [3]:
# One-hot encode the categorical author columns as float64 indicator frames.
topics_enc = pd.get_dummies(author_data["topic"]).astype(np.float64)
# gender_enc = pd.get_dummies(author_data["gender"]).astype(np.float64)  # gender encoding is currently disabled
# "grupo_edad" holds the age-group code (A/B/C in the preview of author_data).
age_gr_enc = pd.get_dummies(author_data["grupo_edad"]).astype(np.float64)

In [4]:
# Feature matrix: age-group indicators followed by topic indicators, aligned on the row index.
encoded_authors = age_gr_enc.join(other=topics_enc, how="left")
# encoded_authors = gender_enc.join(encoded_authors)  # disabled together with gender_enc
encoded_authors.head(n=5)

Unnamed: 0,A,B,C,Accounting,Advertising,Agriculture,Architecture,Arts,Automotive,Banking,...,RealEstate,Religion,Science,Sports-Recreation,Student,Technology,Telecommunications,Tourism,Transportation,indUnk
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [5]:
# auth_ward_full=hierarchy.linkage(encoded_authors, 'ward')

In [6]:
# plt.figure(figsize=(15,35))
# dn_tf_full = hierarchy.dendrogram(auth_ward_full,orientation="left")
# plt.title('Ward Link Dendrogram')
# plt.show()

In [None]:
# 3-D spectral embedding of the one-hot author matrix; the cells below cluster and plot it.
# random_state is pinned so that re-running the notebook reproduces the same coordinates
# (and therefore the same dendrogram, cluster labels and figures); without it the
# estimator draws its own random seed on every run.
spect_emb_3 = manifold.SpectralEmbedding(
    n_components=3,
    random_state=42,
    n_jobs=-1,
).fit_transform(encoded_authors)

In [None]:
# Ward-linkage hierarchy computed on the 3-D embedding coordinates.
auth_ward_3 = hierarchy.linkage(spect_emb_3, method='ward')

In [None]:
# Draw the Ward hierarchy as a left-oriented dendrogram on a tall figure.
dendro_fig, dendro_ax = plt.subplots(figsize=(15, 35))
dn_tf_3 = hierarchy.dendrogram(auth_ward_3, orientation="left", ax=dendro_ax)
dendro_ax.set_title('Ward Link Dendrogram from 3d spectral embedding')
plt.show()

In [None]:
# Partition the embedded points into five clusters with Ward-linkage agglomerative clustering.
clustering_3d_ward = AgglomerativeClustering(n_clusters=5, linkage='ward')
clustering_3d_ward.fit(X=spect_emb_3)

In [None]:
# Number of rows assigned to each cluster label.
pd.crosstab(clustering_3d_ward.labels_, "count")

In [None]:
# Rescale the embedding coordinates by a constant factor before plotting.
# NOTE(review): this rebinds spect_emb_3 to its own scaled value, so every re-run of
# this cell multiplies by another 10000 -- re-run the embedding cell first.
spect_emb_3 = 10000 * spect_emb_3

In [None]:
# Pairwise 2-D projections of the 3-D spectral embedding, coloured by Ward cluster label.
# The three panels share one title, so each gets axis labels naming the embedding
# columns it shows; otherwise the projections cannot be told apart.
par_alf = 0.1  # marker transparency shared by all panels

# (x column, y column) of spect_emb_3 drawn in each panel, top to bottom.
projection_dims = [(0, 1), (2, 1), (0, 2)]

proj_fig, proj_axes = plt.subplots(nrows=3, ncols=1, figsize=(10, 30))
for panel, (x_dim, y_dim) in zip(proj_axes, projection_dims):
    panel.scatter(
        spect_emb_3[:, x_dim],
        spect_emb_3[:, y_dim],
        alpha=par_alf,
        s=45,
        c=clustering_3d_ward.labels_,
        cmap=plt.cm.Set1,
    )
    panel.set_title("2d spectral coordinates from 3d spectral hierarchical (ward) clustering")
    panel.set_xlabel("spectral coordinate {}".format(x_dim))
    panel.set_ylabel("spectral coordinate {}".format(y_dim))
plt.show()

In [None]:
# Rotating 3-D view of the clustered embedding. This cell only builds the animation
# object; nothing is written to disk here.
# https://stackoverflow.com/questions/51457738/animating-a-3d-scatterplot-with-matplotlib-to-gif-ends-up-empty
fig = plt.figure(figsize=(15, 15))
ax = fig.add_subplot(111, projection='3d')
ax.scatter(
    spect_emb_3[:, 0],
    spect_emb_3[:, 1],
    spect_emb_3[:, 2],
    c=clustering_3d_ward.labels_,
    alpha=0.35,
    cmap=plt.cm.Set1,
)
ax.set_title("Ward's hierarchical clustering from spectral embedding")
ax.set_axis_off()  # remove axes for visual appeal


def rotate(angle):
    """Set the azimuth of the module-level 3-D axes `ax` to `angle`."""
    ax.view_init(azim=angle)


print("Making animation")
rot_animation = animation.FuncAnimation(
    fig,
    rotate,
    frames=np.arange(0, 362, 2),
    interval=100,
)


In [None]:
# Render the rotating 3-D cluster view and export it as an MP4 movie and an animated GIF.
fig = plt.figure(figsize=(15,15))
ax = fig.add_subplot(111, projection='3d')
ax.scatter(xs=spect_emb_3[:,0],
           ys=spect_emb_3[:,1],
           zs=spect_emb_3[:,2],
           c=clustering_3d_ward.labels_,
           alpha=0.35,
           cmap=plt.cm.Set1)
plt.title("Ward's hierarchical clustering from spectral embedding" )
# plt.axis('off') # remove axes for visual appeal


def rotate(angle):
    """Set the azimuth of the module-level 3-D axes `ax` to `angle`."""
    ax.view_init(azim=angle)


print("Making animation")
rot_animation = animation.FuncAnimation(fig, rotate, frames=np.arange(0, 362, 2), interval=100)

# The MP4 export needs the external ffmpeg binary. Looking the writer up unconditionally
# raises when ffmpeg is missing and aborts the whole cell, so check availability first
# and still produce the GIF in that case.
if animation.writers.is_available('ffmpeg'):
    # Set up formatting for the movie files
    Writer = animation.writers['ffmpeg']
    writer = Writer(fps=15, metadata=dict(artist='Me'), bitrate=1800)
    print("Saving Movie File")
    rot_animation.save('cluster_anim.mp4', writer=writer)
else:
    print("ffmpeg writer not available; skipping 'cluster_anim.mp4'")

# Keep using ImageMagick for the GIF when it is installed; otherwise fall back to
# matplotlib's built-in Pillow writer instead of failing.
gif_writer = 'imagemagick' if animation.writers.is_available('imagemagick') else 'pillow'
print("Saving animated GIF File")
rot_animation.save('whljgr_clustering.gif', dpi=80, writer=gif_writer)