In [35]:
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import KMeans
from sklearn.decomposition import NMF, LatentDirichletAllocation

import numpy as np
import pandas as pd

In [14]:
# Load the question metadata table (one row per question; `tags` is a
# space-separated string of tag ids).
# NOTE(review): absolute, machine-specific path — consider a configurable DATA_DIR.
question = pd.read_csv("/home/pocket/input/questions.csv")
# Preview the first two rows.
question.head(2)

Unnamed: 0,question_id,bundle_id,correct_answer,part,tags
0,0,0,0,1,51 131 162 38
1,1,1,1,1,131 36 81


In [113]:
# Work on a copy so the loaded `question` frame stays untouched.
df = question.copy()

# Split each space-separated tag string into a list of tag tokens.
# Missing values are filled with -1 first; `.str.split()` yields NaN for such
# non-string entries, and they are turned back into -1 by the fillna below.
tags = df["tags"].fillna(-1).str.split()
print(tags.shape)

# Build a question x tag indicator matrix:
#   1. expand every list into a wide frame (short lists are NaN-padded),
#   2. replace the padding / missing entries with the sentinel -1,
#   3. stack to one (question, position) -> tag entry and one-hot encode it,
#   4. sum the one-hot rows per question (index level 0).
# `DataFrame.sum(level=0)` was deprecated in pandas 1.3 and removed in 2.0;
# `groupby(level=0).sum()` is the documented equivalent and works on both.
tags = (
    pd.get_dummies(tags.apply(pd.Series).fillna(-1).stack())
    .groupby(level=0)
    .sum()
)
print(tags.shape)

(13523,)
(13523, 189)


In [115]:
# Remove the sentinel column that only counts padding / missing tags.
# Rebinding instead of `inplace=True`, together with errors="ignore", keeps the
# cell re-runnable: a second execution no longer raises KeyError for -1.
tags = tags.drop(columns=[-1], errors="ignore")

In [116]:
# Check the indicator matrix dimensions after the sentinel column was removed.
tags.shape

(13523, 188)

In [117]:
# Factorize the transposed indicator matrix (tags as samples, questions as
# features), so `nmf.components_` holds one 20-dimensional column per question.
# The seed is pinned (as for the LDA model in this notebook) because the
# initialisation can use a randomized SVD and the factors feed exported features.
nmf = NMF(n_components=20, random_state=0).fit(tags.T)

In [118]:
# components_ is (n_components, n_features); because the fit used tags.T,
# the feature axis corresponds to questions.
nmf.components_.shape

(20, 13523)

In [119]:
# Configure (but do not fit yet) a 20-topic LDA model trained with the online
# update method for 5 passes; the seed is fixed for repeatable topics.
lda = LatentDirichletAllocation(
    n_components=20,
    learning_method="online",
    learning_offset=50.0,
    max_iter=5,
    random_state=0,
)

In [120]:
# Fit on the transposed indicator matrix (tags as samples, questions as
# features) — the same orientation used for the NMF fit.
lda.fit(tags.T)

LatentDirichletAllocation(learning_method='online', learning_offset=50.0,
                          max_iter=5, n_components=20, random_state=0)

In [121]:
# components_ is (n_topics, n_questions) here, so argmax over axis 0 gives,
# for every question, the index of the topic with the largest weight.
lda_argmax = lda.components_.argmax(axis=0)

In [122]:
# One 20-cluster KMeans configuration, refit on three representations of the
# questions: the raw tag indicators, the NMF factors and the LDA topic weights
# (components_ is transposed so that rows are questions).
# random_state is pinned so the labels exported later are reproducible;
# n_init is spelled out so the number of restarts does not depend on the
# installed scikit-learn version's default.
kmeans = KMeans(n_clusters=20, n_init=10, random_state=0)
raw_clusters = kmeans.fit_predict(tags)
nmf_clusters = kmeans.fit_predict(nmf.components_.T)
lda_clusters = kmeans.fit_predict(lda.components_.T)

In [123]:
# Only the value of the final expression (lda_clusters) is rendered by the
# notebook; the three preceding bare expressions are evaluated and discarded.
raw_clusters.shape
raw_clusters
nmf_clusters
lda_clusters

array([ 1, 12, 18, ...,  9,  7, 11], dtype=int32)

In [124]:
# Assemble one row per question: its id plus the four tag-based groupings.
# The id Series supplies the frame's index; the label arrays align by position.
# NOTE(review): the id column is exported as "contents_id" here, while the
# other exports in this notebook use "content_id" — confirm what consumers expect.
output = pd.DataFrame(
    dict(
        [
            ("contents_id", df["question_id"]),
            ("raw_clusters", raw_clusters),
            ("nmf_clusters", nmf_clusters),
            ("lda_clusters", lda_clusters),
            ("lda_argmax", lda_argmax),
        ]
    )
)

In [125]:
# Display the assembled cluster table (pandas truncates the rendering).
output

Unnamed: 0,contents_id,raw_clusters,nmf_clusters,lda_clusters,lda_argmax
0,0,14,9,1,3
1,1,14,9,12,3
2,2,14,9,18,4
3,3,14,9,12,3
4,4,14,9,18,4
...,...,...,...,...,...
13518,13518,18,1,6,10
13519,13519,1,11,0,1
13520,13520,9,8,9,16
13521,13521,0,1,7,15


In [127]:
# Persist the cluster assignments to the working directory, without the index.
output.to_csv("./tag_cluster.csv", index=False)

In [2]:
# Load a precomputed content embedding matrix from the working directory
# (presumably word2vec vectors, judging by the file name — TODO confirm).
contents_w2v = np.load("./w2v_content_embedding.npy")

In [3]:
# Drop the first row so the row count equals the number of questions.
# Presumably row 0 is a padding / placeholder vector — TODO confirm.
# NOTE(review): not idempotent — re-running this cell drops one more row each time.
contents_w2v = contents_w2v[1:]
# Check the resulting dimensions.
contents_w2v.shape

(13523, 512)

In [6]:
# Reduce the content embeddings to 20 components.
# TruncatedSVD's default solver is randomized, so the seed is fixed to make the
# exported component values reproducible between runs.
svd = TruncatedSVD(n_components=20, random_state=0)
content_vecs = svd.fit_transform(contents_w2v)

In [8]:
# Wrap the reduced array in a DataFrame (default integer column labels).
content_vecs = pd.DataFrame(content_vecs)

In [9]:
# Preview the first rows of the reduced vectors.
content_vecs.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,3.161624,1.691552,2.205576,2.947486,-1.622994,-3.254926,2.167415,-3.629961,1.748464,0.509243,1.749342,0.932389,-1.625107,1.185443,2.061522,0.354314,3.440267,2.50375,-1.86216,0.427862
1,3.629113,2.045199,3.00547,2.878517,-1.094892,-3.275723,2.391864,-3.412076,1.372243,0.357086,0.842056,-0.447781,-2.061819,0.526491,1.626541,0.887823,3.848322,2.125844,-1.949963,-0.855733
2,3.958047,2.248505,-3.999284,6.128333,2.08588,-3.870665,6.805003,-2.929393,2.164355,2.322942,0.448248,1.93765,-0.786695,0.384824,1.688471,3.599606,0.094661,-2.696258,-1.061617,-0.006865
3,4.143218,1.069464,-0.950913,4.977015,1.725434,-2.830408,6.925851,-2.593976,2.22367,2.007153,-0.102928,0.59309,-0.475249,-0.148284,1.271681,1.336333,0.255513,-2.420286,-0.255309,-1.507089
4,4.780718,-0.59724,0.70593,3.13224,1.51418,-1.385974,4.085215,-0.771896,1.114255,0.468459,0.677376,-0.300456,1.272041,-0.049898,1.045809,0.435972,0.360007,-1.665184,-0.205277,-1.020311


In [10]:
# Turn the positional index into the leading column and label everything:
# the former index becomes "content_id", followed by the 20 SVD components.
# This assumes row position equals the content id — TODO confirm.
content_vecs.reset_index(inplace=True)
content_vecs.columns = ["content_id", *(f"w2v_svd{i}" for i in range(20))]

In [11]:
# Verify the renamed columns on the first two rows.
content_vecs.head(2)

Unnamed: 0,content_id,w2v_svd0,w2v_svd1,w2v_svd2,w2v_svd3,w2v_svd4,w2v_svd5,w2v_svd6,w2v_svd7,w2v_svd8,...,w2v_svd10,w2v_svd11,w2v_svd12,w2v_svd13,w2v_svd14,w2v_svd15,w2v_svd16,w2v_svd17,w2v_svd18,w2v_svd19
0,0,3.161624,1.691552,2.205576,2.947486,-1.622994,-3.254926,2.167415,-3.629961,1.748464,...,1.749342,0.932389,-1.625107,1.185443,2.061522,0.354314,3.440267,2.50375,-1.86216,0.427862
1,1,3.629113,2.045199,3.00547,2.878517,-1.094892,-3.275723,2.391864,-3.412076,1.372243,...,0.842056,-0.447781,-2.061819,0.526491,1.626541,0.887823,3.848322,2.125844,-1.949963,-0.855733


In [12]:
# Export the reduced vectors with their content_id column, without the index.
content_vecs.to_csv("./content_w2v.csv", index=False)

In [32]:
# Load a second precomputed content embedding matrix from the working directory.
embed = np.load("./content_embedding.npy")

In [33]:
# Inspect the matrix dimensions (rows x embedding width).
embed.shape

(13524, 128)

In [34]:
# Look at the first embedding dimension across all rows.
embed[:, 0]

array([ 0.00348971,  0.01093413,  0.01815927, ..., -0.02049293,
       -0.06177155,  0.09914625], dtype=float32)

In [35]:
# Same dimension with the first row skipped — a preview of the slice applied next.
embed[1:, 0]

array([ 0.01093413,  0.01815927,  0.02203978, ..., -0.02049293,
       -0.06177155,  0.09914625], dtype=float32)

In [36]:
# Drop the first row (presumably a padding / placeholder vector — TODO confirm).
# NOTE(review): not idempotent — re-running this cell drops one more row each time.
embed = embed[1:]

In [37]:
# Reduce the 128-wide embeddings to 20 components.
# TruncatedSVD's default solver is randomized, so the seed is fixed to make the
# exported component values (and the clusters built on them) reproducible.
svd = TruncatedSVD(n_components=20, random_state=0)
content_vecs = svd.fit_transform(embed)

In [38]:
# Confirm one 20-component row per remaining embedding row.
content_vecs.shape

(13523, 20)

In [39]:
# Display the reduced array (numpy abbreviates the rendering).
content_vecs

array([[ 0.29857266,  0.0828594 ,  0.10337925, ..., -0.05289454,
        -0.04786237,  0.01002183],
       [ 0.29416677,  0.11227994, -0.04767716, ...,  0.05001845,
        -0.05255706,  0.02543679],
       [-0.22071125,  0.04720166,  0.08526865, ...,  0.00883699,
         0.01770988,  0.03280997],
       ...,
       [-0.00494652, -0.05653199, -0.04470852, ..., -0.02753389,
         0.03068023,  0.01990469],
       [ 0.25516766, -0.11038473,  0.04874474, ..., -0.04982777,
         0.06112582, -0.05504936],
       [ 0.43774703,  0.11116546, -0.13141455, ..., -0.09800366,
        -0.04020726,  0.04884924]], dtype=float32)

In [41]:
# Wrap the reduced array in a DataFrame (default integer column labels).
content_vecs = pd.DataFrame(content_vecs)

In [42]:
# Preview the first two rows of the reduced vectors.
content_vecs.head(2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,0.298573,0.082859,0.103379,0.019431,-0.108116,0.069238,-0.122686,0.03402,0.106384,0.013467,0.049322,0.044373,-0.019987,-0.051152,0.002223,-0.0274,-0.045768,-0.052895,-0.047862,0.010022
1,0.294167,0.11228,-0.047677,0.03068,-0.053393,-0.071304,-0.158679,-0.008623,0.041525,0.01641,-0.00572,-0.040753,-0.010178,0.045152,0.017016,0.018646,-0.058797,0.050018,-0.052557,0.025437


In [43]:
# Move the positional index into a leading column (renamed in the next cell).
# This assumes row position equals the content id — TODO confirm.
# NOTE(review): in-place and not idempotent — a second run adds another column.
content_vecs.reset_index(inplace=True)

In [44]:
# Label the former index as "content_id" and the 20 components as nn_svd0..nn_svd19.
content_vecs.columns = ["content_id", *(f"nn_svd{i}" for i in range(20))]

In [45]:
# Verify the renamed columns on the first two rows.
content_vecs.head(2)

Unnamed: 0,content_id,nn_svd0,nn_svd1,nn_svd2,nn_svd3,nn_svd4,nn_svd5,nn_svd6,nn_svd7,nn_svd8,...,nn_svd10,nn_svd11,nn_svd12,nn_svd13,nn_svd14,nn_svd15,nn_svd16,nn_svd17,nn_svd18,nn_svd19
0,0,0.298573,0.082859,0.103379,0.019431,-0.108116,0.069238,-0.122686,0.03402,0.106384,...,0.049322,0.044373,-0.019987,-0.051152,0.002223,-0.0274,-0.045768,-0.052895,-0.047862,0.010022
1,1,0.294167,0.11228,-0.047677,0.03068,-0.053393,-0.071304,-0.158679,-0.008623,0.041525,...,-0.00572,-0.040753,-0.010178,0.045152,0.017016,0.018646,-0.058797,0.050018,-0.052557,0.025437


In [46]:
# Cluster the questions into 35 groups using ONLY the SVD components.
# At this point `content_vecs` also carries the `content_id` column (and, on a
# re-run, `nn_cluster`). The ids span thousands while the SVD values are of
# magnitude ~0.1, so feeding the whole frame to KMeans lets the id dominate the
# Euclidean distances and produces clusters that merely follow id ranges.
svd_feature_cols = [
    col for col in content_vecs.columns if str(col).startswith("nn_svd")
]
# Seed and restart count are pinned so the exported labels are reproducible.
kmeans = KMeans(n_clusters=35, n_init=10, random_state=0)
content_clusters = kmeans.fit_predict(content_vecs[svd_feature_cols])

In [47]:
# Confirm there is one cluster label per row.
content_clusters.shape

(13523,)

In [48]:
# Attach the cluster label of each row as a new column.
content_vecs["nn_cluster"] = content_clusters

In [49]:
# Export ids, SVD components and cluster labels, without the index.
content_vecs.to_csv("./content_nn.csv", index=False)