In [295]:
import numpy as np

# row:문서, column : 단어인 DTM이라고 가정
X = np.array([[1, 1, 0, 0, 0],
              [0, 1, 0, 0, 0],
              [0, 0, 1, 0, 0],
              [0, 0, 0, 1, 0]])

In [296]:
X.ndim, X.shape

(2, (4, 5))

In [297]:
X[0] # 0번째 문서에 대한 단어의 존재 유무 확인 가능

array([1, 1, 0, 0, 0])

In [298]:
X[:,0] 

array([1, 0, 0, 0])

In [299]:
# 각 문서들 간의 내적을 수행한 후 문서의 길이로 정규화(cosine)
X[0], X[1]
# X[0].dot(X[1]) / (norm(X[0])*X[1]) 을 하고 싶으나 인덱스를 안써야 하는 상황

(array([1, 1, 0, 0, 0]), array([0, 1, 0, 0, 0]))

In [300]:
X

array([[1, 1, 0, 0, 0],
       [0, 1, 0, 0, 0],
       [0, 0, 1, 0, 0],
       [0, 0, 0, 1, 0]])

In [301]:
X.T

array([[1, 0, 0, 0],
       [1, 1, 0, 0],
       [0, 0, 1, 0],
       [0, 0, 0, 1],
       [0, 0, 0, 0]])

In [302]:
X[0], X.T[:, 0] # T를 했으므로 column에 대해 0으로 호출되어야 함. 

(array([1, 1, 0, 0, 0]), array([1, 1, 0, 0, 0]))

In [303]:
X.shape, X.T.shape

((4, 5), (5, 4))

In [304]:
_X = X.dot(X.T) # TDM이나 DTM할때는 innerproduct라는 변수에 저장했었음. 

In [305]:
# _X / X row마다 normalize해주고, X.T column마다 normalize해줘야 함. 
np.linalg.norm(X, axis=1) # linear algebra에 있는 norm함수 사용. 열에 대해서 수행하므로 axis=1

array([1.41421356, 1.        , 1.        , 1.        ])

In [306]:
from math import sqrt
sqrt(1+1) # 위의 norm이 잘 되었음을 확인할 수 있음.

1.4142135623730951

In [307]:
print(np.linalg.norm(X, axis=1).shape)
length = np.linalg.norm(X, axis=1)

(4,)


In [308]:
length * length.T

array([2., 1., 1., 1.])

In [309]:
length.shape, length.T.reshape(1,4).shape

((4,), (1, 4))

In [310]:
length.reshape(4,1) * length.T.reshape(1,4) # 0번째가 아닌 문서들은 길이가 1이라 1이 출력됨.

array([[2.        , 1.41421356, 1.41421356, 1.41421356],
       [1.41421356, 1.        , 1.        , 1.        ],
       [1.41421356, 1.        , 1.        , 1.        ],
       [1.41421356, 1.        , 1.        , 1.        ]])

In [311]:
_X / (length.reshape(4,1) * length.T.reshape(1,4)) # cosine simmilarity

array([[1.        , 0.70710678, 0.        , 0.        ],
       [0.70710678, 1.        , 0.        , 0.        ],
       [0.        , 0.        , 1.        , 0.        ],
       [0.        , 0.        , 0.        , 1.        ]])

In [312]:
C = np.array([[1, 0, 1, 0, 0, 0],
              [0, 1, 0, 0, 0, 0],
              [1, 1, 0, 0, 0, 0],
              [1, 0, 0, 1, 1, 0],
              [0, 0, 0, 1, 0, 1]])

In [313]:
import pandas as pd

V = ['ship', 'boat', 'ocean', 'wood', 'tree']
_C = pd.DataFrame(C, index=V)
_C

Unnamed: 0,0,1,2,3,4,5
ship,1,0,1,0,0,0
boat,0,1,0,0,0,0
ocean,1,1,0,0,0,0
wood,1,0,0,1,1,0
tree,0,0,0,1,0,1


In [314]:
U, sigma, V = np.linalg.svd(C) # singular vector decomposition (U, sigma, V 세 개의 행렬로 쪼개져서 나오므로 리턴값은 3개)

In [315]:
U.shape, sigma.shape, V.shape 

((5, 5), (5,), (6, 6))

In [316]:
pd.DataFrame(V) # V.shape가 (6,6)으로 나오는 이유는 np.linalg.svd의 인자에서 full_matrices가 True이기 때문. 

Unnamed: 0,0,1,2,3,4,5
0,0.748623,0.2797116,0.2036288,0.4465631,0.325096,0.121467
1,-0.286454,-0.5284591,-0.1857612,0.6255207,0.21988,0.405641
2,-0.2797116,0.748623,-0.4465631,0.2036288,-0.121467,0.325096
3,-4.162907e-16,1.066959e-15,0.5773503,2.382035e-16,-0.57735,0.57735
4,0.5284591,-0.286454,-0.6255207,-0.1857612,-0.405641,0.21988
5,-3.578044e-17,-9.252252e-18,3.578044e-17,-0.5773503,0.57735,0.57735


In [317]:
U, sigma, V = np.linalg.svd(C, full_matrices=False) 

In [318]:
U.shape, sigma.shape, V.shape # sigma.shape는 중요한 차원을 나열한 것. 

((5, 5), (5,), (5, 6))

In [319]:
pd.DataFrame(V)

Unnamed: 0,0,1,2,3,4,5
0,0.748623,0.2797116,0.203629,0.4465631,0.325096,0.121467
1,-0.286454,-0.5284591,-0.185761,0.6255207,0.21988,0.405641
2,-0.2797116,0.748623,-0.446563,0.2036288,-0.121467,0.325096
3,-4.162907e-16,1.066959e-15,0.57735,2.382035e-16,-0.57735,0.57735
4,0.5284591,-0.286454,-0.625521,-0.1857612,-0.405641,0.21988


In [320]:
sigma # digonal한 값들을 일렬로 쭉 늘어놓은 형태

array([2.16250096, 1.59438237, 1.27529025, 1.        , 0.39391525])

In [321]:
np.diag(sigma).shape

(5, 5)

In [322]:
np.diag(sigma[:2]).shape # 중요한 차원 2개만 쓰기

(2, 2)

In [323]:
U[:,:2].shape, V[:2].shape

((5, 2), (2, 6))

In [324]:
pd.DataFrame(U.dot(np.diag(sigma)).dot(V)) # 매트릭스를 분해했다가 합쳤음에도 아래 C와 근사한 값이 나온다. 

Unnamed: 0,0,1,2,3,4,5
0,1.0,2.866176e-16,1.0,-2.957499e-16,1.371194e-16,-3.70787e-16
1,-5.863986e-17,1.0,-1.655668e-16,1.602963e-16,7.954781e-17,-4.819879e-17
2,1.0,1.0,-5.924052e-16,-5.787719e-17,1.7335480000000002e-17,-1.454892e-16
3,1.0,6.760135000000001e-17,-1.334818e-16,1.0,1.0,2.535997e-16
4,-3.275686e-16,-5.436625e-16,-4.857263e-16,1.0,2.091058e-16,1.0


In [325]:
pd.DataFrame((U[:,:2].dot(np.diag(sigma[:2])).dot(V[:2]))) # 바로 윗 셀보다 차원을 더 줄인 결과
# 차원을 줄인(decomposition) 결과 필요없는 아웃라이어가 제거되고 중요한 값만 부각된다. 

Unnamed: 0,0,1,2,3,4,5
0,0.848146,0.515902,0.281625,0.12986,0.205743,-0.075882
1,0.360778,0.357508,0.155125,-0.205653,-0.025264,-0.180389
2,1.00327,0.718285,0.360778,-0.050529,0.155125,-0.205653
3,0.978006,0.12986,0.205743,1.028534,0.617139,0.411396
4,0.12986,-0.386042,-0.075882,0.898674,0.411396,0.487278


In [326]:
_C # 원본 행렬 C

Unnamed: 0,0,1,2,3,4,5
ship,1,0,1,0,0,0
boat,0,1,0,0,0,0
ocean,1,1,0,0,0,0
wood,1,0,0,1,1,0
tree,0,0,0,1,0,1


In [327]:
Ct = C.T # 원본
print(Ct.dot(Ct.T) / (np.linalg.norm(Ct, axis=1).reshape(Ct.shape[0], 1) * np.linalg.norm(Ct, axis=1).T))

[[1.         0.40824829 0.57735027 0.40824829 0.57735027 0.        ]
 [0.40824829 1.         0.         0.         0.         0.        ]
 [0.57735027 0.         1.         0.         0.         0.        ]
 [0.40824829 0.         0.         1.         0.70710678 0.70710678]
 [0.57735027 0.         0.         0.70710678 1.         0.        ]
 [0.         0.         0.         0.70710678 0.         1.        ]]


In [328]:
CC = U[:,:2].dot(np.diag(sigma[:2])).dot(V[:2]) # 차원축소
print(CC)

[[ 0.8481456   0.51590232  0.28162515  0.12986018  0.20574267 -0.07588249]
 [ 0.36077778  0.35750764  0.15512454 -0.20565325 -0.02526436 -0.18038889]
 [ 1.00327014  0.71828543  0.36077778 -0.05052871  0.15512454 -0.20565325]
 [ 0.97800578  0.12986018  0.20574267  1.0285345   0.61713858  0.41139591]
 [ 0.12986018 -0.38604214 -0.07588249  0.89867432  0.41139591  0.4872784 ]]


In [329]:
C.T.dot(C)

array([[3, 1, 1, 1, 1, 0],
       [1, 2, 0, 0, 0, 0],
       [1, 0, 1, 0, 0, 0],
       [1, 0, 0, 2, 1, 1],
       [1, 0, 0, 1, 1, 0],
       [0, 0, 0, 1, 0, 1]])

In [330]:
CCt = CC.T
pd.DataFrame(CCt.dot(CCt.T) / (np.linalg.norm(CCt, axis=1).reshape(CCt.shape[0], 1) * np.linalg.norm(CCt, axis=1).T))

Unnamed: 0,0,1,2,3,4,5
0,1.0,0.781837,0.950136,0.474432,0.740118,0.110596
1,0.781837,1.0,0.937276,-0.177918,0.159375,-0.53319
2,0.950136,0.937276,1.0,0.176269,0.493512,-0.204841
3,0.474432,-0.177918,0.176269,1.0,0.943112,0.927362
4,0.740118,0.159375,0.493512,0.943112,1.0,0.750205
5,0.110596,-0.53319,-0.204841,0.927362,0.750205,1.0


In [331]:
# 각 열은 hidden variable인 잠재의미(클러스터)를 나타내고, 그때의 각 행(단어)은 해당 잠재의미에서의 중요도를 나타냄. 
pd.DataFrame(U[:,:2].dot(np.diag(sigma[:2])), index = ['ship', 'boat', 'ocean', 'wood', 'tree'])

Unnamed: 0,0,1
ship,0.952252,-0.472215
boat,0.279712,-0.528459
ocean,1.028335,-0.814913
wood,1.520282,0.558946
tree,0.56803,1.031162


In [332]:
data = pd.DataFrame(U[:,:2].dot(np.diag(sigma[:2])), index = ['ship', 'boat', 'ocean', 'wood', 'tree'])

In [333]:
# value를 갖고 sort하되 [0]번째를 기준으로 내림차순 정렬 수행. 이후 dict에 넣어 word_cloud에 넣기 좋은 형태로 반환
data.sort_values(by=[0], ascending=False)[0].to_dict() 

{'wood': 1.5202821146385517,
 'ocean': 1.028334651451026,
 'ship': 0.952251849979759,
 'tree': 0.568030264658793,
 'boat': 0.2797116034996999}

In [334]:
cluster1 = data.sort_values(by=[1], ascending = False)
cluster1[cluster1[1] > 0][1].to_dict() # 1번째 latent semantic vector에서 두 단어가 관계가 있음을 확인할 수 있다. 

{'tree': 1.031161645026429, 'wood': 0.5589464674814445}

In [335]:
# 문서 레벨에서 확인해보기
data = pd.DataFrame(np.diag(sigma[:3]).dot(V[:3]))

In [336]:
data # 이것을 시각화하면 K-means와는 또 다른 방식으로 클러스터링을 수행해 볼 수 있음. 

Unnamed: 0,0,1,2,3,4,5
0,1.618898,0.604877,0.440347,0.965693,0.70302,0.262673
1,-0.456717,-0.842566,-0.296174,0.997319,0.350572,0.646747
2,-0.356713,0.954712,-0.569498,0.259686,-0.154906,0.414592


In [337]:
clusterData = np.diag(sigma[:3]).dot(V[:3]) # 바로 윗 셀보다 차원을 더 줄인 결과
# 차원을 줄인(decomposition) 결과 필요없는 아웃라이어가 제거되고 중요한 값만 부각된다. 

In [338]:
CCt = clusterData.T
print(CCt.dot(CCt.T) / (np.linalg.norm(CCt, axis=1).reshape(CCt.shape[0], 1) * np.linalg.norm(CCt, axis=1).T))

[[ 1.          0.42223483  0.78542247  0.41805317  0.75047307 -0.01291529]
 [ 0.42223483  1.         -0.02533697 -0.00414719 -0.01597348  0.00853927]
 [ 0.78542247 -0.02533697  1.         -0.01640023  0.47162423 -0.49365825]
 [ 0.41805317 -0.00414719 -0.01640023  1.          0.87394629  0.87763513]
 [ 0.75047307 -0.01597348  0.47162423  0.87394629  1.          0.53404114]
 [-0.01291529  0.00853927 -0.49365825  0.87763513  0.53404114  1.        ]]


## 다른 예제

1. SVD => U, sigma, Vt (M,N) / (N, M), N개의 문서와 M개의 단어
2. U.sigma, U[:, :K].sigma[:K] => Latent Semantic 차원에서 어느 문서가 중요한지 찾기
3. sigma.Vt, sigma[:K].Vt[:K, :] => Latent Semantic 차원에서 어느 단어가 중요한지
4. 2번으로부터 각 문서가 어느 문서와 유사한지 Latent Semantic Dimensions에서 찾기(내적을 통해 cosine구하기)
5. 3번으로부터 각 단어가 어느 단어와 유사한지 Latent Semantic Dimensions에서 찾기(내적을 통해 cosine구하기)

In [339]:
# DTM 구조로 되어 있음. 
NewC = np.array([[1, 1, 0, 0, 0, 0, 0],
                 [0, 0, 1, 1, 1, 0, 0],
                 [0, 1, 0, 0, 0, 1, 0],
                 [0, 0, 1, 0, 0, 0, 1],
                 [0, 0, 0, 1, 1, 0, 1],
                 [1, 0, 1, 0, 1, 1, 1]])

In [269]:
RowName = ['A', 'B', 'C', 'D', 'E', 'F']
ColumnName = ['cute', 'kitty', 'eat', 'rice', 'cake', 'hamster', 'bread']
_NewC = pd.DataFrame(NewC, index=RowName, columns=ColumnName)
_NewC


Unnamed: 0,cute,kitty,eat,rice,cake,hamster,bread
A,1,1,0,0,0,0,0
B,0,0,1,1,1,0,0
C,0,1,0,0,0,1,0
D,0,0,1,0,0,0,1
E,0,0,0,1,1,0,1
F,1,0,1,0,1,1,1


In [270]:
# 1번과정 수행하기
U, sigma, V = np.linalg.svd(_NewC) # singular vector decomposition (U, sigma, V 세 개의 행렬로 쪼개져서 나오므로 리턴값은 3개)

In [271]:
U.shape, sigma.shape, V.shape

((6, 6), (6,), (7, 7))

In [272]:
pd.DataFrame(V) # V.shape가 (7,7)으로 나오는 이유는 np.linalg.svd의 인자에서 full_matrices가 True이기 때문. 

Unnamed: 0,0,1,2,3,4,5,6
0,0.2706786,0.07887421,0.4901791,0.2966641,0.5279056,0.2706786,0.4901791
1,0.4630926,0.606021,-0.07022424,-0.3810003,-0.2209182,0.4630926,-0.07022424
2,0.03630407,-0.4713829,0.3826723,-0.6078533,-0.3358578,0.03630407,0.3826723
3,2.53102e-15,-2.53102e-15,0.7071068,-8.001376e-16,1.891085e-15,3.736785e-16,-0.7071068
4,-0.7071068,4.732842e-16,1.898339e-15,-5.7336e-16,-1.043752e-15,0.7071068,-1.555734e-16
5,0.3464841,-0.5598272,-0.3291731,-0.1835952,0.4428025,0.3464841,-0.3291731
6,0.3015113,-0.3015113,-5.4540110000000006e-17,0.6030227,-0.6030227,0.3015113,1.675045e-16


In [364]:
U, sigma, V = np.linalg.svd(_NewC, full_matrices=False) 

In [365]:
U.shape, sigma.shape, V.shape # sigma.shape는 중요한 차원을 나열한 것. 

((6, 6), (6,), (6, 7))

In [366]:
pd.DataFrame(V) # 정상적인 결과

Unnamed: 0,0,1,2,3,4,5,6
0,0.2706786,0.07887421,0.4901791,0.2966641,0.5279056,0.2706786,0.4901791
1,0.4630926,0.606021,-0.07022424,-0.3810003,-0.2209182,0.4630926,-0.07022424
2,0.03630407,-0.4713829,0.3826723,-0.6078533,-0.3358578,0.03630407,0.3826723
3,2.53102e-15,-2.53102e-15,0.7071068,-8.001376e-16,1.891085e-15,3.736785e-16,-0.7071068
4,-0.7071068,4.732842e-16,1.898339e-15,-5.7336e-16,-1.043752e-15,0.7071068,-1.555734e-16
5,0.3464841,-0.5598272,-0.3291731,-0.1835952,0.4428025,0.3464841,-0.3291731


In [367]:
sigma # digonal한 값들을 일렬로 쭉 늘어놓은 형태

array([2.9771718 , 1.87837842, 1.35866397, 1.        , 1.        ,
       0.87302623])

In [368]:
np.diag(sigma).shape

(6, 6)

In [369]:
np.diag(sigma[:3]).shape # 중요한 차원 3개만 쓰기

(3, 3)

In [370]:
# 2번과정 수행해보기
print(U[:,:3].shape, V[:3].shape)

(6, 3) (3, 7)


In [372]:
pd.DataFrame(U.dot(np.diag(sigma)).dot(V)) # 매트릭스를 분해했다가 합쳤음에도 _NewC와 근사한 값이 나온다. 

Unnamed: 0,0,1,2,3,4,5,6
0,1.0,1.0,2.589093e-16,-5.0264360000000006e-17,-1.956608e-16,-1.269952e-16,2.586652e-16
1,-2.615345e-16,4.981863e-16,1.0,1.0,1.0,7.535571e-16,4.728302e-16
2,-1.382397e-15,1.0,-6.544914e-16,6.746307e-16,-9.979993e-17,1.0,-2.661574e-16
3,-2.541553e-16,2.721308e-16,1.0,4.218548e-16,1.546724e-16,-1.075396e-16,1.0
4,-1.659528e-16,4.095335e-16,3.384808e-16,1.0,1.0,2.801495e-16,1.0
5,1.0,-1.36836e-15,1.0,-2.687597e-16,1.0,1.0,1.0


In [382]:
CC = pd.DataFrame((U[:,:3].dot(np.diag(sigma[:3])).dot(V[:3]))) # 바로 윗 셀보다 차원을 더 줄인 결과 
# 차원을 줄인(decomposition) 결과 필요없는 아웃라이어가 제거되고 중요한 값만 부각된다. 
print(CC)

          0         1         2         3         4         5         6
0  0.573920  0.880565 -0.070227 -0.039169  0.094469  0.573920 -0.070227
1  0.024242 -0.039169  0.476969  0.987155  1.030981  0.024242  0.476969
2  0.573920  0.880565 -0.070227 -0.039169  0.094469  0.573920 -0.070227
3  0.228106 -0.368560  0.783290 -0.120869  0.291517  0.228106  0.783290
4  0.024242 -0.039169  0.476969  0.987155  1.030981  0.024242  0.476969
5  0.834580  0.267275  1.157155  0.087653  0.788595  0.834580  1.157155


In [374]:
pd.DataFrame(_NewC) # 데이터 원본

Unnamed: 0,cute,kitty,eat,rice,cake,hamster,bread
A,1,1,0,0,0,0,0
B,0,0,1,1,1,0,0
C,0,1,0,0,0,1,0
D,0,0,1,0,0,0,1
E,0,0,0,1,1,0,1
F,1,0,1,0,1,1,1


In [375]:
# ?????

Ct = _NewC.T
print(Ct)
print(Ct.dot(Ct.T) / (np.linalg.norm(Ct, axis=1).reshape(Ct.shape[0], 1) * np.linalg.norm(Ct, axis=1).T))

         A  B  C  D  E  F
cute     1  0  0  0  0  1
kitty    1  0  1  0  0  0
eat      0  1  0  1  0  1
rice     0  1  0  0  1  0
cake     0  1  0  0  1  1
hamster  0  0  1  0  0  1
bread    0  0  0  1  1  1
             cute  kitty       eat      rice      cake   hamster     bread
cute     1.000000    0.5  0.408248  0.000000  0.408248  0.500000  0.408248
kitty    0.500000    1.0  0.000000  0.000000  0.000000  0.500000  0.000000
eat      0.408248    0.0  1.000000  0.408248  0.666667  0.408248  0.666667
rice     0.000000    0.0  0.408248  1.000000  0.816497  0.000000  0.408248
cake     0.408248    0.0  0.666667  0.816497  1.000000  0.408248  0.666667
hamster  0.500000    0.5  0.408248  0.000000  0.408248  1.000000  0.408248
bread    0.408248    0.0  0.666667  0.408248  0.666667  0.408248  1.000000


In [383]:
CCt = CC.T
pd.DataFrame(CCt.dot(CCt.T) / (np.linalg.norm(CCt, axis=1).reshape(CCt.shape[0], 1) * np.linalg.norm(CCt, axis=1).T))

Unnamed: 0,0,1,2,3,4,5,6
0,1.0,0.728798,0.589043,0.029075,0.440706,1.0,0.589043
1,0.728798,1.0,-0.068071,-0.042011,0.084325,0.728798,-0.068071
2,0.589043,-0.068071,1.0,0.436654,0.804205,0.589043,1.0
3,0.029075,-0.042011,0.436654,1.0,0.869177,0.029075,0.436654
4,0.440706,0.084325,0.804205,0.869177,1.0,0.440706,0.804205
5,1.0,0.728798,0.589043,0.029075,0.440706,1.0,0.589043
6,0.589043,-0.068071,1.0,0.436654,0.804205,0.589043,1.0


In [379]:
_NewC.T.dot(_NewC)

Unnamed: 0,cute,kitty,eat,rice,cake,hamster,bread
cute,2,1,1,0,1,1,1
kitty,1,2,0,0,0,1,0
eat,1,0,3,1,2,1,2
rice,0,0,1,2,2,0,1
cake,1,0,2,2,3,1,2
hamster,1,1,1,0,1,2,1
bread,1,0,2,1,2,1,3


In [393]:
# 각 열은 hidden variable인 잠재의미(클러스터)를 나타내고, 그때의 각 행(단어)은 해당 잠재의미에서의 중요도를 나타냄. 
pd.DataFrame(U[:,:3].dot(np.diag(sigma[:3])), columns = ['cute', 'kitty', 'eat', 'rice', 'cake', 'hamster', 'bread'])

ValueError: Shape of passed values is (3, 6), indices imply (7, 6)