In [128]:

#!/usr/bin/python 
# 
# Created by Albert Au Yeung (2010) 
# 
# An implementation of matrix factorization 
# 
try:
    import numpy
    import csv
    import pandas as pd
except ImportError as exc:
    # Catch only import failures (a bare `except:` would also hide
    # KeyboardInterrupt and unrelated errors), say which dependency is
    # actually missing, and stop with a non-zero status so the failure is
    # not reported as success.
    print("This implementation requires the numpy and pandas modules ({}).".format(exc))
    raise SystemExit(1)
# Do not truncate wide DataFrames when they are displayed: show every column.
pd.set_option('display.max_columns', None)

 
############################################################################### 
 
""" 
@INPUT: 
    R     : a matrix to be factorized, dimension N x M 
    P     : an initial matrix of dimension N x K 
    Q     : an initial matrix of dimension M x K 
    K     : the number of latent features 
    steps : the maximum number of steps to perform the optimisation 
    alpha : the learning rate 
    beta  : the regularization parameter 
@OUTPUT: 
    the final matrices P and Q 
""" 
def matrix_factorization(R, P, Q, K, steps=2000000, alpha=0.0002, beta=0.02):
    """Factorise R into two low-rank matrices so that R ~= P . Q^T.

    Only entries of R that are strictly greater than 0 take part in the fit;
    every other entry is skipped. The fit is gradient descent, visiting the
    observed entries one at a time in row-major order, with an L2 penalty
    weighted by ``beta``.

    @INPUT:
        R     : the matrix to be factorized, dimension N x M
        P     : initial matrix of dimension N x K (numpy array, UPDATED IN PLACE)
        Q     : initial matrix of dimension M x K (numpy array, UPDATED IN PLACE)
        K     : the number of latent features; the first K columns of P and Q
                are updated, so K should equal their second dimension
        steps : the maximum number of passes over the observed entries
        alpha : the learning rate
        beta  : the regularization parameter
    @OUTPUT:
        (P, Q) -- the same array objects that were passed in, after fitting.

    Iteration stops early once the regularised squared error changes by less
    than 0.001 between two consecutive passes.

    NOTE(review): P and Q are presumably floating-point arrays; an integer
    array would truncate every in-place update -- confirm at the call sites.
    """
    # Work on the K x M transpose. For a numpy array this is a view, so the
    # writes below land in the caller's Q.
    Q = Q.T

    # R is never modified here, so the observed cells are collected once
    # instead of re-scanning the whole matrix twice per step.
    observed = [
        (i, j, R[i][j])
        for i in range(len(R))
        for j in range(len(R[i]))
        if R[i][j] > 0
    ]

    previous_e = 0
    for step in range(steps):
        # One gradient pass over the observed entries.
        for i, j, r_ij in observed:
            eij = r_ij - numpy.dot(P[i, :], Q[:, j])
            # Element-wise form of the per-feature update. P is updated first
            # and the Q update deliberately uses the freshly updated P row,
            # exactly as the sequential per-k loop did.
            P[i, :K] = P[i, :K] + alpha * (2 * eij * Q[:K, j] - beta * P[i, :K])
            Q[:K, j] = Q[:K, j] + alpha * (2 * eij * P[i, :K] - beta * Q[:K, j])

        # Regularised squared error over the observed entries.
        e = 0
        for i, j, r_ij in observed:
            p_row = P[i, :K]
            q_col = Q[:K, j]
            e = e + pow(r_ij - numpy.dot(P[i, :], Q[:, j]), 2)
            e = e + (beta / 2) * (numpy.dot(p_row, p_row) + numpy.dot(q_col, q_col))

        # Converged: the objective barely moved since the previous pass.
        if abs(previous_e - e) < 0.001:
            break
        previous_e = e
    return P, Q.T
 
############################################################################### 
 
if __name__ == "__main__":

    # Column labels (stock names), defined once and shared by the input and
    # the reconstructed DataFrame so the two cannot drift apart.
    STOCK_NAMES = ['長和','中電控股','香港中華煤氣','九龍倉集團','滙豐控股']
    # Fixed seed: the random starting point -- and therefore the CSV written
    # at the end of this cell -- is reproducible across kernel restarts.
    SEED = 0

    # Toy input matrix, one column per stock. Zeros are skipped (treated as
    # unobserved) by matrix_factorization, which only fits entries > 0.
    # NOTE(review): rows are presumably funds/portfolios and values holding
    # weights -- confirm against the data source.
    R = [
         [0,0,0,0,21],
         [0,35.2,27.08,0,0],
         [0,0,0,9.35,0],
         [8.71,0,0,0,0]
        ]
    print(numpy.shape(numpy.array(R)))
    df = pd.DataFrame(R, columns=STOCK_NAMES)

    R = numpy.array(R, dtype=numpy.float64)

    N = len(R)      # number of rows of R
    M = len(R[0])   # number of columns of R
    K = 10          # number of latent features

    # Random initial factors drawn from a private, seeded generator rather
    # than the global numpy RNG state.
    rng = numpy.random.RandomState(SEED)
    P = rng.rand(N, K)
    Q = rng.rand(M, K)

    nP, nQ = matrix_factorization(R, P, Q, K)
    # Reconstruct the full matrix, including the cells that were 0 in R.
    nQ_T = numpy.transpose(nQ)
    approximate_matrix = numpy.dot(nP, nQ_T)
    approximate_df = pd.DataFrame(approximate_matrix, columns=STOCK_NAMES)

    numpy.savetxt('approximate_matrix.csv', approximate_matrix, delimiter=',')


(4, 5)


In [70]:

# Largest value in each row of df (axis=1 reduces across the columns).
df.max(axis=1)

0    21.39
1    35.20
2    17.96
3    33.03
4     1.00
dtype: float64

In [71]:
# Display the input matrix.
# NOTE(review): the saved output has 5 rows and ~50 stock columns, so it was
# produced from a different df than the 4 x 5 frame built in the main cell --
# re-run the notebook top to bottom to refresh it.
df

Unnamed: 0,長和,中電控股,香港中華煤氣,九龍倉集團,滙豐控股,電能實業,恒生銀行,恒基地產,新鴻基地產,新世界發展,太古股份公司A,東亞銀行,銀河娛樂,港鐵公司,信和置業,恒隆地產,昆侖能源,招商局港口,中國旺旺,吉利汽車,中信股份,國泰航空,中國石油化工股份,香港交易所,中國海外發展,騰訊控股,中國聯通,領展房產基金,華潤電力,中國石油股份,中國海洋石油,建設銀行,中國移動,聯想集團,長江基建集團,恒安國際,中國神華,華潤置地,長實地產,友邦保險,工商銀行,百麗國際,金沙中國有限公司,瑞聲科技,中國平安,蒙牛乳業,中銀香港,中國人壽,交通銀行,中國銀行
0,0.0,0.0,0.0,0.0,21.39,0.0,3.2,0.0,0.0,0.0,0.0,1.22,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,5.81,0.0,0.0,0.0,0.0,0.0,0.0,0.0,17.82,0.0,0.0,0.0,0.0,0.0,0.0,0.0,16.83,9.76,0.0,0.0,0.0,6.65,0.0,3.27,4.81,1.34,7.88
1,0.0,35.2,27.08,0.0,0.0,21.37,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.56,0.0,0.0,0.0,0.0,0.0,9.79,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,9.35,0.0,0.0,0.0,6.29,17.41,6.34,0.0,0.0,0.0,0.0,4.83,4.61,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,10.4,0.0,0.0,15.56,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.23,17.96,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,8.71,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.23,0.0,3.2,2.02,0.0,0.0,0.73,0.95,1.09,2,2.26,0.37,5.15,0.0,0.0,33.03,2.12,0.0,0.0,3.53,5.14,0.0,16.76,1.12,0.0,1.27,2.06,0.0,0.0,0.0,0.0,1.31,2.68,1.93,0.0,1.37,0.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [72]:
# Display the reconstructed (approximated) matrix.
# NOTE(review): the saved output is 5 rows x ~50 columns and does not match
# the 4 x 5 frame built in the main cell -- stale output from an earlier run.
approximate_df


Unnamed: 0,長和,中電控股,香港中華煤氣,九龍倉集團,滙豐控股,電能實業,恒生銀行,恒基地產,新鴻基地產,新世界發展,太古股份公司A,東亞銀行,銀河娛樂,港鐵公司,信和置業,恒隆地產,昆侖能源,招商局港口,中國旺旺,吉利汽車,中信股份,國泰航空,中國石油化工股份,香港交易所,中國海外發展,騰訊控股,中國聯通,領展房產基金,華潤電力,中國石油股份,中國海洋石油,建設銀行,中國移動,聯想集團,長江基建集團,恒安國際,中國神華,華潤置地,長實地產,友邦保險,工商銀行,百麗國際,金沙中國有限公司,瑞聲科技,中國平安,蒙牛乳業,中銀香港,中國人壽,交通銀行,中國銀行
0,9.078711,21.11039,16.013427,8.355616,21.353944,12.66004,3.196882,4.719655,15.480475,6.090046,4.012103,1.220646,5.540061,2.923705,4.808542,3.256897,1.664599,2.777718,3.974858,4.533733,4.522345,2.187577,7.661081,5.801772,8.962217,30.046776,4.679298,14.48203,4.456234,3.668548,4.548382,17.789591,16.872498,3.887211,7.447482,0.610037,2.58916,7.333301,15.99483,16.800831,9.745075,2.817107,3.716277,2.551695,6.641329,3.186267,3.268158,4.805117,1.341777,7.868737
1,13.292473,35.179239,27.064014,11.791284,24.389626,21.357435,5.689622,8.591359,21.625876,7.957513,6.443636,3.95705,9.850703,5.064101,5.822254,6.274344,3.006413,5.434911,3.614587,3.638934,6.128707,2.490521,9.02652,6.891513,13.273927,41.217124,4.441114,17.847981,6.556234,7.293004,7.091326,19.939155,23.012312,5.937604,9.784333,3.864297,3.013279,9.279821,21.637772,19.489509,10.950643,5.407838,5.0144,7.184102,8.860545,5.182468,7.798117,8.506401,3.887591,10.490373
2,9.659084,24.247945,18.54788,9.337201,21.762317,14.932454,3.955677,6.281181,17.384638,6.33204,5.20515,2.816955,6.714438,3.463553,4.823347,4.604477,1.979435,3.690972,3.58055,3.809255,5.310911,2.399946,8.064433,6.185684,10.385173,33.119875,4.35475,15.536387,5.104806,4.906441,5.726608,16.757616,18.854133,4.419493,7.999727,2.568394,3.192019,7.219709,17.932005,16.435639,9.502261,3.291282,4.227487,4.60378,7.429881,4.3288,4.970363,6.191043,2.650512,9.271336
3,8.624379,19.073714,13.719835,8.050899,15.668727,12.583799,3.951961,4.103658,13.62795,7.00203,1.236775,1.264142,3.203036,2.023378,2.924165,4.127714,0.736769,0.956628,1.095224,2.001102,2.261733,0.377626,5.148074,4.344514,9.175277,32.949293,2.121346,12.285706,4.115185,3.529352,5.132103,13.44803,16.725109,1.127526,6.279974,1.275029,2.060408,5.19283,12.525001,14.004045,8.019331,1.314275,2.680187,1.936284,6.277509,1.374944,3.47249,4.182426,2.903761,6.651898
4,1.344059,3.051873,2.316027,1.582281,4.64599,1.911285,0.552404,1.056233,3.061565,1.182077,1.070517,0.723638,0.963256,0.516807,0.907773,0.903278,0.433018,0.606433,0.804787,0.976611,1.191881,0.472081,1.904492,1.40079,2.057324,6.082353,0.949969,3.219486,0.961248,0.861679,1.218099,2.966005,3.44686,0.892041,1.363603,0.518737,1.070662,1.261423,3.493058,3.03273,2.079638,0.234943,1.018558,0.680555,1.441227,0.917945,0.800244,0.988835,0.508443,2.029689


In [83]:
# Keep predicted values only where the input frame holds 0; every other cell
# becomes NaN and therefore sorts to the end.
unheld_scores = approximate_df[df == 0]
for row_pos in range(len(approximate_df)):
    # Rank this row's predictions from highest to lowest ...
    ranked = unheld_scores.iloc[row_pos].sort_values(ascending=False, na_position='last')
    # ... and print the five best as a single-row table.
    top_five = ranked.to_frame().T.iloc[:, 0:5]
    print(top_five)

        騰訊控股      中電控股       中國移動     香港中華煤氣      長實地產
0  30.046776  21.11039  16.872498  16.013427  15.99483
        騰訊控股       滙豐控股       中國移動       長實地產      新鴻基地產
1  41.217124  24.389626  23.012312  21.637772  21.625876
        騰訊控股       中電控股       滙豐控股       中國移動    香港中華煤氣
2  33.119875  24.247945  21.762317  18.854133  18.54788
        中電控股       滙豐控股       友邦保險     香港中華煤氣     新鴻基地產
3  19.073714  15.668727  14.004045  13.719835  13.62795
       騰訊控股     滙豐控股      長實地產     中國移動    領展房產基金
4  6.082353  4.64599  3.493058  3.44686  3.219486


In [129]:
# Row-normalise R so that every non-empty row sums to 1.
# `where=` only skips the division for rows whose sum is 0; without an
# explicit `out=` those skipped cells would be left as uninitialised memory,
# so a zero-filled output buffer is supplied and all-zero rows stay all zero.
row_sums = R.sum(axis=1)[:, None]
numpy.divide(R, row_sums, out=numpy.zeros(R.shape, dtype=numpy.float64), where=row_sums != 0)

array([[ 0.        ,  0.        ,  0.        ,  0.        ,  1.        ],
       [ 0.        ,  0.56518947,  0.43481053,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  1.        ,  0.        ],
       [ 1.        ,  0.        ,  0.        ,  0.        ,  0.        ]])

In [126]:
# NOTE(review): rebinds the global R to a 1 x 2 integer array, replacing the
# matrix built in the main cell; any later cell that reads R sees this value.
R=numpy.array([[1,2]])

In [122]:
# Show R converted to float64. astype returns a new array; R itself is not
# modified because the result is not assigned.
R.astype(numpy.float64)

array([[  0.  ,   0.  ,   0.  ,   0.  ,  21.  ],
       [  0.  ,  35.2 ,  27.08,   0.  ,   0.  ],
       [  0.  ,   0.  ,   0.  ,   9.35,   0.  ],
       [  8.71,   0.  ,   0.  ,   0.  ,   0.  ]])

In [119]:
# NOTE(review): normalized_R is not assigned in any visible cell; presumably
# it came from a cell that was later edited or removed -- this fails with a
# NameError on a fresh kernel. Confirm and restore the defining cell.
normalized_R

[array([ 0.,  0.,  0.,  0.,  1.]),
 array([ 0.        ,  0.56518947,  0.43481053,  0.        ,  0.        ]),
 array([ 0.,  0.,  0.,  1.,  0.]),
 array([ 1.,  0.,  0.,  0.,  0.])]