<a href="https://colab.research.google.com/github/yuy8146/datayouthcampus2021/blob/master/project/collaborativeFiltering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Collaborative filtering

In [2]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### 0. Read Dataset

In [None]:
#print("Read Dataset")
#purchase = pd.read_csv('C:/Users/User/PycharmProjects/collaborativeFiltering/dataset/purchase.csv')
#purchase = pd.read_excel('C:/Users/User/PycharmProjects/collaborativeFiltering/dataset/purchase.xlsx', sheet_name = 'sample')

In [4]:
purchase = pd.read_csv('/content/drive/MyDrive/worksoutProject/testdata200.csv')

### 1. preprocessing

In [5]:
# 1차 카테고리 column 생성
purchase.insert(2, 'CATEGORY1', purchase['CATEGORY2'].str[:2])

#반품 row -> /2 처리
purchase['수량'] = pd.to_numeric(purchase['수량'])
purchase['re_qty'] = np.where(purchase['구매구분'] == '반품', purchase['수량']/2, purchase['수량'])

#re_quntity 합계해서 5개에서 150개에 해당하는 고객 select
Cudf = pd.DataFrame()
sumCustomer = purchase.groupby('고객코드')['re_qty'].agg(**{'q_sum':'sum'}).reset_index()
sumCustomer2 = sumCustomer[5<=sumCustomer.q_sum]
selCustomer = sumCustomer2[sumCustomer2.q_sum<=150]
selCustomer = selCustomer.rename(columns={"고객코드":"customerCode"}).reset_index()
selCustomer = selCustomer.drop(['index'], axis=1)

# select한 고객 df 기반으로 해당 고객 row만 select
def customerFunction(i):
    cust = purchase[purchase['고객코드'] == selCustomer.loc[i].customerCode]
    global Cudf
    Cudf = Cudf.append(cust, ignore_index=True)

print("Preprocessing Start")
i = 0
for i in tqdm(selCustomer.index):
    customerFunction(i)
    i = i + 1

#필요없는 열 삭제, rename
#Cudf=Cudf.drop(['SEASONGROUP명','YEAR명','구매매장명','품번2','카테고리0','카테고리1','CATEGORY2'], axis=1)
Cudf.rename(columns={'고객코드':'customerCode','BRAND명':'BRAND' },inplace=True)

#반품 열 삭제
reData = Cudf[Cudf['구매구분'] != '반품']
finalData = pd.DataFrame(columns=range(0))
pc = finalData.append(reData,ignore_index=True)


Preprocessing Start


100%|██████████| 8/8 [00:00<00:00, 221.37it/s]


### 2. make matrix

In [6]:
def makelayers(pc,exceptduplicates, attribute): #pc = purchase dataset

    # 고객코드 추출
    customercode = list(set(pc['customerCode']))
    customercode.sort()
    # 카테고리 추출
    category = list(set(pc[attribute]))
    category.sort()

    # 행렬곱 위한 왼쪽 matrix 완성 (user x category)
    # print(len(customercode), len(category))
    array = [[0 for col in range(len(category))] for row in range(len(customercode))]   # 빈 matrix 준비

    #구매갯수만큼 채워넣기
    for i in tqdm(range(len(customercode))):
        for j in range(len(category)):
            array[i][j] = len(pc[(pc['customerCode']== customercode[i]) & (pc[attribute] == category[j])])

    left = np.array(array)

    #오른쪽
    #행렬곱위한 오른쪽 matrix 완성 (item x category)
    #onehotencoding_c=pd.get_dummies(pc[[attribute]], columns = [attribute])
    onehotencoding_c=pd.get_dummies(exceptduplicates[[attribute]], columns = [attribute])
    right = np.array(onehotencoding_c)

    #행렬곱하여 최종 matrix 완성
    final = np.matmul(left, np.transpose(right))
    #final.shape

    return final

In [7]:
exceptduplicates = pc.drop_duplicates(['품번'])   #중복 제품 삭제
#exceptduplicates.insert(2, 'CATEGORY1', pc['CATEGORY2'].str[:2])

# 품번 추출
itemcode = list(exceptduplicates['품번'])
# 고객코드 추출
customercode = list(set(pc['customerCode']))
customercode.sort()

#행렬곱한 layer들 생성
print("\n** brand matrix **")
brandmatrix = makelayers(pc,exceptduplicates, 'BRAND')

print("\n** category matrix **")
categorymatrix = makelayers(pc, exceptduplicates, 'CATEGORY1')

print("\n** color matrix **")
colormatrix = makelayers(pc,exceptduplicates, '색상')



** brand matrix **


100%|██████████| 8/8 [00:00<00:00, 128.21it/s]



** category matrix **


100%|██████████| 8/8 [00:00<00:00, 108.12it/s]



** color matrix **


100%|██████████| 8/8 [00:00<00:00, 122.87it/s]


### 3. nomalization

In [8]:
scaler = StandardScaler()

# column별로 scalar 되기에 transpose 해줌.
scaled_brandmatrix_T = scaler.fit_transform(brandmatrix.T)
scaled_categorymatrix_T = scaler.fit_transform(categorymatrix.T)
scaled_colormatrix_T = scaler.fit_transform(colormatrix.T)

scaled_brandmatrix = scaled_brandmatrix_T.T
scaled_categorymatrix = scaled_categorymatrix_T.T
scaled_colormatrix = scaled_colormatrix_T.T

### 4. weighted sum

In [9]:
# R= (1.293 * scaled_brandmatrix) + (1.221 * scaled_categorymatrix) + scaled_colormatrix
R= (1.293 * scaled_brandmatrix) + (1.221 * scaled_categorymatrix)
   #+ scaled_colormatrix

print("\n** result ** \n", R)
print("shape: ", R.shape)
np.savetxt('beforescalar.csv',R, fmt= '%f', delimiter=",")

Scaler = RobustScaler()
scaled_result = Scaler.fit_transform(R.T)
R = scaled_result.T
np.savetxt('afterscalar.csv',R, fmt= '%f', delimiter=",")

#print("\n** customercode ** \n", customercode)
#print("length: ", len(customercode))
#print("\n** itemcode ** \n", itemcode)
#print("length: ", len(itemcode))


** result ** 
 [[ 3.24403986  3.94321481  3.94321481  5.3799945   5.3799945  -0.32869446
  -0.32869446 -1.76547414 -0.32869446 -1.76547414  1.10808522 -1.76547414
  -1.76547414  1.10808522  1.10808522  1.10808522 -1.76547414 -1.76547414
  -1.76547414 -1.76547414  1.10808522 -1.76547414  1.10808522  1.10808522
   1.10808522  1.10808522 -1.76547414 -0.32869446 -1.76547414 -1.76547414
  -1.76547414 -1.76547414 -0.32869446  1.10808522 -1.76547414 -1.76547414
  -1.76547414 -0.32869446 -0.32869446 -0.32869446 -0.32869446]
 [-1.51087806  1.46809911 -1.51087806 -0.91508263 -0.91508263  3.48335099
   3.48335099  3.11542113  5.49860286  3.11542113  3.11542113 -0.91508263
  -0.91508263 -1.51087806 -1.51087806 -0.91508263 -0.91508263 -1.51087806
  -1.51087806 -1.51087806 -1.51087806 -1.51087806 -0.91508263 -1.51087806
  -1.51087806 -1.51087806 -1.51087806  1.46809911 -1.51087806 -1.51087806
  -1.51087806 -1.51087806  1.46809911 -1.51087806  0.50437382  0.50437382
   0.50437382  1.46809911  1.4680

### 5. matrix factorization

In [10]:
# Set up loss function
# C: confidence matrix
# P: binary rating matrix
# X: user latent matrix
# Y: item latent matrix
# r_lambda: regularization lambda
# xTy: predict matrix
# Total_loss = (confidence_level * predict loss) + regularization loss

def loss_function(C, P, xTy, X, Y, r_lambda):
    predict_error = np.square(P - xTy)
    confidence_error = np.sum(C * predict_error)
    regularization = r_lambda * (np.sum(np.square(X)) + np.sum(np.square(Y)))
    total_loss = confidence_error + regularization
    return np.sum(predict_error), confidence_error, regularization, total_loss


# Optimization Function for user and item
# X[u] = (yTCuy + lambda*I)^-1yTCuy
# Y[i] = (xTCix + lambda*I)^-1xTCix
# two formula is the same when it changes X to Y and u to i

def optimize_user(X, Y, C, P, nu, nf, r_lambda):
    yT = np.transpose(Y)
    for u in range(nu):
        Cu = np.diag(C[u])
        yT_Cu_y = np.matmul(np.matmul(yT, Cu), Y)
        lI = np.dot(r_lambda, np.identity(nf))
        yT_Cu_pu = np.matmul(np.matmul(yT, Cu), P[u])
        X[u] = np.linalg.solve(yT_Cu_y + lI, yT_Cu_pu)

def optimize_item(X, Y, C, P, ni, nf, r_lambda):
    xT = np.transpose(X)
    for i in range(ni):
        Ci = np.diag(C[:, i])
        xT_Ci_x = np.matmul(np.matmul(xT, Ci), X)
        lI = np.dot(r_lambda, np.identity(nf))
        xT_Ci_pi = np.matmul(np.matmul(xT, Ci), P[:, i])
        Y[i] = np.linalg.solve(xT_Ci_x + lI, xT_Ci_pi)

In [11]:
# initialize parameters
r_lambda = 40   # normalization parameter
nf = 200        # dimension of latent vector of each user and item
alpha = 40      # confidence level

# R as the dataset (array)

# initialize user and item latent factor matrix
nu = R.shape[0] # number of users
ni = R.shape[1] # number of items

# initialize X and Y with very small values
X = np.random.rand(nu, nf) * 0.01
Y = np.random.rand(ni, nf) * 0.01

# initialize Binary rating matrix P: convert original rating matrix R to P
P = np.copy(R)
P[P > 0] = 1

# Initialize Confidence Matrix C
C = 1 + alpha * R   # confidence level of certain rating data

# Train
predict_errors = []
confidence_errors = []
regularization_list = []
total_losses = []

EPOCH = 15
for i in tqdm(range(EPOCH)):
    if i != 0:
        optimize_user(X, Y, C, P, nu, nf, r_lambda)
        optimize_item(X, Y, C, P, ni, nf, r_lambda)
    predict = np.matmul(X, np.transpose(Y))
    predict_error, confidence_error, regularization, total_loss = loss_function(C, P, predict, X, Y, r_lambda)

    predict_errors.append(predict_error)
    confidence_errors.append(confidence_error)
    regularization_list.append(regularization)
    total_losses.append(total_loss)

    print('\n ----------------step %d----------------' % int(i+1))
    print("predict error: %f" % predict_error)
    print("confidence error: %f" % confidence_error)
    print("regularization: %f" % regularization)
    print("total loss: %f" % total_loss)

predict = np.matmul(X, np.transpose(Y))
#print('final predict')
#np.set_printoptions(threshold=sys.maxsize)
print([predict])
np.savetxt('aftermf.csv',predict, fmt= '%f', delimiter=",")


 20%|██        | 3/15 [00:00<00:00, 21.91it/s]


 ----------------step 1----------------
predict error: 153.730861
confidence error: 2469.925943
regularization: 13.105357
total loss: 2483.031300

 ----------------step 2----------------
predict error: 5308.611227
confidence error: -15131.056820
regularization: 16694.930722
total loss: 1563.873902

 ----------------step 3----------------
predict error: 392.761815
confidence error: -2344.837345
regularization: 4998.754065
total loss: 2653.916720


 40%|████      | 6/15 [00:00<00:00, 16.25it/s]


 ----------------step 4----------------
predict error: 395.546212
confidence error: -177.572482
regularization: 12096.394539
total loss: 11918.822057

 ----------------step 5----------------
predict error: 6152.151986
confidence error: -53295.668676
regularization: 53429.964587
total loss: 134.295911

 ----------------step 6----------------
predict error: 5010.392577
confidence error: -8281.786507
regularization: 1753268.424752
total loss: 1744986.638245


 53%|█████▎    | 8/15 [00:00<00:00, 14.99it/s]


 ----------------step 7----------------
predict error: 1536.423575
confidence error: -23965.926625
regularization: 25022.263914
total loss: 1056.337289

 ----------------step 8----------------
predict error: 2159.054492
confidence error: -8637.092274
regularization: 41933.061993
total loss: 33295.969719

 ----------------step 9----------------
predict error: 2297.838868
confidence error: -31226.467203
regularization: 319492.374646
total loss: 288265.907443


 80%|████████  | 12/15 [00:00<00:00, 13.33it/s]


 ----------------step 10----------------
predict error: 15169.064187
confidence error: -75784.107329
regularization: 67383.824983
total loss: -8400.282346

 ----------------step 11----------------
predict error: 15064.248587
confidence error: -114716.832541
regularization: 144575.763111
total loss: 29858.930570

 ----------------step 12----------------
predict error: 109260.815996
confidence error: -2349428.471630
regularization: 2288759.735507
total loss: -60668.736124


100%|██████████| 15/15 [00:01<00:00, 14.08it/s]


 ----------------step 13----------------
predict error: 8154.829593
confidence error: -34606.702105
regularization: 264633.785665
total loss: 230027.083559

 ----------------step 14----------------
predict error: 940.241540
confidence error: -1650.082428
regularization: 3751.564405
total loss: 2101.481977

 ----------------step 15----------------
predict error: 562.704767
confidence error: -3253.932797
regularization: 55569.587856
total loss: 52315.655059
[array([[ 9.66412655e-01,  1.03026327e+00,  9.95545491e-01,
         9.89961547e-01,  9.89961547e-01,  3.49464348e+00,
         1.82904463e+00, -5.01731463e-01,  6.47027008e-01,
        -2.98338135e-01,  1.01434112e+00, -4.96371657e-01,
        -4.96371657e-01,  1.00195217e+00,  1.00195217e+00,
         9.95911435e-01, -4.39476927e-01, -4.27187893e-01,
        -5.06136849e-01, -4.20922192e-01,  1.00195217e+00,
        -4.64466581e-01,  9.61086539e-01,  1.00195217e+00,
         1.00195217e+00,  1.00195217e+00, -5.32374585e-01,
       




### 6. sorting


In [12]:
# return index of sorting list
sorted_result = [[0 for col in range(predict.shape[1])] for row in range(predict.shape[0])]   # null matrix
for i in range(predict.shape[0]):
    sorted_result[i] = sorted(range(len(predict[i])), key=lambda k: predict[i][k])

sorted_result=np.array(sorted_result)
#print("\n** sorted_result **")
#print(sorted_result)

#return item code
sorted_result_item = [[0 for col in range(predict.shape[1])] for row in range(predict.shape[0])]   # null matrix
for i in range(predict.shape[0]):
    for j in range(predict.shape[1]):
        #print(itemcode[sorted_result[i][j]])
        sorted_result_item[i][j]= itemcode[sorted_result[i][j]]

sorted_result_item = np.array(sorted_result_item)
#print("\n** item code **")
#print(sorted_result_item)

##push 'sorted_result_item' into DB

In [14]:
#("고객코드","제품코드" 나열하여 확인)
customercode = np.array(customercode)
customercode = customercode.reshape(len(customercode),1) #2차원 배열로 reshape
concatenate = np.concatenate((customercode, sorted_result_item), axis=1) #열방향 (좌 -> 우)로 연결
#print(hstack)
np.savetxt('/content/drive/MyDrive/worksoutProject/sample.csv',concatenate, fmt= '%s', delimiter=",")

### 7. select top-N


In [None]:
##pull 'sorted_result_item' from DB

#1.1. array에서 raffle 상품 제외
#1.2. array에서 현재 판매하지 않는 상품 제외
#1.3. 구매했었던 상품 제외
#2. 앞에서부터 20개 출력
#2.1. 만약 20개가 없다면 그 전까지만 출력하기
#3.