In [1]:
import os

import numpy as np
import pandas as pd
import psycopg2
import psycopg2.extras

In [2]:
# Connection settings are read from BOOKSTORE_DB_* environment variables so that
# credentials do not have to be edited into the notebook. Each setting falls
# back to the value this notebook has always used, so behavior is unchanged
# when the variables are unset.
# NOTE(review): the fallback password is still stored in plain text here —
# drop the defaults once the environment variables are configured.
conn = psycopg2.connect(
    dbname=os.environ.get('BOOKSTORE_DB_NAME', 'bookstore_pr'),
    user=os.environ.get('BOOKSTORE_DB_USER', 'student'),
    host=os.environ.get('BOOKSTORE_DB_HOST', '132.249.238.27'),
    password=os.environ.get('BOOKSTORE_DB_PASSWORD', '123456'),
)
# Single cursor shared by the load cells further down.
cur = conn.cursor()

# Load Data Files

In [3]:
# Extracts: product ids plus per-product attributes. The cells below join
# these arrays to the ASIN list by row position.
asin = np.load('../../data/extracts/asin.npy')
categories_indexed = np.load('../../data/extracts/categories_indexed.npy')
season_price_instock = np.load('../../data/extracts/season_price_instock_indexed.npy')

# Derived: per-product rating metric and the general co-occurrence matrix.
asin_rating = np.load('../../data/derived/rating_indexed.npy')
ccm = np.load('../../data/derived/ccm_general.npy')

# Derived: co-occurrence matrices for clusters 1, 3, 4 and 7.
ccm1 = np.load('../../data/derived/cluster1_coo_matrix.npy')
ccm3 = np.load('../../data/derived/cluster3_coo_matrix.npy')
ccm4 = np.load('../../data/derived/cluster4_coo_matrix.npy')
ccm7 = np.load('../../data/derived/cluster7_coo_matrix.npy')

# Prepare ASIN, Category, and Seasonal Information

In [4]:
# The ASIN array holds bytes; decode each entry to str for use as a join key.
asin_df = pd.DataFrame({'asin': [raw.decode() for raw in asin]})

# One frame per entry of categories_indexed: ASIN plus its five category
# levels, keeping only rows whose level values do not sum to zero.
cat_dfs = []
for level_matrix in categories_indexed:
    joined = asin_df.join(
        pd.DataFrame(level_matrix, columns=['lvl1', 'lvl2', 'lvl3', 'lvl4', 'lvl5'])
    )
    has_category = joined.drop(columns='asin').values.sum(axis=1) != 0
    cat_dfs.append(joined[has_category].reset_index(drop=True))

# Seasonal flags, price and stock status, row-aligned with asin_df.
season_df = pd.DataFrame(
    season_price_instock,
    columns=['spring', 'summer', 'fall', 'winter', 'fullprice', 'isinstock'],
)

# Content-Based

In [5]:
# Per-product base table: ASIN, rating metric, then the seasonal/price/stock
# columns (all joined by row position).
content_based_df = asin_df \
                    .join(pd.DataFrame(asin_rating, columns=['metric'])) \
                    .join(season_df)

# Attach the category levels from every frame in cat_dfs (inner join on ASIN)
# and stack the results. DataFrame.append was removed in pandas 2.0 and copied
# the accumulated frame on every iteration, so the pieces are built first and
# concatenated once. As with append, each piece keeps its own 0..k-1 index.
content_by_asin = content_based_df.set_index('asin')
content_based_cat_df = pd.concat([
    content_by_asin.join(cat_df.set_index('asin'), how='inner').reset_index()
    for cat_df in cat_dfs
])

In [6]:
# Replace the contents of content_based in one transaction. `with conn`
# commits when the block finishes and rolls back if it raises, so a failed
# insert neither empties the table nor leaves the connection stuck in an
# aborted transaction. The connection itself stays open.
with conn:
    cur.execute("truncate content_based;")
    psycopg2.extras.execute_batch(
        cur,
        'INSERT INTO content_based VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)',
        content_based_cat_df.values.tolist(),
    )

# Collaborative

In [7]:
def ccm_to_df(ccm, cluster, asin_df, season_df, cat_dfs):
    """Flatten a co-occurrence matrix into one row per (asin, asin_other) pair.

    Parameters
    ----------
    ccm : 2-D array-like
        Co-occurrence matrix; entry ``[i][x]`` relates product ``i`` to
        product ``x``. Only strictly positive entries produce rows.
    cluster : scalar
        Value written to the ``demo_region`` column of every row.
    asin_df : pandas.DataFrame
        Frame with an ``asin`` column whose row order matches the rows and
        columns of ``ccm``.
    season_df : pandas.DataFrame
        Seasonal/price/stock columns, row-aligned with ``asin_df``.
    cat_dfs : list of pandas.DataFrame
        Frames with an ``asin`` column plus ``lvl1``..``lvl5``; each is
        inner-joined against ``asin_other`` and the results are stacked.

    Returns
    -------
    pandas.DataFrame
        Columns ``asin, asin_other, metric, spring, summer, fall, winter,
        fullprice, isinstock, lvl1..lvl5, demo_region, demo_gender``.
        ``demo_gender`` is always 0.
    """
    # Resolve ids from the asin_df argument rather than a notebook-level
    # global, so the function only depends on what it is given.
    asin_values = asin_df['asin'].tolist()

    pairs = []
    for i in range(len(ccm)):
        row = ccm[i]
        # np.where is evaluated once per row; a row without positive entries
        # simply contributes no pairs.
        for x in np.where(row > 0)[0]:
            pairs.append([asin_values[i], asin_values[x], row[x]])

    info_df = asin_df.join(season_df)
    ccm_df = pd.DataFrame(pairs, columns=['asin', 'asin_other', 'metric'])

    # Category levels describe the *other* product, so join on asin_other.
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    pairs_by_other = ccm_df.set_index('asin_other')
    ccm_cat_df = pd.concat([
        pairs_by_other.join(cat_df.set_index('asin'), how='inner')
        for cat_df in cat_dfs
    ])

    # Seasonal/price/stock columns are also looked up for the other product.
    ccm_cat_df = ccm_cat_df.join(info_df.set_index('asin'))
    # The joins above drop the index name; restore it before reset_index.
    ccm_cat_df.index.names = ['asin_other']

    ordered_columns = ['asin', 'asin_other', 'metric',
                       'spring', 'summer', 'fall', 'winter', 'fullprice', 'isinstock',
                       'lvl1', 'lvl2', 'lvl3', 'lvl4', 'lvl5']
    # assign() returns a new frame instead of writing into a column slice.
    return ccm_cat_df.reset_index()[ordered_columns].assign(
        demo_region=cluster,
        demo_gender=0,
    )

In [8]:
# (cluster id, co-occurrence matrix) pairs; cluster 0 is the general matrix.
cluster_matrices = [(0, ccm), (1, ccm1), (3, ccm3), (4, ccm4), (7, ccm7)]

# Flatten each matrix and stack the results in the order listed above.
# pd.concat replaces the chained DataFrame.append calls, which were removed
# in pandas 2.0; like append, it keeps each piece's own index.
cooccurrence = pd.concat([
    ccm_to_df(matrix, cluster, asin_df, season_df, cat_dfs)
    for cluster, matrix in cluster_matrices
])

In [9]:
# Replace the contents of cooccurrence in one transaction. `with conn`
# commits when the block finishes and rolls back if it raises, so a failed
# insert neither empties the table nor leaves the connection stuck in an
# aborted transaction. The connection itself stays open.
with conn:
    cur.execute("truncate cooccurrence;")
    psycopg2.extras.execute_batch(
        cur,
        'INSERT INTO cooccurrence VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)',
        cooccurrence.values.tolist(),
    )

In [26]:
# Release the cursor before closing the connection it belongs to.
cur.close()
conn.close()

# Testing