In [1]:
# connect
import psycopg2
import json
import numpy as np
import time
# Example of the connection string that get_connect_str() reads (stored as a JSON string on its own line):
# "dbname='sqlbook' user='postgres' host='localhost' password='xxxx'"

In [2]:
def get_connect_str(filename):
    """Read a database connection string from a file of JSON values.

    Every non-blank line of ``filename`` is decoded with ``json.loads`` and
    the value decoded from the last such line is returned.

    Args:
        filename: Path of the text file to read.

    Returns:
        The decoded value of the last non-blank line.

    Raises:
        IOError / OSError: If the file cannot be opened.
        ValueError: If a line is not valid JSON, or if the file contains no
            non-blank line at all.
    """
    connstr = None
    found = False

    # ``with`` guarantees the handle is closed even if json.loads raises.
    with open(filename, 'r') as inputfile:
        for line in inputfile:
            line = line.strip()
            if not line:
                # Tolerate blank lines, e.g. an extra newline at end of file.
                continue
            connstr = json.loads(line)
            found = True

    if not found:
        raise ValueError('no connection string found in %r' % (filename,))
    return connstr

In [3]:
def db_connect(connstr):
    """Open a database session in autocommit mode and hand back a cursor.

    Args:
        connstr: Connection string passed unchanged to ``psycopg2.connect``.

    Returns:
        A cursor created on the new connection. Note that the cursor, not the
        connection object, is what the caller receives.
    """
    connection = psycopg2.connect(connstr)
    # Autocommit: every statement is committed as soon as it executes.
    connection.set_session(autocommit=True)
    cursor = connection.cursor()
    return cursor

In [4]:
# db_connect() returns a cursor, so both names below hold cursors rather than
# connection objects. Both currently read the same 'connectstr' file.
localconn = db_connect(get_connect_str('connectstr'))
ucsdconn = db_connect(get_connect_str('connectstr'))
# Disabled alternative that reads a separate 'connucsd' file instead:
#ucsdconn = db_connect(get_connect_str('connucsd'))

In [5]:
def get_cust_item_list(cursor):
    """Collect the distinct ASINs bought by each customer.

    Runs a join over customers, orders, orderlines and products and groups
    the resulting (customerid, asin) rows.

    Args:
        cursor: DB-API cursor; only ``execute`` and ``fetchall`` are used.

    Returns:
        A ``(freq, items)`` tuple where ``freq`` maps each customerid to a
        list of the distinct ASINs on that customer's orders, and ``items``
        is a list of every distinct ASIN seen. Neither list has a defined
        order (both come from sets).
    """
    sql = """SELECT c.customerid, pr.asin
          FROM customers c, orders o, orderlines ol, products pr
          WHERE c.customerid = o.customerid
          and ol.orderid = o.orderid
          and pr.productid = ol.productid
          order by c.customerid """

    cursor.execute(sql)

    freq = dict()
    all_items = set()

    for customerid, asin in cursor.fetchall():
        # Accumulate into sets so duplicates are dropped on insert instead of
        # rebuilding list(set(...)) for the customer on every fetched row.
        freq.setdefault(customerid, set()).add(asin)
        all_items.add(asin)

    # Convert the per-customer sets to lists once, in place (only values are
    # replaced, so iterating the dict while doing this is safe).
    for customerid in freq:
        freq[customerid] = list(freq[customerid])

    return freq, list(all_items)

In [6]:
# customers: dict of customerid -> list of ASINs; items: list of every distinct ASIN.
customers, items = get_cust_item_list(ucsdconn)

In [7]:
def create_customer_matrix(freq, items):
    """Build a 0/1 customer-by-item purchase matrix.

    Args:
        freq: Dict mapping a customer id to the list of items that customer
            bought.
        items: List of items; the position of an item in this list is its
            column index (the first position wins if an item is repeated).

    Returns:
        A ``(m, lfreq)`` tuple. ``m`` is a float array of shape
        ``(len(freq), len(items))`` with ``m[r, c] == 1`` when customer
        ``lfreq[r]`` bought ``items[c]``. ``lfreq`` is the list of customer
        ids in row order, which is the iteration order of ``freq``.

    Raises:
        ValueError: If a customer's list contains an item missing from
            ``items``.
    """
    # Materialise the keys: on Python 3 dict.keys() is a view and cannot be
    # indexed, and callers index the returned customer list by row number.
    lfreq = list(freq.keys())

    m = np.zeros((len(lfreq), len(items)))

    # Pre-compute item -> column once instead of a linear items.index() scan
    # for every single purchase.
    col_of = {}
    for col, item in enumerate(items):
        col_of.setdefault(item, col)

    for r, customer in enumerate(lfreq):
        for item in freq[customer]:
            try:
                col = col_of[item]
            except KeyError:
                # Same exception type that list.index() raises.
                raise ValueError('%r is not in items' % (item,))
            m[r, col] = 1

    return m, lfreq

In [8]:
# One row per customer and one column per ASIN; custlist holds the customer id of each row.
cust_item_matrix, custlist = create_customer_matrix(customers, items)

In [9]:
# Persist the customer-by-item matrix to disk.
np.save('cust_item_matrix.npy', cust_item_matrix)

In [10]:
# Persist the ASIN list; its order is the column order of cust_item_matrix.
np.save('asin.npy',items)

In [11]:
# Sanity check: (number of customers, number of distinct ASINs).
cust_item_matrix.shape

(189559, 3990)

In [12]:
def symmetric_matrix(matrix, items):
    """Build an item-by-item matrix of column sums over co-occurring rows.

    For every column ``c``, the rows of ``matrix`` whose entry in column
    ``c`` is positive are summed column-wise; that sum becomes row ``c`` of
    the result, with the diagonal entry forced to zero.

    Args:
        matrix: 2-D array (rows x columns).
        items: Not used by this function.

    Returns:
        A float array of shape ``(columns, columns)``.
    """
    _, n_cols = matrix.shape
    out = np.zeros((n_cols, n_cols))

    for col in range(n_cols):
        has_item = matrix[:, col] > 0          # rows with a positive entry in this column
        totals = matrix[has_item].sum(axis=0)  # column-wise sum over those rows
        totals[col] = 0                        # drop the item's count against itself
        out[col] = totals

    return out

In [13]:
# Item-by-item matrix derived from the customer-by-item matrix.
gen_matrix = symmetric_matrix(cust_item_matrix, items)

In [14]:
# Expect a square matrix with one row and one column per ASIN.
gen_matrix.shape

(3990, 3990)

In [15]:
# Persist the item-by-item matrix to disk.
np.save('gen_matrix.npy', gen_matrix)

In [16]:
def get_cust_item_demo(cursor, custlist):
    """Fetch a (region, gender code) pair for every row of the demo query.

    The query joins customers, orders and regions and is ordered by
    customerid. Gender is encoded as 1 for 'M', 2 for 'F' and 0 otherwise.

    Args:
        cursor: DB-API cursor; only ``execute`` and ``fetchall`` are used.
        custlist: Sequence of customer ids, compared position by position
            with the customerid of each query row.

    Returns:
        A float array of shape ``(number of query rows, 2)`` whose columns
        are region and gender code.

    Raises:
        IndexError: If the query returns more rows than ``custlist`` has
            entries.
    """
    sql = """SELECT c.customerid, r.region, 
          case when c.gender='M' then 1 
          when c.gender='F' then 2 
          else 0 end as gender
          FROM customers c, orders o, regions r
          WHERE c.customerid = o.customerid
          and o.state = r.state
          order by c.customerid"""

    cursor.execute(sql)
    results = cursor.fetchall()
    m = np.zeros((len(results), 2))

    for i, row in enumerate(results):
        if row[0] != custlist[i]:
            # Call form with a single string prints identically on
            # Python 2 and Python 3; the statement form is Python 2 only.
            print('error: cust not matching')

        m[i] = np.array(row[1:])

    return m

In [17]:
# Region and gender code per query row; rows are compared with custlist by position.
demo_matrix = get_cust_item_demo(localconn, custlist)

In [18]:
# Expect one row per query row and two columns (region, gender code).
demo_matrix.shape

(189559, 2)

In [19]:
# Persist the region/gender matrix to disk.
np.save('demo_matrix.npy', demo_matrix)

In [20]:
def get_matrix(filename):
    """Load an array from a file written with ``np.save``.

    Args:
        filename: Path of the ``.npy`` file.

    Returns:
        Whatever ``np.load`` returns for that file.
    """
    # Hand the path to NumPy so it opens the file in binary mode and closes
    # it itself. A handle from a plain open() is text-mode, which np.load
    # cannot read on Python 3, and it would also be left open.
    return np.load(filename)

In [21]:
# Round-trip check: reload the saved ASIN list from disk.
asincopy = get_matrix('asin.npy')

In [22]:
# Number of ASINs read back from asin.npy.
len(asincopy)

3990

In [23]:
# Display the reloaded ASIN array.
asincopy

array(['0007176953', '0002160587', '0006380905', ..., '0007244215',
       '0007106912', '0002005387'],
      dtype='|S10')

In [24]:
# Round-trip check: reload the item-by-item matrix and show its shape.
gen_matrix_copy = get_matrix('gen_matrix.npy')
gen_matrix_copy.shape

(3990, 3990)

In [25]:
# Round-trip check: reload the region/gender matrix and show its shape.
demo_matrix_copy = get_matrix('demo_matrix.npy')
demo_matrix_copy.shape

(189559, 2)

In [26]:
def get_cust_cat(cursor,items):
    """Build a (layer, item, level) array of category levels per ASIN.

    Two queries are run. The first finds the largest number of ``categories``
    rows that share one nodeid; that value is the number of layers. The
    second returns the distinct (asin, nodeid, level_1..level_5) rows for
    ordered products.

    The n-th row returned for a given ASIN is stored in layer ``n`` (counting
    from 0), so each ASIN's rows always start at layer 0. Should one ASIN
    return more rows than there are layers, the layer index wraps around and
    the later row overwrites the earlier one.

    Args:
        cursor: DB-API cursor; only ``execute`` and ``fetchall`` are used.
        items: List of ASINs; the position of an ASIN in this list is its
            index along the second axis (first position wins on repeats).

    Returns:
        A float array of shape ``(layers, len(items), 5)``; slots that were
        never written stay zero.

    Raises:
        ValueError: If the query returns an ASIN that is not in ``items``.
    """
    sql = """select max(versions) from (
          select nodeid,count(*) as versions
          from categories group by nodeid
          ) as temp"""

    cursor.execute(sql)
    vrs = cursor.fetchall()
    # max() over an empty set comes back as NULL/None; treat it as 0 layers.
    maxval = (vrs[0][0] if vrs else 0) or 0

    sql = """select distinct pr.asin, g.nodeid,level_1,level_2,level_3,level_4,level_5
          FROM customers c, orders o, orderlines ol, products pr, categories g
          WHERE c.customerid = o.customerid
          and o.orderid = ol.orderid
          and ol.productid = pr.productid
          and pr.nodeid = g.nodeid
          order by pr.asin,g.nodeid,level_2,level_3,level_4,level_5"""

    cursor.execute(sql)
    results = cursor.fetchall()
    m = np.zeros((maxval,len(items),5))

    if maxval == 0:
        # No layers to write into.
        return m

    # Pre-compute ASIN -> position once instead of scanning items per row.
    col_of = {}
    for col, item in enumerate(items):
        col_of.setdefault(item, col)

    # Number of rows already stored for each ASIN. Counting per ASIN (rather
    # than with one running counter across all rows) keeps an ASIN's layer
    # independent of how many rows the preceding ASINs happened to have.
    stored = {}

    for row in results:
        asin = row[0]
        try:
            col = col_of[asin]
        except KeyError:
            # Same exception type that list.index() raises.
            raise ValueError('%r is not in items' % (asin,))

        version = stored.get(asin, 0)
        stored[asin] = version + 1

        m[version % maxval][col] = row[2:]

    return m

In [27]:
# Category levels per ASIN, indexed along the item axis in the order of `items`.
cat_matrix = get_cust_cat(localconn, items)

In [28]:
# Expect (layers, number of ASINs, 5 category levels).
cat_matrix.shape

(7, 3990, 5)

In [29]:
# Persist the category matrix to disk.
np.save('cat_matrix.npy', cat_matrix)

In [30]:
# Round-trip check: reload the category matrix and show its shape.
cat_matrix_copy = get_matrix('cat_matrix.npy')
cat_matrix_copy.shape

(7, 3990, 5)