# In [None]:
import copy
import os

import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt

# Exclusive upper bound on the Markov input length: the loops in this file use
# range(1, maxlength), so input lengths 1 .. maxlength-1 are built and evaluated.
maxlength = 3

# each markov entry: state input -> (int total number, dict articles -> purchase count)
def buildMarkovSequential(customerTransactions, n, markov):
	"""Accumulate transition counts keyed by the ordered last n purchases.

	Args:
		customerTransactions: dict mapping a customer to the list of articles
			they bought. The final purchase of every list is skipped here.
		n: number of preceding purchases that make up a state.
		markov: dict updated in place. Each entry maps a state tuple to
			[times the state was seen, {article: times it followed the state}].

	The state starts as n zeros and slides forward one purchase at a time.
	"""
	print("Sequential {}".format(n))
	for purchases in customerTransactions.values():
		window = [0] * n
		for article in purchases[:-1]:
			entry = markov.setdefault(tuple(window), [0, {}])
			entry[0] += 1
			followers = entry[1]
			followers[article] = followers.get(article, 0) + 1

			# Slide the window. Guarding on n keeps a zero-length window empty
			# instead of letting it grow to one element after the first step.
			if n > 0:
				window = window[1:]
				window.append(article)

def buildMarkovSet(customerTransactions, n, markov):
	"""Accumulate transition counts keyed by the last n purchases, order ignored.

	Works like a sliding-window count over each purchase list, except that the
	window is sorted before it is used as a key, so the same n articles bought
	in a different order share one state.

	Args:
		customerTransactions: dict mapping a customer to the list of articles
			they bought. The final purchase of every list is skipped here.
		n: number of preceding purchases that make up a state.
		markov: dict updated in place. Each entry maps a sorted state tuple to
			[times the state was seen, {article: times it followed the state}].
	"""
	print("Set {}".format(n))
	for purchases in customerTransactions.values():
		# The window keeps purchase order; only the lookup key is sorted.
		window = [0] * n
		for article in purchases[:-1]:
			entry = markov.setdefault(tuple(sorted(window)), [0, {}])
			entry[0] += 1
			followers = entry[1]
			followers[article] = followers.get(article, 0) + 1

			# Slide the window. Guarding on n keeps a zero-length window empty
			# instead of letting it grow to one element after the first step.
			if n > 0:
				window = window[1:]
				window.append(article)

# Build a dictionary mapping each customer to their purchases, in file order.
print("Loading Customer Transactions")

transactions = pd.read_csv('transactions25short.csv')

print("Clustering")

customerTransactions = {}
# Iterate the two columns in lockstep rather than doing a per-cell .at[] lookup
# for every row; the row counter is kept only for the progress output.
for i, (cust, article) in enumerate(zip(transactions['customer_id'], transactions['article_id'])):
	if i % 100000 == 0:
		print(i)
	customerTransactions.setdefault(cust, []).append(article)
customerIDs = customerTransactions.keys()

print("Customer Transaction Data Loaded {}".format(len(customerTransactions.keys())))

# One table per input length 1 .. maxlength-1; index 0 is an unused placeholder.
# Each slot needs its own dict: [{}]*maxlength would put the SAME dict object in
# every slot, so all input lengths would be written into one shared table.
sequenceMarkovs = [{} for _ in range(maxlength)]
setMarkovs = [{} for _ in range(maxlength)]
print("Building Markovs")
for i in range(1, maxlength):
	# NOTE: the sequential build is disabled, so every sequenceMarkovs[i] stays empty.
	#buildMarkovSequential(customerTransactions, i, sequenceMarkovs[i])
	buildMarkovSet(customerTransactions, i, setMarkovs[i])

def valFunc(keyValPair):
	"""Sort key: return the element at index 1 (the value of a (key, value) pair)."""
	value = keyValPair[1]
	return value


# testing, predict last entry


#each entry in Scores list: [success, fail, invalidInput]


# Index 0 of both lists is a placeholder so that list index == input length.
sequenceMarkovScores = [[0, 0, 0]]
sequenceMarkovPredictions = [[]]
print("Customers {}".format(len(customerIDs)))
for i in range(1, maxlength):
	# Evaluate the sequence model whose state is the ordered last i purchases.
	print("Testing Sequence Markov of input size {}".format(i))
	seqMarkov = sequenceMarkovs[i]
	sequenceMarkovScores.append([0, 0, 0])
	sequenceMarkovPredictions.append([])
	for customer in customerIDs:
		purchases = customerTransactions[customer]
		# The last purchase is the value to predict; the rest is the history.
		test = purchases[-1]
		history = purchases[:-1]

		# Left-pad short histories with zeros up to length i.
		missing = i - len(history)
		if missing > 0:
			query = [0] * missing + history
		else:
			query = history[-i:]
		queryTup = tuple(query)

		if queryTup in seqMarkov:
			# Keep the 12 articles that most often followed this state.
			predictions = sorted(seqMarkov[queryTup][1].items(), key=valFunc, reverse=True)[:12]
			if any(article == test for article, _ in predictions):
				sequenceMarkovScores[i][0] += 1
			else:
				sequenceMarkovScores[i][1] += 1

			# Space-terminated article ids, most frequent first.
			predictionString = "".join(str(article) + " " for article, _ in predictions)
			sequenceMarkovPredictions[i].append((customer, predictionString))
		else:
			# State not present in the table: counted as invalid input.
			sequenceMarkovScores[i][2] += 1
			sequenceMarkovPredictions[i].append((customer, ""))

# Index 0 of both lists is a placeholder so that list index == input length.
setMarkovScores = [[0, 0, 0]]
setMarkovPredictions = [[]]
for i in range(1, maxlength):
	# Evaluate the set model whose state is the sorted last i purchases.
	print("Testing Set Markov of input size {}".format(i))
	setMarkov = setMarkovs[i]
	setMarkovScores.append([0, 0, 0])
	setMarkovPredictions.append([])
	for customer in customerIDs:
		purchases = customerTransactions[customer]
		# The last purchase is the value to predict; the rest is the history.
		test = purchases[-1]
		history = purchases[:-1]

		# Left-pad short histories with zeros up to length i.
		missing = i - len(history)
		if missing > 0:
			query = [0] * missing + history
		else:
			query = history[-i:]
		# Sort a copy of the query so purchase order does not affect the key.
		setQueryTup = tuple(sorted(query))

		if setQueryTup in setMarkov:
			# Keep the 12 articles that most often followed this state.
			predictions = sorted(setMarkov[setQueryTup][1].items(), key=valFunc, reverse=True)[:12]
			if any(article == test for article, _ in predictions):
				setMarkovScores[i][0] += 1
			else:
				setMarkovScores[i][1] += 1

			# Space-terminated article ids, most frequent first.
			predictionString = "".join(str(article) + " " for article, _ in predictions)
			setMarkovPredictions[i].append((customer, predictionString))
		else:
			# State not present in the table: counted as invalid input.
			setMarkovScores[i][2] += 1
			setMarkovPredictions[i].append((customer, ""))


# Write the per-customer predictions of every evaluated input length to CSV.
# to_csv does not create missing parent directories, so make sure the output
# folder exists before writing; otherwise the whole run's results are lost.
os.makedirs('markovPredictions', exist_ok=True)
for i in range(1, maxlength):
	predictionList = pd.DataFrame(data=sequenceMarkovPredictions[i], columns=['customer_id', 'prediction'])
	predictionList.to_csv('markovPredictions/seqMarkovPredictions{}.csv'.format(i))
	predictionList = pd.DataFrame(data=setMarkovPredictions[i], columns=['customer_id', 'prediction'])
	predictionList.to_csv('markovPredictions/setMarkovFullPredictions{}.csv'.format(i))

# Write the hit / miss / unknown-state counts of both model families to a text file.
# The context manager closes the file even if one of the writes raises.
with open('markovResultsFull.txt', 'w') as file:
	file.write("Sequence Markov Results:\n")
	for i in range(1,maxlength):
		file.write("inputLength: {}  | predicted: {}  | missed: {}  | badInput: {}\n".format(i, sequenceMarkovScores[i][0], sequenceMarkovScores[i][1], sequenceMarkovScores[i][2]))
	file.write("\nSet Markov Results:\n")
	for i in range(1,maxlength):
		file.write("inputLength: {}  | predicted: {}  | missed: {}  | badInput: {}\n".format(i, setMarkovScores[i][0], setMarkovScores[i][1], setMarkovScores[i][2]))







