# Question 1:

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

def findProductInfo(dataSet, selectedProduct):
    """Sample one random row and return that product's (point, cost) pair.

    `selectedProduct` is a zero-based product index; the columns read are
    'Product <k>' and 'cost <k>' with k = selectedProduct + 1.  The row label
    is drawn with np.random.randint(1, 498), i.e. from 1..497, so both columns
    must be indexable by those labels.
    """
    rowLabel = np.random.randint(1, 498)
    productNumber = str(selectedProduct + 1)
    pointColumn = dataSet['Product ' + productNumber]
    costColumn = dataSet['cost ' + productNumber]
    return pointColumn[rowLabel], costColumn[rowLabel]


def findProductValue(dataSet, selectedProduct):
    """Return the value of one random sample of a product: 2.5 * point - cost.

    The (point, cost) pair comes from a single findProductInfo call, so every
    call draws a fresh random row.
    """
    point, cost = findProductInfo(dataSet, selectedProduct)
    value = 2.5 * point - cost
    return value

def isInInterval(interval, num):
    """Return True when `num` is strictly between interval[0] and interval[1]
    or equal to either of those two endpoints; otherwise return False.
    """
    lower, upper = interval[0], interval[1]
    strictlyInside = lower < num < upper
    onBoundary = num == lower or num == upper
    return bool(strictlyInside or onBoundary)


class RLComp(object):
    """Preference-based multi-armed bandit learner over the products of a data set.

    Each arm is one product.  An arm's preference moves by
    beta * (reward - rBar), where rBar is a running reference reward updated
    with step size alpha.  Preferences are turned into selection probabilities
    by either an epsilon-greedy or a soft-max policy.  Every per-arm
    computation uses `numArms`, so the class works for any number of arms
    (it previously assumed exactly six in several methods).
    """

    def __init__(self, __alpha, __beta, __policy, __policyPar, numArms, __dataSet):
        """Store the hyper-parameters and zero all per-arm statistics.

        Args:
            __alpha: step size of the reference-reward (rBar) update.
            __beta: step size of the preference update.
            __policy: 'ep-greedy' selects epsilon-greedy; any other value
                selects soft-max.
            __policyPar: epsilon for 'ep-greedy', temperature for soft-max.
                findBestArm overwrites it on every trial via setPolicyPar.
            numArms: number of arms (products).
            __dataSet: table handed to findProductValue to sample rewards.
        """
        self.numArms = numArms
        self.alpha = __alpha
        self.beta = __beta
        self.policy = __policy
        self.policyPar = __policyPar
        self.numTrials = 0                    # total trials run so far
        self.numSelectedArms = [0] * numArms  # times selectArm returned each arm
        self.sampleRewards = [0] * numArms    # summed reward per arm
        self.regret = []                      # one entry per trial
        self.avgRegret = []                   # one entry per episode
        self.prefArms = [0] * numArms         # arm preferences
        self.probArms = [0] * numArms         # arm selection probabilities
        self.rBar = 0                         # reference reward
        self.dataSet = __dataSet

    def updateArmPref(self, selectedArm, armValue):
        """Move the preference of `selectedArm` by beta * (armValue - rBar)."""
        self.prefArms[selectedArm] = self.prefArms[selectedArm] + self.beta * (armValue - self.rBar)

    def updateRBar(self, armValue):
        """Move the reference reward towards `armValue` with step size alpha."""
        self.rBar = self.rBar + self.alpha * (armValue - self.rBar)

    def updateArmProbEGreedy(self):
        """Set epsilon-greedy probabilities with epsilon = policyPar.

        The first arm with the highest preference gets
        1 - epsilon + epsilon / numArms; every other arm gets
        epsilon / numArms.
        """
        greedyArm = self.prefArms.index(max(self.prefArms))
        exploreShare = self.policyPar / self.numArms
        for i in range(self.numArms):
            self.probArms[i] = exploreShare
        self.probArms[greedyArm] = 1 - self.policyPar + exploreShare

    def updateArmProbSoftMax(self):
        """Set soft-max probabilities with temperature policyPar.

        probArms[i] is proportional to exp(prefArms[i] / policyPar).
        """
        temperature = self.policyPar
        scaled = [pref / temperature for pref in self.prefArms]
        # Subtracting the largest exponent leaves the ratios unchanged but
        # keeps exp() from overflowing to inf, which would turn every
        # probability into NaN and make selectArm spin forever.
        top = max(scaled)
        weights = [np.exp(value - top) for value in scaled]
        normalFactor = sum(weights)

        for i in range(self.numArms):
            self.probArms[i] = weights[i] / normalFactor

    def updateArmProb(self):
        """Refresh probArms using the configured policy."""
        if self.policy == 'ep-greedy':
            self.updateArmProbEGreedy()
        else:
            self.updateArmProbSoftMax()

    def selectArm(self):
        """Draw an arm according to probArms and count the selection.

        The unit interval is cut into consecutive segments whose lengths are
        the arm probabilities; a uniform draw picks the first segment that
        contains it.  If a draw lands in no segment (the probabilities sum to
        slightly less than one) a new draw is made.

        Returns:
            The zero-based index of the selected arm.
        """
        segments = []
        lower = 0
        for prob in self.probArms:
            upper = lower + prob
            segments.append([lower, upper])
            lower = upper

        while True:
            randomAction = np.random.uniform()
            for arm, segment in enumerate(segments):
                if isInInterval(segment, randomAction):
                    self.numSelectedArms[arm] = self.numSelectedArms[arm] + 1
                    return arm

    def setPolicyPar(self, numIter):
        """Set policyPar for trial `numIter`: 1/(numIter+1) for 'ep-greedy', else 100."""
        if self.policy == 'ep-greedy':
            self.policyPar = 1/(numIter+1)
        else:
            self.policyPar = 100

    def updateSampleRewards(self, selectedArm, armValue):
        """Add `armValue` to the summed reward of `selectedArm`."""
        self.sampleRewards[selectedArm] = self.sampleRewards[selectedArm] + armValue

    def calcAvgRegret(self, selectedArm):
        """Append one average-regret entry, using `selectedArm` as the reference arm.

        The entry is miuStar * numTrials minus the sum, over all arms that
        were selected at least once, of (mean reward) * (selection share).
        """
        miuStar = self.sampleRewards[selectedArm] / self.numSelectedArms[selectedArm]
        temp = 0
        for i in range(self.numArms):
            # Arms never returned by selectArm have no mean reward yet.
            if not self.numSelectedArms[i] == 0:
                miuJ = self.sampleRewards[i]/self.numSelectedArms[i]
                meanTJ = self.numSelectedArms[i]/self.numTrials
                temp = temp + miuJ * meanTJ

        self.avgRegret.append(miuStar*self.numTrials - temp)

    def calcRegret(self, selectedArm):
        """Append miuStar * numTrials - (total reward so far) to `regret`.

        miuStar is the mean reward of `selectedArm`.
        """
        miuStar = self.sampleRewards[selectedArm] / self.numSelectedArms[selectedArm]
        self.regret.append(miuStar*self.numTrials - sum(self.sampleRewards))

    def findBestArm(self):
        """Run 100 episodes of 100 trials, plot the regret curves and pick an arm.

        Every episode starts from a uniformly random arm.  Each trial sets the
        policy parameter, refreshes the probabilities, samples a reward for
        the current arm, updates preference / reference reward / reward sums,
        selects the next arm and records the regret.  The first episode's
        regret is plotted against n and ln(n); after all episodes the
        per-episode average regret is plotted.

        Returns:
            A one-element list holding the 1-based number of the arm that
            selectArm returned most often.
        """
        numEpisodes = 100
        numTrials = 100
        logAxis = [np.log(i) for i in range(1, numTrials + 1)]

        for j in range(numEpisodes):
            selectedArm = np.random.randint(0,self.numArms)

            for i in range(numTrials):
                self.numTrials = self.numTrials + 1
                self.setPolicyPar(i)
                self.updateArmProb()

                armValue = findProductValue(self.dataSet, selectedArm)
                self.updateArmPref(selectedArm, armValue)
                self.updateRBar(armValue)
                self.updateSampleRewards(selectedArm, armValue)
                selectedArm = self.selectArm()

                self.calcRegret(selectedArm)
            self.calcAvgRegret(selectedArm)
            if j == 0:
                print("In first episode best product is:",selectedArm+1, '\nWith regret:(the horizontal axis is n)')
                plots = plt.plot(list(range(numTrials)), self.regret)
                plt.setp(plots, 'color', 'orchid')
                plt.show()

                print("In first episode best product is:",selectedArm+1, '\nWith regret:(the horizontal axis is ln(n))')
                plots = plt.plot(logAxis, self.regret)
                plt.setp(plots, 'color', 'orchid')
                plt.show()

        plot = plt.plot([np.log(i) for i in range(1, numEpisodes + 1)], self.avgRegret)
        plt.setp(plot, 'color', 'orchid')
        plt.show()

        selected = self.numSelectedArms.index(max(self.numSelectedArms))
        return ([selected+1])
    
# Question 1 driver: load the product table from a hard-coded absolute path,
# build the learner and print the product it settles on.
dataSet = pd.read_csv(r'/home/atena/Desktop/ML/HW/2/Dataset.csv')    
# Positional arguments: alpha=0.1, beta=0.9, policy='soft-max', policyPar=1,
# numArms=6.  findBestArm overrides policyPar on every trial via setPolicyPar.
myAlgo = RLComp(0.1, 0.9, 'soft-max', 1, 6, dataSet)
# findBestArm returns a one-element list holding the 1-based product number.
result = myAlgo.findBestArm()
print('Best product is: ' ,result[0])
        

# Question 2:

In [None]:
import numpy as np

# Flat list alternating a taxi service name with its parameter list
# [noTaxiProb, meanWasteTime, varWasteTime].  takeTaxi keeps the index of a
# name (0, 2 or 4) and reads the parameters at that index + 1, passing them
# to wasteTimeCalc in this order.
taxiProp = ['Tap-Taxi', [0.1,5,3], 'Taxi-Nap', [0.3,2,2], 'Taxim', [0.7,1,1]]

def wasteTimeCalc(noTaxiProb, meanWasteTime, varWasteTime):
    """Sample the time wasted on one trip.

    A uniform draw in [0, 1) decides whether time is wasted: when the draw is
    at most `noTaxiProb`, the result is the absolute value of one normal
    sample with mean `meanWasteTime` and variance `varWasteTime`, returned as
    a length-1 array.  Otherwise the integer 0 is returned.
    """
    draw = np.random.uniform()
    if not (draw <= noTaxiProb):
        return 0
    stdWasteTime = np.sqrt(varWasteTime)
    return abs(np.random.normal(meanWasteTime, stdWasteTime, 1))
        
def _pickDifferentTaxi(currentTaxi):
    """Draw taxiProp name indices (0, 2 or 4) until one differs from `currentTaxi`."""
    candidate = currentTaxi
    while candidate == currentTaxi:
        candidate = np.random.randint(0, 3) * 2
    return candidate


def takeTaxi(sameTaxiProb, changeTaxiProb):
    """Simulate a passenger's taxi choices and return the mean wasted time per run.

    Ten runs of 1000 trips are simulated.  After a trip with no wasted time
    the passenger keeps the same service with probability `sameTaxiProb` and
    otherwise switches to a different one; after a trip with wasted time the
    passenger switches with probability `changeTaxiProb`.

    The per-run total is reset at the start of every run, so the result is
    the average of ten independent 1000-trip totals.  (Previously the total
    kept accumulating across runs, which inflated the reported average.)  The
    service in use carries over from one run to the next.

    Args:
        sameTaxiProb: probability of staying after a trip without wasted time.
        changeTaxiProb: probability of switching after a trip with wasted time.

    Returns:
        The average total wasted time over the ten runs, as a numpy float.
    """
    numRuns = 10
    tripsPerRun = 1000
    # Index of the current service's name inside taxiProp (0, 2 or 4).
    taxiName = np.random.randint(0, 3) * 2
    allWastedTime = []

    for _ in range(numRuns):
        totalWastedTime = 0.0
        for _ in range(tripsPerRun):
            noTaxiProb, meanWasteTime, varWasteTime = taxiProp[taxiName + 1]
            # wasteTimeCalc yields either the int 0 or a length-1 array;
            # normalise to a plain float so the sum is always a scalar.
            wastedTime = float(np.ravel(wasteTimeCalc(noTaxiProb, meanWasteTime, varWasteTime))[0])
            totalWastedTime = totalWastedTime + wastedTime

            decisionProb = np.random.uniform()
            if wastedTime == 0:
                switchTaxi = not (decisionProb <= sameTaxiProb)
            else:
                switchTaxi = decisionProb <= changeTaxiProb

            if switchTaxi:
                taxiName = _pickDifferentTaxi(taxiName)
        allWastedTime.append(totalWastedTime)

    return np.float64(sum(allWastedTime) / numRuns)
                


# Question 2 driver: simulate three passengers.  The two arguments of takeTaxi
# are (sameTaxiProb, changeTaxiProb).
print("Hercule Poirot's total wasted time is: ", takeTaxi(0.9, 0.9))
print("Miss Marple's total wasted time is: ", takeTaxi(0.9, 0.2))
print("Sherlock Holmes's total wasted time is: ", takeTaxi(0.3, 0.8))

# Question 3:

In [None]:
import numpy as np
import matplotlib.pyplot as plt

class UCB2(object):
    """UCB2-style path selector over three paths.

    Each path keeps a play count, a running mean of its outcomes (q-value)
    and an epoch counter.  A selected path is replayed for a number of plays
    derived from tou(r) = ceil((1 + alpha) ** r); when that budget is used up
    the path with the largest q-value + bonus is selected next.
    """

    # Value returned by setInitValue when no path is left to initialise.
    ALL_PATHS_INITIALISED = 4

    def __init__(self, __alpha):
        """Store alpha and zero the per-path statistics."""
        self.alpha = __alpha
        self.pathsCount = [0] * 3        # plays recorded per path (updateQValue)
        self.qValues = [0] * 3           # running mean outcome per path
        self.pathsRetakeCount = [0] * 3  # epoch counter per path
        self.currPath = 0                # path of the current epoch
        self.allRetakes = 0              # total plays scheduled so far

    def calcTou(self, path, increament):
        """Return ceil((1 + alpha) ** (epoch counter of `path` + `increament`))."""
        return np.ceil( (1+self.alpha) ** (self.pathsRetakeCount[path]+increament) )

    def calcBonus(self, path):
        """Return the exploration bonus of `path`.

        bonus = sqrt((1 + alpha) * ln(e * totalPlays / tou) / (2 * tou)),
        with tou = calcTou(path, 0).
        """
        tou = self.calcTou(path, 0)
        totalCount = sum(self.pathsCount)
        return np.sqrt((1+self.alpha)*np.log(np.exp(1)* totalCount / tou)/(2*tou))

    def setPath(self, path):
        """Start a new epoch on `path`.

        Makes `path` current, advances its epoch counter and adds the epoch
        length (at least one play) to the scheduled total `allRetakes`.
        """
        self.currPath = path
        self.pathsRetakeCount[path] = self.pathsRetakeCount[path] + 1
        self.allRetakes = self.allRetakes + max(1, np.ceil(self.calcTou(path, 1) - self.calcTou(path, 0)))

    def setInitValue(self):
        """Start an epoch on the first path that has never been played.

        Returns:
            The index of that path, or 4 when every path already has at least
            one recorded play.
        """
        for i in range(len(self.pathsCount)):
            if self.pathsCount[i] == 0:
                self.setPath(i)
                return i
        # Only give up after ALL paths were checked; returning from inside
        # the loop meant paths 1 and 2 never got their initial play.
        return self.ALL_PATHS_INITIALISED

    def endEpoch(self):
        """Return True while scheduled plays exceed recorded plays.

        Despite the name, True means the current epoch still has plays left:
        choosePath keeps the current path whenever this returns True.
        """
        return bool(self.allRetakes > sum(self.pathsCount))

    def calcUtilityFunc(self):
        """Return the list of q-value + bonus, one entry per path."""
        return [self.qValues[i] + self.calcBonus(i) for i in range(len(self.qValues))]

    def selectPathUtilityFunc(self):
        """Return the index of the first path with the highest utility."""
        utilityFunc = self.calcUtilityFunc()
        return utilityFunc.index(max(utilityFunc))

    def choosePath(self):
        """Return the path to play next.

        Order of precedence: a path that was never played, then the current
        path while its epoch has plays left, then the path with the highest
        utility (which starts a new epoch).
        """
        selectedPath = self.setInitValue()
        if selectedPath < self.ALL_PATHS_INITIALISED:
            return selectedPath

        if self.endEpoch():
            return self.currPath

        selectedPath = self.selectPathUtilityFunc()
        self.setPath(selectedPath)
        return selectedPath

    def updateQValue(self, selectedPath, pathOutcome):
        """Record one play of `selectedPath` and fold `pathOutcome` into its mean.

        Returns:
            The updated q-value of `selectedPath`.
        """
        self.pathsCount[selectedPath] = self.pathsCount[selectedPath] + 1
        self.qValues[selectedPath] = self.qValues[selectedPath] + (1/self.pathsCount[selectedPath]) * (pathOutcome - self.qValues[selectedPath])
        return self.qValues[selectedPath]

In [None]:
def pathOne():
    """Draw one (cost, delay) sample for the first path.

    cost is normal with mean 2 and variance 0.0625; delay is normal with
    mean 0 and variance 0.25.  Both are returned as length-1 arrays.
    """
    costStd = np.sqrt(0.0625)
    delayStd = np.sqrt(0.25)
    sampledCost = np.random.normal(2, costStd, 1)
    sampledDelay = np.random.normal(0, delayStd, 1)
    return sampledCost, sampledDelay

def pathTwo():
    """Draw one (cost, delay) sample for the second path.

    cost is normal with mean 3.5 and variance 0.25, returned as a length-1
    array; delay is a scalar drawn uniformly between -3 and 0.5.
    """
    costStd = np.sqrt(0.25)
    sampledCost = np.random.normal(3.5, costStd, 1)
    sampledDelay = np.random.uniform(-3, 0.5)
    return sampledCost, sampledDelay

def pathThree():
    """Draw one (cost, delay) sample for the third path.

    cost is a scalar drawn uniformly between 3.5 and 4.5; delay is normal
    with mean -2.5 and variance 0.25, returned as a length-1 array.
    """
    sampledCost = np.random.uniform(3.5, 4.5)
    delayStd = np.sqrt(0.25)
    sampledDelay = np.random.normal(-2.5, delayStd, 1)
    return sampledCost, sampledDelay

def outcome(cost, delay):
    """Combine a path's cost and delay into a single outcome value.

    A positive delay is weighted by 1.5 and added to the cost; otherwise the
    delay is added unweighted.  A delay of exactly 0 therefore yields the
    cost itself (previously this case fell through and returned None).
    """
    if delay > 0:
        return (delay * 1.5 + cost)
    return (cost + delay)

def getOutcome(pathIndex):
    """Sample the path with the given index and return outcome(cost, delay).

    Indices 0, 1 and 2 map to pathOne, pathTwo and pathThree.  For any other
    index a message is printed and outcome(0, 0) is returned.
    """
    if pathIndex == 0:
        return outcome(*pathOne())
    if pathIndex == 1:
        return outcome(*pathTwo())
    if pathIndex == 2:
        return outcome(*pathThree())

    print('No outcome for this path!')
    return outcome(0, 0)
    
def findBestWay():
    """Run UCB2 for several alpha values and plot each path's q-value history.

    For every alpha a fresh UCB2 instance plays 10000 trials.  The q-value
    returned by each update is appended to the history of the path that was
    played, and the three histories are plotted in one figure per alpha.
    The histories are recreated for every alpha so that a figure shows only
    that alpha's run (previously they kept growing across alphas).

    Returns:
        A list with, for each alpha, the 1-based number of the path selected
        in the final trial.
    """
    alphas = [0.9, 0.5, 0.1, 0.01, 0.001]
    numTrials = 10000
    # (line colour, legend label) per path, in path-index order.
    pathStyles = [('orchid', 'Subway'), ('blue', 'Bus'), ('green', 'Taxi')]
    allSelectedPaths = []

    for alpha in alphas:
        myUCB = UCB2(alpha)
        qHistories = [[] for _ in pathStyles]

        for _ in range(numTrials):
            selectedPath = myUCB.choosePath()
            pathOutcome = getOutcome(selectedPath)
            qValue = myUCB.updateQValue(selectedPath, pathOutcome)
            qHistories[selectedPath].append(qValue)

        # NOTE(review): the "best" path reported is simply the one chosen in
        # the last trial, not e.g. the most played one -- confirm intent.
        allSelectedPaths.append(selectedPath+1)

        lines = []
        for history, (color, _) in zip(qHistories, pathStyles):
            line, = plt.plot(list(range(len(history))), history)
            plt.setp(line, 'color', color)
            lines.append(line)

        plt.title('Alpha is:' + str(alpha))
        plt.legend(lines, [label for _, label in pathStyles])
        plt.show()
    return allSelectedPaths


# Question 3 driver: findBestWay returns one 1-based path number per alpha
# value, so a list is printed here.
print('Best path is:' ,findBestWay())