In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

In [None]:
class Graph:
    """Plain container for the per-round history that is plotted later.

    Attributes:
        k: number of arms.
        n_i: one list per arm, initially empty.
        reward_t: initially empty list of reward values, one per round.
        regret: initially empty list of regret values, one per round.
    """

    def __init__(self, arm = 5):
        self.k = arm
        # Build the per-arm histories one at a time so every arm owns a
        # distinct list object (no aliasing between arms).
        self.n_i = []
        for _ in range(self.k):
            self.n_i.append([])
        self.reward_t = list()
        self.regret = list()

class Bandit:
    """Bernoulli multi-armed bandit played with the UCB1 index.

    Attributes:
        k: number of arms.
        prob: success probability of each arm; empty until
            initializeProbability() has been called.
        s_i: total reward collected from each arm so far.
        n_i: number of times each arm has been pulled so far.
        graph: Graph instance that stores the per-round history.

    s_i and n_i are updated by the caller after every pull; the methods
    here only read them.
    """

    def __init__(self, arm = 5):
        self.k = arm
        self.prob = []
        self.s_i = [0.0 for i in range(arm)]
        self.n_i = [0.0 for i in range(arm)]
        self.graph = Graph(arm)

    def initializeProbability(self):
        """Draw a success probability in [0, 1) for every arm.

        Any previous draw is replaced, so calling this more than once
        still leaves exactly k probabilities (appending instead would
        grow the list and skew np.max(self.prob) in updateGraph).
        """
        self.prob = [np.random.rand() for _ in range(self.k)]

    def ucb(self, arm, t):
        """Return the upper confidence bound of `arm` at round `t`.

        The index is the empirical mean plus sqrt(2 * ln(t) / n_arm).
        An arm that has never been pulled gets +inf so that it is always
        tried first instead of raising ZeroDivisionError.
        """
        pulls = self.n_i[arm]
        if pulls == 0:
            return float("inf")

        mu = self.s_i[arm]/pulls
        confInterval = np.sqrt((2.0*np.log(t))/pulls)

        return mu + confInterval

    def selectArm(self, t):
        """Return the index of the arm with the largest UCB at round `t`.

        Ties are resolved by np.argmax, i.e. the lowest index wins.
        """
        ucb_t = [self.ucb(arm, t) for arm in range(self.k)]

        return np.argmax(ucb_t)

    def reward(self, arm):
        """Sample a Bernoulli reward (1.0 or 0.0) from `arm`."""
        if(np.random.rand() < self.prob[arm]):
            return 1.0
        return 0.0

    def optimalArm(self):
        """Return the index of the arm with the highest empirical mean.

        Note this is the best arm according to the observed rewards, not
        necessarily the arm with the largest true probability.
        Raises ZeroDivisionError if some arm has never been pulled.
        """
        mu = [self.getMU(i) for i in range(self.k)]

        return np.argmax(mu)

    def getMU(self, arm):
        """Return the empirical mean reward of `arm`.

        Raises ZeroDivisionError if the arm has never been pulled.
        """
        return self.s_i[arm]/self.n_i[arm]

    def plotUCB(self):
        """Placeholder; not implemented."""
        pass

    def plotN_div_T(self, arm):
        """Placeholder; not implemented."""
        pass

    def updateGraph(self, totReward, selectedArm):
        """Record one finished round in self.graph.

        Args:
            totReward: cumulative reward after this round.
            selectedArm: index of the arm pulled in this round.

        Appends to graph.reward_t the cumulative reward, to every
        graph.n_i[arm] the running pull count of that arm, and to
        graph.regret the per-round (not cumulative) expected regret
        max(prob) - prob[selectedArm].
        """
        self.graph.reward_t.append(totReward)
        for arm in range(self.k):
            history = self.graph.n_i[arm]
            # An arm with no history yet has been pulled zero times; only the
            # selected arm's count grows in this round.
            previous = history[-1] if history else 0
            if(arm == selectedArm):
                history.append(previous + 1)
            else:
                history.append(previous)

        self.graph.regret.append(np.max(self.prob) - self.prob[selectedArm])
               
    
#     def plot

In [None]:
# Seed NumPy's global RNG so the arm probabilities and every reward drawn
# later are the same after Restart Kernel -> Run All.
SEED = 0
np.random.seed(SEED)

bandit = Bandit(arm = 5)
bandit.initializeProbability()
print(bandit.prob)

# One bar per arm, labelled arm0 .. arm{k-1}.
plt.bar(["arm{0}".format(i) for i in range(bandit.k)], bandit.prob)
plt.ylabel("probability")
plt.title("Arm Distribution(Bernoulli)");

In [None]:
## Initial round: pull every arm once so each arm has a defined empirical mean.
# NOTE: this cell mutates `bandit`; re-create the bandit before re-running it,
# otherwise counts and history accumulate on top of the previous run.
totReward = 0
for arm in range(bandit.k):
    reward = bandit.reward(arm)
    bandit.s_i[arm] = bandit.s_i[arm] + reward
    bandit.n_i[arm] = bandit.n_i[arm] + 1

    totReward += reward
    bandit.updateGraph(totReward, arm)

# UCB rounds. `t` is the 1-based index of the current round counted over ALL
# pulls, so it continues after the k initial pulls. Starting again at 1 would
# make log(t) = 0 on the first UCB step (no exploration bonus) and would
# produce rounds + k - 1 pulls in total instead of `rounds`.
rounds = 10000
for t in range(bandit.k + 1, rounds + 1):
    arm = bandit.selectArm(t)
    reward = bandit.reward(arm)
    bandit.s_i[arm] = bandit.s_i[arm] + reward
    bandit.n_i[arm] = bandit.n_i[arm] + 1.0

    totReward = totReward + reward
    bandit.updateGraph(totReward, arm)

In [None]:
# Arm with the highest empirical mean after the simulation, and that mean.
# (This is the best arm as estimated from the observed rewards.)
optimalArm = bandit.optimalArm()
optimalMU = bandit.getMU(optimalArm)

report_fields = ("Optimal Arm = ", optimalArm, " u* = ", optimalMU)
print(*report_fields)

In [None]:
# Per-arm totals: collected reward first, then pull counts.
for message, per_arm_values in (
    ("Reward collected till {0} rounds from arm_i", bandit.s_i),
    ("No of times arm_i was pulled in {0} rounds", bandit.n_i),
):
    print(message.format(rounds), per_arm_values)

### Plot1: 
##### (a) Plot of N_i/t vs t, i.e., the fraction of rounds in the interval 1 to t in which arm i was selected

In [None]:
# Fraction of rounds in which each arm was selected, as a function of t.
fig = plt.figure(figsize=(10, 10))
ax1 = fig.add_subplot(111, ylabel = "N_arm/t", xlabel = "t", title = "N_t/t vs t")
for arm in range(bandit.k):
    history = bandit.graph.n_i[arm]
    # Pair the count recorded after round t with that same t. A nested
    # "for n ... for t ..." comprehension would instead build the full
    # cross product (len(history) * rounds points) and plot nonsense.
    steps = range(1, len(history) + 1)
    ax1.plot(steps, [n / t for t, n in zip(steps, history)], label="arm{0}".format(arm))
ax1.legend();

### Plot2: 
##### (a) Plot of Reward(t) vs t
##### (b) Plot of Reward(t)/t vs t

### Plot3: 
##### (a) Regret(t) vs t 
##### (b) Regret(t)/t vs t