<a href="https://colab.research.google.com/github/akshayg03/ActiveLearning/blob/master/MLAss2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import numpy as np
import matplotlib.pyplot as plt
import math
import statistics
from scipy import stats

from sklearn import datasets
from sklearn.semi_supervised import LabelSpreading
from sklearn.metrics import classification_report, confusion_matrix,accuracy_score 
from sklearn import svm
from scipy.stats import entropy
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, AdaBoostClassifier, VotingClassifier
from sklearn.tree import DecisionTreeClassifier

In [0]:
# Load the scikit-learn digits dataset and build one seeded permutation of
# its row indices; the cells below take their subsets by slicing `indices`.
digits = datasets.load_digits()
indices = np.arange(digits.data.shape[0])
rng = np.random.RandomState(1)
rng.shuffle(indices)
print("The total number of training examples in our dataset are: " + str(indices.size))

In [0]:
#1a - Active Learning setup.
# Baseline: fit an RBF SVM on the first 50 of 500 shuffled digits and report
# the confusion matrix and accuracy on the remaining 450.
n_labeled_points = 50

X = digits.data[indices[:500]]
y = digits.target[indices[:500]]

# np.array(...) of a slice returns an independent copy of that slice.
X_train, X_test = np.array(X[:n_labeled_points]), np.array(X[n_labeled_points:])
y_train, y_test = np.array(y[:n_labeled_points]), np.array(y[n_labeled_points:])

model = svm.SVC(kernel='rbf', C=1.0, probability=True, decision_function_shape='ovo')
model.fit(X_train, y_train)

true_labels = y_test
predicted_labels = model.predict(X_test)

cm = confusion_matrix(true_labels, predicted_labels)
print(cm)
print(accuracy_score(true_labels, predicted_labels))

In [0]:
# 1b1a - Uncertainty Sampling (Least Confident)
# Pool-based active learning: start from 100 labeled digits, then for 4
# rounds move the 100 pool samples the current model is LEAST sure about
# into the training set and refit. Accuracy is printed before the first
# round and after every round, measured on all 1000 samples (labeled
# samples included).
X = digits.data[indices[:1000]]
y = digits.target[indices[:1000]]
images = digits.images[indices[:1000]]

n_total_samples = len(y)
n_labeled_points = 100
n_queries_per_round = 100
n_rounds = 4

X_train = np.copy(X[:n_labeled_points])
y_train = np.copy(y[:n_labeled_points])

# X_test / y_test play the role of the unlabeled pool that queries come from.
X_test = np.copy(X[n_labeled_points:])
y_test = np.copy(y[n_labeled_points:])
model = svm.SVC(decision_function_shape='ovo',probability=True,C=1.0, kernel='rbf')
model.fit(X_train,y_train)

predicted_labels = model.predict(X)

true_labels = y
print(accuracy_score(true_labels, predicted_labels))

for i in range(n_rounds):

    predicted_prob = model.predict_proba(X_test)
    # Least-confident score: 1 - P(most likely class). Larger = less certain.
    uncertainity_labels = 1 - predicted_prob.max(axis=1)
    # argsort is ascending, so the most uncertain samples sit at the END.
    # Taking the first entries instead would query the samples the model is
    # already most confident about.
    uncertainity_label_indices = np.argsort(uncertainity_labels)[-n_queries_per_round:]

    # "Label" the queried samples: move them from the pool to the training set.
    X_train = np.concatenate((X_train,X_test[uncertainity_label_indices]))
    y_train = np.concatenate((y_train,y_test[uncertainity_label_indices]))

    X_test = np.delete(X_test,uncertainity_label_indices,0)
    y_test = np.delete(y_test,uncertainity_label_indices)

    model.fit(X_train,y_train)

    predicted_labels = model.predict(X)
    true_labels = y

    print(accuracy_score(true_labels, predicted_labels))


In [0]:
#1b1b - Uncertainty Sampling (Margin Sampling)
# Same pool-based loop as the other uncertainty cells, but the query score is
# the gap between the two highest class probabilities; the 100 pool samples
# with the smallest gap are moved into the training set each round.
X = digits.data[indices[:1000]]
y = digits.target[indices[:1000]]
images = digits.images[indices[:1000]]

n_total_samples = len(y)
n_labeled_points = 100

X_train, y_train = np.copy(X[:100]), np.copy(y[:100])
X_test, y_test = np.copy(X[100:]), np.copy(y[100:])

model = svm.SVC(decision_function_shape='ovo',probability=True,C=1.0, kernel='rbf')
model.fit(X_train,y_train)

true_labels = y
predicted_labels = model.predict(X)
print(accuracy_score(true_labels, predicted_labels))

for i in range(4):
    predicted_prob = model.predict_proba(X_test)

    # Partitioning at position -2 puts the runner-up probability in the
    # second-to-last column and the maximum in the last column.
    part = np.partition(predicted_prob, -2, axis=1)
    best, runner_up = part[:, -1], part[:, -2]
    uncertainity_labels = best - runner_up

    # Ascending order: smallest margins (most ambiguous samples) first.
    ranking = np.argsort(uncertainity_labels)
    uncertainity_label_indices = ranking[:100]

    X_train = np.vstack((X_train, X_test[uncertainity_label_indices]))
    y_train = np.append(y_train, y_test[uncertainity_label_indices])

    # Drop the queried rows from the pool.
    still_in_pool = np.ones(len(y_test), dtype=bool)
    still_in_pool[uncertainity_label_indices] = False
    X_test = X_test[still_in_pool]
    y_test = y_test[still_in_pool]

    model.fit(X_train,y_train)

    true_labels = y
    predicted_labels = model.predict(X)
    print(accuracy_score(true_labels, predicted_labels))


In [0]:
#1b1c - Uncertainty Sampling (Entropy)
# Pool-based active learning: start from 100 labeled digits, then for 4
# rounds move the 100 pool samples with the HIGHEST predictive entropy into
# the training set and refit. Accuracy is printed before the first round and
# after every round, measured on all 1000 samples (labeled samples included).
X = digits.data[indices[:1000]]
y = digits.target[indices[:1000]]
images = digits.images[indices[:1000]]

n_total_samples = len(y)
n_labeled_points = 100
n_queries_per_round = 100
n_rounds = 4

X_train = np.copy(X[:n_labeled_points])
y_train = np.copy(y[:n_labeled_points])

# X_test / y_test play the role of the unlabeled pool that queries come from.
X_test = np.copy(X[n_labeled_points:])
y_test = np.copy(y[n_labeled_points:])
model = svm.SVC(decision_function_shape='ovo',probability=True,C=1.0, kernel='rbf')
model.fit(X_train,y_train)

predicted_labels = model.predict(X)

true_labels = y
print(accuracy_score(true_labels, predicted_labels))

for i in range(n_rounds):

    predicted_prob = model.predict_proba(X_test)
    # scipy's entropy reduces along axis 0, so transposing gives one Shannon
    # entropy value per pool sample. Larger = less certain.
    uncertainity_labels = entropy(predicted_prob.T)
    # argsort is ascending, so the highest-entropy samples sit at the END.
    # Taking the first entries instead would query the samples the model is
    # already most confident about.
    uncertainity_label_indices = np.argsort(uncertainity_labels)[-n_queries_per_round:]

    # "Label" the queried samples: move them from the pool to the training set.
    X_train = np.concatenate((X_train,X_test[uncertainity_label_indices]))
    y_train = np.concatenate((y_train,y_test[uncertainity_label_indices]))

    X_test = np.delete(X_test,uncertainity_label_indices,0)
    y_test = np.delete(y_test,uncertainity_label_indices)

    model.fit(X_train,y_train)

    predicted_labels = model.predict(X)
    true_labels = y

    print(accuracy_score(true_labels, predicted_labels))


In [0]:
#1b2a - Query by committee (Vote Entropy)
# Each round a committee of 5 SVMs is trained, every member on its own random
# 50-sample subset of the CURRENT labeled set. The members vote on each pool
# sample, and the 100 samples whose vote distribution has the highest entropy
# (most disagreement) are labeled and moved out of the pool. After each round
# an SVM is fit on the whole labeled set and its accuracy on all 1000 samples
# is printed.
#
# Fixes relative to the earlier version of this cell:
#   * queries are the HIGHEST-entropy samples (ascending argsort -> take the tail);
#   * queried samples are removed from the pool, so they cannot be re-added;
#   * member subsets are drawn from the grown labeled set, not only from the
#     initial 100 points;
#   * votes are indexed by the class label itself (0..9), and labels stay
#     integers instead of being stored in float buffers.
X = digits.data[indices[:1000]]
y = digits.target[indices[:1000]]
images = digits.images[indices[:1000]]

n_total_samples = len(y)
n_labeled_points = 100
n_classes = 10            # digit labels 0..9 index the vote columns directly
n_committee = 5
n_member_samples = 50     # labeled samples each committee member is fit on
n_queries_per_round = 100
n_rounds = 5

X_train = np.copy(X[:n_labeled_points])
y_train = np.copy(y[:n_labeled_points])

# X_test / y_test play the role of the unlabeled pool that queries come from.
X_test = np.copy(X[n_labeled_points:])
y_test = np.copy(y[n_labeled_points:])

qbc_rng = np.random.RandomState(1)

for _ in range(n_rounds):
    # Build the committee from fresh random subsets of the labeled set.
    committee = []
    for j in range(n_committee):
        order = np.arange(len(X_train))
        qbc_rng.shuffle(order)
        subset = order[:n_member_samples]
        member = svm.SVC(decision_function_shape='ovo',probability=True,C=1.0, kernel='rbf')
        member.fit(X_train[subset], y_train[subset])
        committee.append(member)
    print(*[member.score(X_test, y_test) for member in committee])

    # p[i, j]    : label predicted by member j for pool sample i
    # Vote[i, k] : number of members that voted class k for pool sample i
    n_pool = len(X_test)
    p = np.zeros((n_pool, n_committee), dtype=int)
    Vote = np.zeros((n_pool, n_classes), dtype=int)
    for j, member in enumerate(committee):
        p[:, j] = member.predict(X_test)
        Vote[np.arange(n_pool), p[:, j]] += 1

    Prob = np.divide(Vote, n_committee)

    # Vote entropy per pool sample (scipy's entropy reduces along axis 0).
    uncertainity_labels = entropy(Prob.T)
    # Ascending argsort: the samples with the most disagreement are at the end.
    uncertainity_label_indices = np.argsort(uncertainity_labels)[-n_queries_per_round:]

    # "Label" the queried samples: move them from the pool to the training set.
    X_train = np.concatenate((X_train,X_test[uncertainity_label_indices]))
    y_train = np.concatenate((y_train,y_test[uncertainity_label_indices]))

    X_test = np.delete(X_test,uncertainity_label_indices,0)
    y_test = np.delete(y_test,uncertainity_label_indices)

    model = svm.SVC(decision_function_shape='ovo',probability=True,C=1.0, kernel='rbf')
    model.fit(X_train,y_train)

    predicted_labels = model.predict(X)

    true_labels = y
    print(accuracy_score(true_labels, predicted_labels))



In [0]:
#1b2b - Query by committee (KL Divergence)
# Each round a committee of 5 SVMs is trained, every member on its own random
# 50-sample subset of the CURRENT labeled set. For every pool sample the
# members' class-probability vectors are averaged into a consensus, and the
# sample is scored by the mean KL divergence from each member to that
# consensus. The 100 highest-scoring samples are labeled and moved out of the
# pool. After each round an SVM is fit on the whole labeled set and its
# accuracy on all 1000 samples is printed.
#
# Fixes relative to the earlier version of this cell:
#   * the "consensus" used to be a constant uniform 0.1 vector, which made the
#     score log(10) minus the vote entropy rather than a divergence between
#     members and their consensus;
#   * queried samples are removed from the pool, so they cannot be re-added;
#   * member subsets are drawn from the grown labeled set, not only from the
#     initial 100 points;
#   * votes are indexed by the class label itself (0..9), and labels stay
#     integers instead of being stored in float buffers.
X = digits.data[indices[:1000]]
y = digits.target[indices[:1000]]
images = digits.images[indices[:1000]]

n_total_samples = len(y)
n_labeled_points = 100
n_classes = 10            # digit labels 0..9 index the class columns directly
n_committee = 5
n_member_samples = 50     # labeled samples each committee member is fit on
n_queries_per_round = 100
n_rounds = 5

X_train = np.copy(X[:n_labeled_points])
y_train = np.copy(y[:n_labeled_points])

# X_test / y_test play the role of the unlabeled pool that queries come from.
X_test = np.copy(X[n_labeled_points:])
y_test = np.copy(y[n_labeled_points:])

qbc_rng = np.random.RandomState(1)

for _ in range(n_rounds):
    # Build the committee from fresh random subsets of the labeled set.
    committee = []
    for j in range(n_committee):
        order = np.arange(len(X_train))
        qbc_rng.shuffle(order)
        subset = order[:n_member_samples]
        member = svm.SVC(decision_function_shape='ovo',probability=True,C=1.0, kernel='rbf')
        member.fit(X_train[subset], y_train[subset])
        committee.append(member)
    print(*[member.score(X_test, y_test) for member in committee])

    # Hard votes are still tallied: the version-space cell (1b3) reads `Vote`.
    # p[i, j]    : label predicted by member j for pool sample i
    # Vote[i, k] : number of members that voted class k for pool sample i
    n_pool = len(X_test)
    p = np.zeros((n_pool, n_committee), dtype=int)
    Vote = np.zeros((n_pool, n_classes), dtype=int)
    # member_prob[j, i, k] : member j's probability of class k for pool sample i
    member_prob = np.zeros((n_committee, n_pool, n_classes))
    for j, member in enumerate(committee):
        p[:, j] = member.predict(X_test)
        Vote[np.arange(n_pool), p[:, j]] += 1
        # predict_proba has one column per entry of member.classes_; a member
        # whose subset missed a class has fewer columns, so scatter them into
        # the full 10-class layout (missing classes keep probability 0).
        member_prob[j][:, member.classes_.astype(int)] = member.predict_proba(X_test)

    Prob = np.divide(Vote, n_committee)
    p_consensus = member_prob.mean(axis=0)

    # entropy(pk, qk) is KL(pk || qk) along axis 0, hence the transposes.
    # The consensus is > 0 wherever any member is > 0, so every term is finite.
    member_kl = np.array([entropy(member_prob[j].T, qk=p_consensus.T)
                          for j in range(n_committee)])
    uncertainity_labels = member_kl.mean(axis=0)
    # Ascending argsort: the samples with the most disagreement are at the end.
    uncertainity_label_indices = np.argsort(uncertainity_labels)[-n_queries_per_round:]

    # "Label" the queried samples: move them from the pool to the training set.
    X_train = np.concatenate((X_train,X_test[uncertainity_label_indices]))
    y_train = np.concatenate((y_train,y_test[uncertainity_label_indices]))

    X_test = np.delete(X_test,uncertainity_label_indices,0)
    y_test = np.delete(y_test,uncertainity_label_indices)

    model = svm.SVC(decision_function_shape='ovo',probability=True,C=1.0, kernel='rbf')
    model.fit(X_train,y_train)

    predicted_labels = model.predict(X)

    true_labels = y
    print(accuracy_score(true_labels, predicted_labels))



In [0]:
#1b3 - Version Space and its points indices.
# Uses `Vote` left behind by the most recently executed query-by-committee
# cell. A row is kept when its largest vote count is not 5, i.e. the five
# committee members did not all choose the same class.
top_vote_per_row = Vote.max(axis=1)
version_indices = [row for row in range(len(Vote)) if top_vote_per_row[row] != 5]
print("The size/number of points in of verson space is: " + str(len(version_indices)))
version_vote = Vote[version_indices]

# Number of distinct VALUES in each kept vote row, negated so that the
# ascending argsort below lists the rows with the most distinct values first.
# NOTE(review): np.unique counts distinct vote counts (zeros included), not
# distinct classes voted for; confirm this is the intended ranking.
version_unique_count = [len(np.unique(vote_row)) for vote_row in version_vote]
version_unique_count = np.multiply(version_unique_count, -1)
version_sort_indices = np.argsort(version_unique_count)

In [0]:
#1b4 - Random points approach.
# Baseline without any query strategy: train on the first 50, 100, ..., 250
# samples of the shuffled data and print accuracy on all 1000 samples each time.
X = digits.data[indices[:1000]]
y = digits.target[indices[:1000]]
images = digits.images[indices[:1000]]

n_total_samples = len(y)
n_labeled_points = 100

model = svm.SVC(decision_function_shape='ovo',probability=True,C=1.0, kernel='rbf')

for i, n_seen in enumerate(range(50, 251, 50)):
    # The first n_seen rows are treated as labeled, the rest as the pool.
    X_train, X_test = X[:n_seen], X[n_seen:]
    y_train, y_test = y[:n_seen], y[n_seen:]

    model.fit(X_train,y_train)

    true_labels = y
    predicted_labels = model.predict(X)

    print(accuracy_score(true_labels, predicted_labels))

In [0]:
#1b4b - Stream based approach.
# Start from 50 labeled digits out of 500. The pool is ranked ONCE by the
# initial model's confidence (highest class probability), then consumed in 4
# consecutive batches of 50, least confident first. The model is refit and
# its accuracy on all 500 samples printed after every batch.
#
# Fixes relative to the earlier version of this cell:
#   * batches no longer overlap. The old slices ind[400-i*50:] each included
#     all previously added samples again, so the training set filled up with
#     duplicates;
#   * batches are taken from the low-confidence end of the ranking instead of
#     the high-confidence end.
X = digits.data[indices[:500]]
y = digits.target[indices[:500]]
images = digits.images[indices[:500]]

n_total_samples = len(y)
n_labeled_points = 50
batch_size = 50
n_rounds = 4

X_train = np.copy(X[:n_labeled_points])
y_train = np.copy(y[:n_labeled_points])

X_test = np.copy(X[n_labeled_points:])
y_test = np.copy(y[n_labeled_points:])
model = svm.SVC(decision_function_shape='ovo',probability=True,C=1.0, kernel='rbf')
model.fit(X_train,y_train)

predicted_labels = model.predict(X)

true_labels = y
print(accuracy_score(true_labels, predicted_labels))


predicted_prob = model.predict_proba(X_test)

# a[i]: probability of the most likely class for pool sample i.
a = predicted_prob.max(axis=1)
# Ascending order: ind[0] is the sample the initial model is least sure about.
ind = np.argsort(a,axis=0)

for i in range(n_rounds):
    # Disjoint, consecutive slices of the ranking: no sample is added twice.
    batch = ind[i*batch_size:(i+1)*batch_size]
    X_train = np.concatenate((X_train,X_test[batch]))
    y_train = np.concatenate((y_train,y_test[batch]))

    model.fit(X_train,y_train)

    predicted_labels = model.predict(X)
    true_labels = y
    print(accuracy_score(true_labels, predicted_labels))

In [0]:
#1b5 - K-means.
# Cluster 200 pool samples into 10 groups with a hand-written K-means, then
# "buy" labels for the first ~20% of each cluster (int(0.2*size + 1) samples)
# and give the whole cluster the most frequent purchased label. Prints the
# confusion matrix and accuracy against the true labels, plus the labeling
# cost as coded below (100 rupees and 1 hour per purchased label).
#
# Fixes relative to the earlier version of this cell:
#   * the centroid buffers are real copies. `k_means_cur` used to be a VIEW of
#     x_random_40[20:30], so every centroid update overwrote data rows;
#   * an empty cluster keeps its previous centroid instead of becoming NaN
#     (NaN never compares equal, so the loop could not converge);
#   * empty clusters are skipped during labeling instead of raising
#     IndexError in the `labeled[0]` fallback;
#   * confusion_matrix receives (true labels, predicted labels) in that order.
X = digits.data[indices[:500]]
y = digits.target[indices[:500]]
images = digits.images[indices[:500]]

x_labeled_10 = np.copy(X[:50])
y_labeled_10 = np.copy(y[:50])

x_unlabeled_90 = np.copy(X[50:])
y_unlabeled_90 = np.copy(y[50:])

x_random_40 = np.copy(x_unlabeled_90[:200])
y_random_40 = np.copy(y_unlabeled_90[:200])

n_clusters = 10
n_points = len(x_random_40)

# Initial centroids: the first 10 samples, copied so the data stays untouched.
k_means = np.copy(x_random_40[:n_clusters])
k_means_cur = np.copy(k_means)

money = 0
time = 0
clusters = np.zeros(n_points,dtype = int)
y_final = np.zeros(n_points,dtype = int)
flag = False

for i in range(10000):
  # Assignment step: distances[j, k] is the Euclidean distance from sample j
  # to centroid k; each sample joins its nearest centroid.
  distances = np.linalg.norm(x_random_40[:, None, :] - k_means[None, :, :], axis=2)
  clusters = np.argmin(distances, axis=1)

  # Update step: each centroid becomes the mean of its members.
  for j in range(n_clusters):
    members = x_random_40[clusters == j]
    if len(members) > 0:
      k_means_cur[j] = members.mean(axis=0)
    else:
      k_means_cur[j] = k_means[j]

  # Converged once an update leaves every centroid unchanged.
  flag = np.array_equal(k_means_cur,k_means)
  if(flag):
    break
  k_means = np.copy(k_means_cur)

for j in range(n_clusters):
    c = np.where(clusters == j)
    if len(c[0]) == 0:
      # Nothing to label and nothing to assign for an empty cluster.
      continue
    len_1 = int(0.2*len(c[0])+1)
    labeled = y_random_40[c[0][:len_1]]
    money += len(labeled) * 100
    time += len(labeled)

    try:
      most_freq = statistics.mode(labeled)
    except statistics.StatisticsError:
      # Raised when statistics.mode cannot pick a single mode;
      # fall back to the first purchased label.
      most_freq = labeled[0]

    y_final[c] = most_freq


cm = confusion_matrix(y_random_40, y_final)
print(cm)
print("the accuracy is : ",accuracy_score(y_random_40, y_final))
print("it would take ", money, "rupees \nand it would take ", time, "hours to label")

Question 2

In [0]:
#2 - Self Organizing Maps(SOMs)
!pip install SimpSOM

In [0]:
import pandas as pd
import SimpSOM as sps
import numpy as np
from sklearn.datasets import load_wine

In [0]:
# Wine dataset: `x` is the feature matrix, `y` the class labels.
# Note that `y` replaces the digits labels defined in the cells above.
dataset = load_wine()
x, y = dataset.data, dataset.target

In [0]:
# Build a 10 x 10 self-organizing map over the wine features. PBC=True
# presumably enables periodic boundary conditions (map edges wrap around);
# confirm against the SimpSOM documentation.
# NOTE(review): `x` is passed as loaded, without scaling. If the features
# have very different ranges, the largest ones will dominate the distance
# computation; consider standardizing first.
net = sps.somNet(10, 10, x, PBC=True)
# Presumably (start learning rate, number of epochs); confirm against the
# SimpSOM documentation.
net.train(0.01, 20000)

In [0]:
# Plot the trained map; presumably colored by each node's weight difference
# from its neighbouring nodes. Confirm against the SimpSOM documentation.
net.diff_graph()

In [0]:
# Plot the map colored by the node weights for feature column 0
# (presumably "alcohol"; check dataset.feature_names[0]).
net.nodes_graph(colnum=0)

In [0]:
# Plot the map colored by the node weights for feature column 3
# (presumably "alcalinity_of_ash"; check dataset.feature_names[3]).
net.nodes_graph(colnum=3)

In [0]:
# Plot the map colored by the node weights for feature column 9
# (presumably "color_intensity"; check dataset.feature_names[9]).
net.nodes_graph(colnum=9)