# Semi-supervised Learning

## Load the dataset - Reading the input data

In [156]:
from sklearn.neighbors import KernelDensity
from sklearn.cross_validation import train_test_split
import numpy as np
import scipy.io
mat = scipy.io.loadmat('oc_514.mat')
train = mat['x']
xtrain = train[0,0][0]

y= train[0,0][2]
# print y.shape

In [157]:
# Divide the dataset randomly in half, 50% for the training set and 50% for the test set.
from sklearn import preprocessing
X_normalized = preprocessing.normalize(xtrain, norm='l2')
XTrain, XTest, YTrain, YTest = train_test_split(X_normalized, y, test_size=0.5, random_state=42)
# print XTrain.shape
# print YTrain.shape
# print xtrain[0]
# print XTrain[0]

In [158]:
import random
def selectRandomLabels(percentage, XTrain, YTrain):
    rmax = len(XTrain)*percentage/100
    idx = random.sample(range(0,len(XTrain)), rmax)
    labelDS = []
    labelY = []
    unlabelDS = []
                 
    for i in range(0, len(XTrain)):
        if i in idx:
            labelDS.append(XTrain[i,:])
            labelY.append(YTrain[i])
        else:
            unlabelDS.append(XTrain[i,:])
        
    return labelDS, labelY, unlabelDS


In [159]:
import random
def makeRandomLabelsUnlabeled(percentage, YTrain):
    rmax = len(YTrain)*(100-percentage)/100    
    random_unlabeled_points = np.where(np.random.random_integers(0, 1, size=rmax))
    newlabelY = np.copy(YTrain)
    newlabelY[random_unlabeled_points] = -1
    return newlabelY


## Label propagation from scikit-learn

###  a) Use label propagation from scikit-learn to test the change in accuracy w.r.t. the percent of labelled data. Repeat the test 100 times and show average results.

In [160]:
from sklearn.semi_supervised import LabelPropagation
def sklearnLabelPropogation(percentage, XTrain, YTrain,XTest,YTest):
    label_prop_model = LabelPropagation(gamma=100)
    labels = makeRandomLabelsUnlabeled(percentage, YTrain)
#     print labels.shape
#     print XTrain.shape
#     YPred = label_prop_model.fit(XTrain, np.ravel(labels)).predict_proba(XTest)
    YPred = label_prop_model.fit(XTrain, np.ravel(labels)).predict(XTest)
    matches = 0
    totalLabels = YPred.shape[0]
    for i in xrange(totalLabels):
        matches += YTest[i]==YPred[i]
    totalUnlabelled = YPred[YPred>=255].size
    unlabelledProportion = float(totalUnlabelled)/totalLabels * 100
    matchingLabels = float(matches)/totalLabels *100
    return unlabelledProportion,matchingLabels

## Try to randomly select labels so that labeled examples constitute:
## a) 10%

In [161]:
percentage = 10
# labelDS, labelY, unlabelDS = selectRandomLabels(percentage, XTrain, YTrain)    
matches = 0.0
unlabelProp = 0.0
for iteration in xrange(100):
    unlabelledProportion,matchingLabels = sklearnLabelPropogation(percentage, XTrain, YTrain, XTest, YTest)
    matches += matchingLabels
    unlabelProp += unlabelledProportion
print 'Average Proportion of matches: ',matches/100
print 'Average Proportion of Labelled Outputs: ',100-unlabelProp/100

Average Proportion of matches:  22.8047619048
Average Proportion of Labelled Outputs:  37.419047619


## Try to randomly select labels so that labeled examples constitute:
## a) 30%

In [162]:
percentage = 30
# labelDS, labelY, unlabelDS = selectRandomLabels(percentage, XTrain, YTrain)
matches = 0.0
unlabelProp = 0.0
for iteration in xrange(100):
    unlabelledProportion,matchingLabels = sklearnLabelPropogation(percentage, XTrain, YTrain, XTest, YTest)
    matches += matchingLabels
    unlabelProp += unlabelledProportion
print 'Average Proportion of matches: ',matches/100
print 'Average Proportion of Labelled Outputs: ',100-unlabelProp/100

Average Proportion of matches:  48.3238095238
Average Proportion of Labelled Outputs:  77.3142857143


## Try to randomly select labels so that labeled examples constitute:
## a) 50%

In [163]:
percentage = 50
# labelDS, labelY, unlabelDS = selectRandomLabels(percentage, XTrain, YTrain)
matches = 0.0
unlabelProp = 0.0
for iteration in xrange(100):
    unlabelledProportion,matchingLabels = sklearnLabelPropogation(percentage, XTrain, YTrain, XTest, YTest)
    matches += matchingLabels
    unlabelProp += unlabelledProportion
# print matches/100,unlabelProp/100
print 'Average Proportion of matches: ',matches/100
print 'Average Proportion of Labelled Outputs: ',100-unlabelProp/100

Average Proportion of matches:  58.2904761905
Average Proportion of Labelled Outputs:  91.919047619


## Try to randomly select labels so that labeled examples constitute:
## a) 100%

In [164]:
percentage = 100
# labelDS, labelY, unlabelDS = selectRandomLabels(percentage, XTrain, YTrain)
matches = 0.0
unlabelProp = 0.0
for iteration in xrange(100):
    unlabelledProportion,matchingLabels = sklearnLabelPropogation(percentage, XTrain, YTrain, XTest, YTest)
    matches += matchingLabels
    unlabelProp += unlabelledProportion
# print matches/100,unlabelProp/100
print 'Average Proportion of matches: ',matches/100
print 'Average Proportion of Labelled Outputs: ',100-unlabelProp/100

Average Proportion of matches:  62.380952381
Average Proportion of Labelled Outputs:  100.0


## Analysis of the Label Propagation
Accuracy w.r.t. the percent of labelled data after averaging over 100 iterations increases as the labelled samples in the training data set increases.
Accuracy w.r.t. the percent of correct labelling of the test data after averaging over 100 iterations increases as the labelled samples in the training data set increases.