#SVM-Boosting based on Markov resampling on PASCAL Dataset

In [1]:
# Import all dependencies
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import scale



---


Connecting  to the  Google drive for easy import and export of data


---


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive



---


Reading the csv file made for the dataset - And initializing the dataframe with it


---




In [3]:
pascal = pd.read_csv("/content/drive/MyDrive/DM/Image-pixels.csv")

In [4]:
pascal.shape

(4382, 22501)

In [5]:
col=[i for i in range(22500)]
col.append('label')
pascal.columns=col
pascal.columns

Index([      0,       1,       2,       3,       4,       5,       6,       7,
             8,       9,
       ...
         22491,   22492,   22493,   22494,   22495,   22496,   22497,   22498,
         22499, 'label'],
      dtype='object', length=22501)

## Starting with the algorithm : 


---


### SVM-Boosting based on Markov resampling


---



---

Initialising Parameters

---

In [6]:
markov= pd.DataFrame(columns = pascal.columns)
uniqCls=list(np.sort(pascal['label'].unique()))
classCNT=len(uniqCls)
limit=250
m=classCNT*limit

---

Parameters for Markov Sampling

---

In [7]:
k=5
q=1.2
rej=0

---

Train a linear Model on N[here 2000] size train set

----


In [8]:
X = pascal.drop("label", axis = 1)
y = pascal['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.8, random_state = 101)
model_linear = SVC(kernel='linear')
model_linear.fit(X_train, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [9]:
predProb=[]



---


Utility loss Function


---



In [10]:
def lossF(actual,pred):
    if actual==pred:
        return 1.0
    return np.exp(-2)



---


Utility function for training subsequent models


---



In [11]:
def train(data):
    
    X = data.drop("label", axis = 1)
    y = data['label']

    # train test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.01, random_state = 101)
    model_linear = SVC(kernel='linear')
    model_linear.fit(X_train, y_train)

    return model_linear

In [12]:
lst=[]



---


Loop To run the markov chain generator k times

---



In [13]:
t=0
T=3
while t<T:
    # Reset parameters for next markov chain
    markov= pd.DataFrame(columns = pascal.columns)
    predProb=[]
    lst=[]
    
    # Chosing a random sample as first of markov chain
    i=np.random.randint(pascal.shape[0])
    z0=pascal.iloc[i]
    y0=model_linear.predict(np.array([z0.drop('label')]))[0]
    
    l=0
    rej=0
    print("Entering...")
    while l<m:
        # choosing a random sample
        i=np.random.randint(pascal.shape[0])
        while i in lst:
            i=np.random.randint(pascal.shape[0])
        z1=pascal.iloc[i]
        y1=model_linear.predict(np.array([z1.drop('label')]))[0]
        n=lossF(z1['label'],y1)
        d=lossF(z0['label'],y0)
        p=min(1.0,n/d)
        
        flg=False
        # Deciding of acceptance of chosen sample and its probability in markov chain
        if rej>k:
            p=min(1.0,q*p)
            predProb.append([z1['label'],y1,p])
            markov=markov.append(z1)
            z0=z1
            l+=1
            flg=True
            rej=0
        elif p==1 and y1==y0:
            n=np.exp(-y1*z1['label'])
            d=np.exp(-y0*z0['label'])

            p=n/d
            p=min(p,1)
        if not(flg) and np.random.random() < p:
            predProb.append([z1['label'],y1,p])
            markov=markov.append(z1)
            z0=z1
            l+=1
            flg=True
            rej=0

        if not(flg):
            rej+=1
        lst.append(i)
    yTest=[]
    yPred=[]
    for i in predProb:
        yTest.append(i[0])
        yPred.append(i[1])
    et=(m-metrics.accuracy_score(y_true=yTest, y_pred=yPred,normalize=False))/m
    print(et)
    at=(1/2)*np.log((1-et)/et)
    t+=1
    model_linear=train(markov)

markov

Entering...
0.5342857142857143
Entering...
0.44114285714285717
Entering...
0.43714285714285717


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,22461,22462,22463,22464,22465,22466,22467,22468,22469,22470,22471,22472,22473,22474,22475,22476,22477,22478,22479,22480,22481,22482,22483,22484,22485,22486,22487,22488,22489,22490,22491,22492,22493,22494,22495,22496,22497,22498,22499,label
2046,189.0,191.0,193.0,195.0,195.0,197.0,219.0,196.0,198.0,198.0,198.0,200.0,92.0,37.0,81.0,111.0,96.0,93.0,110.0,114.0,112.0,120.0,108.0,115.0,110.0,113.0,130.0,213.0,239.0,238.0,245.0,108.0,74.0,132.0,125.0,148.0,99.0,86.0,96.0,100.0,...,69.0,63.0,56.0,64.0,55.0,61.0,58.0,60.0,62.0,62.0,58.0,60.0,67.0,62.0,58.0,59.0,61.0,64.0,65.0,78.0,57.0,66.0,63.0,67.0,66.0,55.0,58.0,63.0,60.0,60.0,59.0,57.0,59.0,56.0,52.0,55.0,55.0,59.0,53.0,3.0
2960,254.0,254.0,254.0,254.0,254.0,254.0,254.0,254.0,254.0,254.0,254.0,254.0,254.0,254.0,254.0,254.0,254.0,254.0,254.0,254.0,254.0,254.0,254.0,254.0,254.0,254.0,254.0,254.0,254.0,254.0,254.0,254.0,254.0,254.0,254.0,254.0,254.0,254.0,253.0,250.0,...,81.0,82.0,80.0,82.0,84.0,84.0,83.0,83.0,86.0,84.0,84.0,83.0,85.0,83.0,84.0,81.0,83.0,83.0,82.0,84.0,82.0,83.0,82.0,83.0,85.0,83.0,83.0,83.0,82.0,87.0,85.0,84.0,84.0,83.0,84.0,84.0,86.0,84.0,84.0,4.0
3015,235.0,225.0,236.0,232.0,182.0,223.0,213.0,251.0,249.0,247.0,250.0,250.0,251.0,251.0,243.0,231.0,245.0,252.0,245.0,251.0,249.0,230.0,231.0,189.0,235.0,244.0,252.0,233.0,202.0,196.0,254.0,248.0,254.0,254.0,255.0,242.0,190.0,234.0,234.0,227.0,...,178.0,171.0,174.0,166.0,164.0,169.0,159.0,188.0,176.0,180.0,165.0,179.0,176.0,168.0,170.0,159.0,153.0,151.0,153.0,153.0,136.0,124.0,129.0,156.0,160.0,151.0,176.0,152.0,130.0,179.0,174.0,137.0,152.0,97.0,88.0,108.0,112.0,162.0,138.0,4.0
4036,199.0,196.0,200.0,143.0,200.0,199.0,199.0,200.0,200.0,199.0,200.0,199.0,200.0,200.0,201.0,201.0,201.0,200.0,200.0,200.0,200.0,201.0,201.0,200.0,201.0,201.0,200.0,201.0,201.0,201.0,201.0,201.0,202.0,201.0,201.0,201.0,202.0,201.0,201.0,201.0,...,153.0,152.0,98.0,104.0,62.0,58.0,47.0,45.0,41.0,41.0,36.0,27.0,26.0,43.0,56.0,70.0,96.0,100.0,88.0,130.0,77.0,118.0,97.0,45.0,24.0,47.0,45.0,43.0,42.0,42.0,32.0,24.0,31.0,48.0,51.0,71.0,114.0,96.0,99.0,6.0
2402,200.0,202.0,204.0,203.0,206.0,82.0,45.0,49.0,17.0,38.0,72.0,51.0,116.0,38.0,60.0,213.0,214.0,47.0,50.0,47.0,105.0,80.0,79.0,58.0,49.0,88.0,171.0,218.0,231.0,218.0,218.0,213.0,215.0,216.0,212.0,213.0,213.0,215.0,215.0,213.0,...,124.0,134.0,132.0,135.0,128.0,130.0,131.0,136.0,141.0,150.0,156.0,154.0,167.0,176.0,173.0,190.0,180.0,190.0,184.0,176.0,190.0,184.0,193.0,179.0,199.0,199.0,183.0,219.0,225.0,227.0,229.0,231.0,225.0,222.0,217.0,219.0,221.0,222.0,214.0,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
157,131.0,130.0,130.0,133.0,130.0,131.0,133.0,134.0,130.0,132.0,129.0,133.0,130.0,131.0,132.0,128.0,132.0,131.0,132.0,130.0,133.0,131.0,130.0,132.0,133.0,133.0,132.0,133.0,132.0,131.0,131.0,131.0,131.0,130.0,131.0,133.0,133.0,134.0,131.0,134.0,...,138.0,140.0,138.0,136.0,136.0,137.0,138.0,137.0,141.0,138.0,137.0,139.0,137.0,138.0,141.0,138.0,137.0,136.0,141.0,138.0,138.0,138.0,138.0,141.0,137.0,139.0,137.0,140.0,139.0,139.0,137.0,137.0,139.0,138.0,138.0,136.0,137.0,138.0,138.0,0.0
3368,120.0,102.0,161.0,148.0,132.0,100.0,147.0,141.0,97.0,133.0,158.0,116.0,98.0,134.0,128.0,79.0,167.0,129.0,124.0,116.0,210.0,105.0,111.0,107.0,127.0,128.0,128.0,106.0,185.0,174.0,165.0,141.0,103.0,106.0,84.0,137.0,143.0,76.0,102.0,103.0,...,117.0,105.0,97.0,109.0,99.0,106.0,108.0,94.0,89.0,104.0,99.0,98.0,98.0,86.0,68.0,98.0,106.0,102.0,100.0,95.0,89.0,90.0,88.0,81.0,80.0,79.0,87.0,89.0,62.0,54.0,92.0,133.0,145.0,167.0,173.0,182.0,185.0,186.0,186.0,5.0
898,11.0,14.0,30.0,17.0,31.0,35.0,30.0,24.0,25.0,24.0,39.0,43.0,39.0,42.0,37.0,35.0,37.0,36.0,35.0,41.0,39.0,40.0,39.0,42.0,41.0,40.0,39.0,42.0,40.0,40.0,39.0,48.0,49.0,41.0,46.0,47.0,44.0,43.0,41.0,46.0,...,50.0,55.0,50.0,44.0,48.0,58.0,56.0,56.0,54.0,41.0,43.0,50.0,50.0,49.0,55.0,59.0,49.0,52.0,50.0,48.0,52.0,59.0,56.0,54.0,54.0,48.0,53.0,55.0,46.0,45.0,51.0,52.0,47.0,54.0,48.0,55.0,61.0,64.0,56.0,1.0
2589,253.0,254.0,251.0,247.0,250.0,251.0,253.0,253.0,254.0,253.0,250.0,254.0,254.0,246.0,182.0,178.0,177.0,144.0,119.0,105.0,106.0,112.0,124.0,128.0,139.0,144.0,140.0,142.0,161.0,153.0,172.0,211.0,247.0,249.0,240.0,234.0,243.0,217.0,194.0,160.0,...,97.0,36.0,10.0,10.0,12.0,17.0,24.0,32.0,42.0,56.0,66.0,76.0,53.0,53.0,52.0,62.0,62.0,68.0,68.0,70.0,78.0,75.0,84.0,76.0,60.0,63.0,54.0,57.0,60.0,67.0,70.0,68.0,48.0,69.0,66.0,63.0,48.0,38.0,37.0,4.0


In [14]:
predProb

[[3.0, 3.0, 1.0],
 [4.0, 3.0, 1.0],
 [4.0, 4.0, 1.0],
 [6.0, 6.0, 1.0],
 [4.0, 4.0, 1.0],
 [1.0, 0.0, 1.0],
 [0.0, 0.0, 1.0],
 [6.0, 6.0, 1.0],
 [3.0, 3.0, 1.0],
 [1.0, 1.0, 1.0],
 [4.0, 4.0, 1.0],
 [5.0, 4.0, 1.0],
 [5.0, 5.0, 1.0],
 [2.0, 2.0, 1.0],
 [0.0, 0.0, 1.0],
 [6.0, 6.0, 1.0],
 [2.0, 0.0, 1.0],
 [3.0, 3.0, 1.0],
 [2.0, 2.0, 1.0],
 [0.0, 4.0, 1.0],
 [0.0, 0.0, 1.0],
 [4.0, 4.0, 1.0],
 [0.0, 4.0, 1.0],
 [3.0, 3.0, 1.0],
 [6.0, 2.0, 1.0],
 [2.0, 6.0, 1.0],
 [6.0, 6.0, 1.0],
 [4.0, 6.0, 1.0],
 [0.0, 0.0, 1.0],
 [1.0, 6.0, 0.1353352832366127],
 [0.0, 2.0, 1.0],
 [2.0, 2.0, 1.0],
 [3.0, 3.0, 1.0],
 [0.0, 0.0, 1.0],
 [0.0, 0.0, 1.0],
 [4.0, 4.0, 1.0],
 [6.0, 3.0, 1.0],
 [3.0, 5.0, 1.0],
 [4.0, 4.0, 1.0],
 [4.0, 3.0, 1.0],
 [2.0, 2.0, 1.0],
 [4.0, 4.0, 1.0],
 [0.0, 0.0, 1.0],
 [1.0, 1.0, 1.0],
 [6.0, 4.0, 1.0],
 [4.0, 0.0, 1.0],
 [4.0, 4.0, 1.0],
 [2.0, 5.0, 1.0],
 [5.0, 3.0, 1.0],
 [0.0, 3.0, 1.0],
 [0.0, 0.0, 1.0],
 [5.0, 0.0, 0.1353352832366127],
 [1.0, 0.0, 1.0],
 [0.0, 4.0, 1.0]

---

Save data from generated markov chain

---

In [15]:
markov.to_csv("/content/drive/MyDrive/DM/SVMBMSamplesPascal.csv")



---


Creating a new list for predicted probability and appending new column or field to save the probability in the markov chain generated 

---



In [16]:
prob=[]
for i in predProb:
  
    prob.append(i[2])

markov['probability']=prob

---


Save the Predicted Probability


---


In [17]:
markov.to_csv("/content/drive/MyDrive/DM/SVMBMSamplesPascalProbability.csv")

In [18]:
for i in lst:
    pascal=pascal.drop([i])
pascal.to_csv('/content/drive/MyDrive/DM/SVMBMremainingPascal.csv')

#SVM

In [19]:
train = pd.read_csv("/content/drive/MyDrive/DM/SVMBMSamplesPascal.csv")
test = pd.read_csv("/content/drive/MyDrive/DM/SVMBMremainingPascal.csv")

In [21]:
train = train.drop(train.columns[[0]], axis=1)
test = test.drop(test.columns[[0]], axis=1)

In [22]:
X_train = train.drop("label", axis = 1)
y_train = train["label"]

X_test = test.drop("label", axis = 1)
y_test = test["label"]


---


Accuracy for all the kernels - linear, rbf, chi-squared, hellinger, intersection



---



**Linear kernel**



In [23]:
model_linear = SVC(kernel='linear')
model_linear.fit(X_train, y_train)

y_pred = model_linear.predict(X_test)
print("The accuracy obtained for linear kernel is:", metrics.accuracy_score(y_true=y_test, y_pred=y_pred), "\n")

The accuracy obtained for linear kernel is: 0.24273858921161826 



**RBF kernel**

In [24]:
model_linear = SVC(kernel='rbf')
model_linear.fit(X_train, y_train)

y_pred = model_linear.predict(X_test)
print("The accuracy obtained for RBF kernel is:", metrics.accuracy_score(y_true=y_test, y_pred=y_pred), "\n")

The accuracy obtained for RBF kernel is: 0.33070539419087136 



**Chi-squared kernel**

In [25]:
from sklearn.metrics.pairwise import chi2_kernel

model_linear = SVC(kernel=chi2_kernel)
model_linear.fit(X_train, y_train)

y_pred = model_linear.predict(X_test)
print("The accuracy obtained for Chi-squared kernel is:", metrics.accuracy_score(y_true=y_test, y_pred=y_pred), "\n")

The accuracy obtained for Chi-squared kernel is: 0.23983402489626557 



**Hellinger kernel**

In [26]:
def hellinger(X1, X2):
  return np.sqrt(np.dot(X1,X2.T))
   

model_linear = SVC(kernel=hellinger)
model_linear.fit(X_train, y_train)

y_pred = model_linear.predict(X_test)
print("The accuracy obtained for hellinger kernel is:", metrics.accuracy_score(y_true=y_test, y_pred=y_pred), "\n")

The accuracy obtained for hellinger kernel is: 0.1892116182572614 



**Intersection kernel**

In [None]:
# Due to time and resources constraint, were unable to run for this kernel (Intersection Kernel)
from sklearn.metrics.pairwise import euclidean_distances

def intersection(X1,X2):

  result = np.zeros((X1.shape[0],X2.shape[0]))
  X2=X2.T

  for i in range(len(X1)):
    for j in range(len(X2[0])):
      val=float('+inf')
      for k in range(len(X2)):
        val = min(val,X1[i][k] * X2[k][j])

      result[i][j]=val

  return result
 
model_linear = SVC(kernel=intersection)
model_linear.fit(X_train, y_train)

y_pred = model_linear.predict(X_test)
print("The accuracy obtained for Intersection Kernel is:", metrics.accuracy_score(y_true=y_test, y_pred=y_pred), "\n")

