In [10]:
import numpy as np 
import pandas as pd
from sklearn.cluster import KMeans
import csv
import math
import matplotlib.pyplot
from matplotlib import pyplot as plt
from sklearn.utils import shuffle
from sklearn.metrics import accuracy_score

maxAcc = 0.0
maxIter = 0
C_Lambda = 0.03
TrainingPercent = 80
ValidationPercent = 10
TestPercent = 10
M = 10#5#10
PHI = []
IsSynthetic = False
IsConcat = True
humanDataSampleSize=791
gscDataSampleSize=5000
humanFeaturesFile='HumanObserved-Features-Data.csv'
humanSamePairFile='same_pairs.csv'
humanDiffPairFile='diffn_pairs.csv'
gscFeatureFile = 'GSC-Features.csv'
gscSamePairFile='same_pairs_gsc.csv'
gscDiffPairFile='diffn_pairs_gsc.csv'

In [4]:
def processData(data,samePair,diffPair,sampleSize,isGSC):
    
    if(isGSC == False):
        data = pd.read_csv(data,index_col=0)
    else:
        data = pd.read_csv(data)


    samePair = pd.read_csv(samePair)


    diffPair = pd.read_csv(diffPair)


    merged1 = pd.merge(samePair, data, left_on = ['img_id_A'],right_on= ['img_id'],how = 'inner').drop(['img_id'], axis='columns')
    merged2 = pd.merge(samePair, data, left_on = ['img_id_B'],right_on= ['img_id'],how = 'inner').drop(['img_id','img_id_A','img_id_B','target'], axis='columns')

    merged3=pd.concat([merged1, merged2],axis=1)


    merged4=merged1 - merged2
    merged4=merged4.drop(['img_id_A','img_id_B','target'], axis='columns')
    merged4=pd.concat([merged4,merged1['target']],axis=1)



    diffmerged1 =pd.merge(diffPair, data, left_on = ['img_id_A'],right_on= ['img_id'],how = 'inner').drop(['img_id'], axis='columns')
    diffmerged2 = pd.merge(diffPair, data, left_on = ['img_id_B'],right_on= ['img_id'],how = 'inner').drop(['img_id','img_id_A','img_id_B','target'], axis='columns')


    diffmerged3=pd.concat([diffmerged1, diffmerged2],axis=1).sample(n=sampleSize)


    diffmerged4=diffmerged1 - diffmerged2
    diffmerged4=diffmerged4.drop(['img_id_A','img_id_B','target'], axis='columns')
    diffmerged4=pd.concat([diffmerged4,diffmerged1['target']],axis=1)
    diffmerged5=diffmerged4.sample(n=sampleSize).abs()


    if(isGSC == True):
        merged3=merged3.sample(n=sampleSize)
        subtractionFeatures=merged4.sample(n=sampleSize).abs()
    else:
        merged3=merged3
        subtractionFeatures=merged4.abs()

    concatFeatures=merged3.append(diffmerged3)    
    concatFeatures=shuffle(concatFeatures)
    print("Feature Concatenation:"+str(concatFeatures.shape))
    subtractionFeatures=subtractionFeatures.append(diffmerged5)
    subtractionFeatures=shuffle(subtractionFeatures)
    print("Feature Subtraction:"+str(subtractionFeatures.shape))
    return concatFeatures,subtractionFeatures

In [5]:
def GetTargetVector(features):

    target=features['target'].values.T.tolist()

    return target


def GenerateRawData(features,isConcat):
    
    #features = features.loc[:, (features != 0).any(axis=0)]
    if(isConcat == True):
        dataMatrix = features.drop(columns=['img_id_A','img_id_B','target']).as_matrix()
    else:
        dataMatrix = features.drop(columns=['target']).as_matrix()
    dataMatrix = np.transpose(dataMatrix)
    #print (dataMatrix)

    return dataMatrix

def GenerateTrainingTarget(rawTraining,TrainingPercent = 80):
    TrainingLen = int(math.ceil(len(rawTraining)*(TrainingPercent*0.01)))
    t           = rawTraining[:TrainingLen]
    #print(str(TrainingPercent) + "% Training Target Generated..")
    return t


def GenerateTrainingDataMatrix(rawData, TrainingPercent = 80):
    T_len = int(math.ceil(len(rawData[0])*0.01*TrainingPercent))
    d2 = rawData[:,0:T_len]
    #print(str(TrainingPercent) + "% Training Data Generated..")
    return d2

def GenerateValData(rawData, ValPercent, TrainingCount): 
    valSize = int(math.ceil(len(rawData[0])*ValPercent*0.01))
    V_End = TrainingCount + valSize
    dataMatrix = rawData[:,TrainingCount+1:V_End]
    #print (str(ValPercent) + "% Val Data Generated..")  
    return dataMatrix

def GenerateValTargetVector(rawData, ValPercent, TrainingCount): 
    valSize = int(math.ceil(len(rawData)*ValPercent*0.01))
    #print(rawData)
    V_End = TrainingCount + valSize
    t =rawData[TrainingCount+1:V_End]
    #print (str(ValPercent) + "% Val Target Data Generated..")
    #print(t)
    return t

In [6]:
def generateFeatureData(isConcat,data,samePair,diffPair,sampleSize,isGSC):
    concatFeatures,subtractionFeatures = processData(data,samePair,diffPair,sampleSize,isGSC)
    return  concatFeatures,subtractionFeatures

def generateData(features,isConcat):
    RawTarget = GetTargetVector(features)
    RawData   = GenerateRawData(features,isConcat)

    
    
    
    return  RawTarget,RawData  

#Creating Training dataset
def createTrainingDataset(RawTarget,RawData):
    TrainingTarget = np.array(GenerateTrainingTarget(RawTarget,TrainingPercent))
    TrainingData   = GenerateTrainingDataMatrix(RawData,TrainingPercent)
    print("TrainingTarget Data: "+str(TrainingTarget.shape))
    print("Training Data: "+str(TrainingData.shape))
    return TrainingTarget,TrainingData


#Prepare Validation data
def createValidationDataset(RawTarget,RawData,TrainingTarget):
    
    ValDataAct = np.array(GenerateValTargetVector(RawTarget,ValidationPercent, (len(TrainingTarget))))
    ValData    = GenerateValData(RawData,ValidationPercent, (len(TrainingTarget)))
    print("ValidationTarget Data: "+str(ValDataAct.shape))
    print("Validation Data: "+str(ValData.shape))
    return ValDataAct,ValData

#Prepare Testing data
def createTestingDataset(RawTarget,RawData,TrainingTarget,ValDataAct):
    
    TestDataAct = np.array(GenerateValTargetVector(RawTarget,TestPercent, (len(TrainingTarget)+len(ValDataAct))))
    TestData = GenerateValData(RawData,TestPercent, (len(TrainingTarget)+len(ValDataAct)))
    print("TestingTarget Data: "+str(TestDataAct.shape))
    print("Testing Data: "+str(TestData.shape))
    return TestDataAct,TestData


In [7]:
def sigmoid(features):
    return 1 / (1 + np.exp(-features))

def GetAccuracy(VAL_TEST_OUT,ValDataAct):
    sum = 0.0
    t=0
    accuracy = 0.0
    counter = 0
    val = 0.0
    valPred = []
    for i in range (0,len(VAL_TEST_OUT)):
    	valPred.append(float(np.around(VAL_TEST_OUT[i], 0)))

    
    accuracy = accuracy_score(ValDataAct, valPred)
 
    return (str(accuracy * 100))

In [8]:
#Feature Generation
concatFeatures,subtractionFeatures=generateFeatureData(IsConcat,humanFeaturesFile,humanSamePairFile
                                  ,humanDiffPairFile,humanDataSampleSize,False)
#Feature Concatenation
concatRawTarget,concatRawData = generateData(concatFeatures,IsConcat)
concatTrainingTarget,concatTrainingData=createTrainingDataset(concatRawTarget,concatRawData)
concatValidationTarget,concatValidationData=createValidationDataset(concatRawTarget,concatRawData,concatTrainingTarget)
concatTestDataAct,concatTestData=createTestingDataset(concatRawTarget,concatRawData,concatTrainingTarget,concatValidationTarget)

Feature Concatenation:(1582, 21)
Feature Subtraction:(1582, 10)
TrainingTarget Data: (1266,)
Training Data: (18, 1266)
ValidationTarget Data: (158,)
Validation Data: (18, 158)
TestingTarget Data: (157,)
Testing Data: (18, 157)


  if sys.path[0] == '':


In [67]:
def runLogisticRegression(featureType,datasetType,TrainingData,RawData,ValData,TestData,
                        TrainingTarget,ValDataAct,TestDataAct):
    
    W_Now        = np.zeros(RawData.shape[0])
    La           = 0.0001
    learningRate = 0.5
    L_Acc_Val   = []
    L_Acc_TR    = []
    L_Acc_Test  = []
    W_Mat        = []
    L_Acc_Test_Acc  = []
    iteration = []

    for i in range(0,400):
        
    
        #print ('---------Iteration: ' + str(i) + '--------------')
        pred = np.dot(np.transpose(W_Now),TrainingData)
        a = sigmoid(pred)
        error = a - TrainingTarget

        Delta_W = np.multiply(TrainingData,error) 

        Delta_W_Mean=Delta_W.sum(axis=1) / RawData.shape[0]

        Regularizer = np.dot(La,W_Now) / RawData.shape[0]
        Delta_W_Mean_Reg = np.add(Delta_W_Mean,Regularizer) 

        x=np.dot(learningRate,Delta_W_Mean_Reg)

        W_Next=W_Now - x

        W_Now=W_Next

        TR_TEST_OUT   = np.dot(np.transpose(W_Next),TrainingData)
        a_test = sigmoid(TR_TEST_OUT)
        Acc_TR       = GetAccuracy(a_test,TrainingTarget)
        L_Acc_TR.append(float(Acc_TR.split(',')[0]))


        VAL_TEST_OUT  = np.dot(np.transpose(W_Next),ValData)
        v_test = sigmoid(VAL_TEST_OUT) 
        Acc_Val      = GetAccuracy(v_test,ValDataAct)
        L_Acc_Val.append(float(Acc_Val.split(',')[0]))
    
        #-----------------TestingData Accuracy---------------------#
        TEST_OUT      = np.dot(np.transpose(W_Next),TestData)
        t_test = sigmoid(TEST_OUT)
        #print(t_test)
        Acc_Test = GetAccuracy(t_test,TestDataAct)
        #L_Erms_Test.append(float(Erms_Test.split(',')[1]))
        L_Acc_Test_Acc.append(float(Acc_Test.split(',')[0]))
        if(i%5==0):
            iteration.append(i)


    print ('----------Gradient Descent Solution--------------------')
    print ('UBITname      = APURBAMA')
    print ('Person Number = 50288705')
    print ('----------------------------------------------------')
    print ("------------------"+str(datasetType)+"----" + str(featureType)+ " Dataset------------------")
    print ('------------------Logistic Regression Model----------------------')
    print (" \nLambda  = 0.0001 \neta=0.5")
    print ("Training Accuracy  = " + str(np.around(max(L_Acc_TR),5)))
    print ("Validation Accuracy= " + str(np.around(max(L_Acc_Val),5)))

    print ("Testing Accuracy    = " + str(np.around(max(L_Acc_Test_Acc),5)))
    
    
 
    



In [68]:
#Running Logistic Regression on Human COncatenation Dataset
runLogisticRegression('Human','Feature Concatenation',concatTrainingData,concatRawData,concatValidationData
                    ,concatTestData,concatTrainingTarget,concatValidationTarget,concatTestDataAct)

----------Gradient Descent Solution--------------------
UBITname      = APURBAMA
Person Number = 50288705
----------------------------------------------------
------------------Feature Concatenation----Human Dataset------------------
------------------Logistic Regression Model----------------------
 
Lambda  = 0.0001 
eta=0.5
Training Accuracy  = 52.05371
Validation Accuracy= 55.6962
Testing Accuracy    = 57.96178


In [53]:

#Feature Subtraction
subRawTarget,subRawData = generateData(subtractionFeatures,False)
subTrainingTarget,subTrainingData=createTrainingDataset(subRawTarget,subRawData)
subValidationTarget,subValidationData=createValidationDataset(subRawTarget,subRawData,subTrainingTarget)
subTestDataAct,subTestData=createTestingDataset(subRawTarget,subRawData,subTrainingTarget,subValidationTarget)

TrainingTarget Data: (1266,)
Training Data: (9, 1266)
ValidationTarget Data: (158,)
Validation Data: (9, 158)
TestingTarget Data: (157,)
Testing Data: (9, 157)


  


In [69]:
#Running Logistic Regression on Human Subtraction Dataset
runLogisticRegression('Human','Feature Subtraction',subTrainingData,subRawData,subValidationData
                    ,subTestData,subTrainingTarget,subValidationTarget,subTestDataAct)

----------Gradient Descent Solution--------------------
UBITname      = APURBAMA
Person Number = 50288705
----------------------------------------------------
------------------Feature Subtraction----Human Dataset------------------
------------------Logistic Regression Model----------------------
 
Lambda  = 0.0001 
eta=0.5
Training Accuracy  = 53.08057
Validation Accuracy= 54.43038
Testing Accuracy    = 56.6879


In [57]:
#GSC Feature Generation
gscConcatFeatures,gscSubtractionFeatures=generateFeatureData(IsConcat,gscFeatureFile,gscSamePairFile
                                  ,gscDiffPairFile,gscDataSampleSize,True)


Feature Concatenation:(10000, 1027)
Feature Subtraction:(10000, 513)


In [58]:
#GSC Feature Concatenation
gscConcatRawTarget,gscConcatRawData = generateData(gscConcatFeatures,IsConcat)
gscConcatTrainingTarget,gscConcatTrainingData=createTrainingDataset(gscConcatRawTarget,gscConcatRawData)
gscConcatValidationTarget,gscConcatValidationData=createValidationDataset(gscConcatRawTarget,gscConcatRawData,gscConcatTrainingTarget)
gscConcatTestDataAct,gscConcatTestData=createTestingDataset(gscConcatRawTarget,gscConcatRawData,gscConcatTrainingTarget,gscConcatValidationTarget)

TrainingTarget Data: (8000,)
Training Data: (1024, 8000)
ValidationTarget Data: (999,)
Validation Data: (1024, 999)
TestingTarget Data: (999,)
Testing Data: (1024, 999)


  if sys.path[0] == '':


In [59]:
#Running Linear Regression on GSC Concatenation Dataset
runLogisticRegression('GSC','Feature Concatenation',gscConcatTrainingData,gscConcatRawData,gscConcatValidationData
                    ,gscConcatTestData,gscConcatTrainingTarget,gscConcatValidationTarget,gscConcatTestDataAct)

----------Gradient Descent Solution--------------------
UBITname      = APURBAMA
Person Number = 50288705
----------------------------------------------------
------------------Feature Concatenation----GSC Dataset------------------
------------------Logistic Regression Model----------------------
 
Lambda  = 0.0001 
eta=0.5
Training Accuracy  = 53.025
Validation Accuracy= 53.35335
Testing Accuracy    = 52.65265


In [60]:
#GSC Feature Subtraction
gscSubRawTarget,gscSubRawData = generateData(gscSubtractionFeatures,False)
gscSubTrainingTarget,gscSubTrainingData=createTrainingDataset(gscSubRawTarget,gscSubRawData)
gscSubValidationTarget,gscSubValidationData=createValidationDataset(gscSubRawTarget,gscSubRawData,gscSubTrainingTarget)
gscSubTestDataAct,gscSubTestData=createTestingDataset(gscSubRawTarget,gscSubRawData,gscSubTrainingTarget,gscSubValidationTarget)

TrainingTarget Data: (8000,)
Training Data: (512, 8000)
ValidationTarget Data: (999,)
Validation Data: (512, 999)
TestingTarget Data: (999,)
Testing Data: (512, 999)


  


In [61]:
#Running Linear Regression on GSC Subtraction Dataset
runLogisticRegression('GSC','Feature Subtraction',gscSubTrainingData,gscSubRawData,gscSubValidationData
                    ,gscSubTestData,gscSubTrainingTarget,gscSubValidationTarget,gscSubTestDataAct)

----------Gradient Descent Solution--------------------
UBITname      = APURBAMA
Person Number = 50288705
----------------------------------------------------
------------------Feature Subtraction----GSC Dataset------------------
------------------Logistic Regression Model----------------------
 
Lambda  = 0.0001 
eta=0.5
Training Accuracy  = 69.55
Validation Accuracy= 69.16917
Testing Accuracy    = 68.86887
