In [46]:
import pandas as pd
import numpy as np
import csv
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import math
from sklearn.cluster import KMeans
import matplotlib.pyplot
from matplotlib import pyplot as plt

In [47]:
def getHumanObservedData_From_Files():
    features_df = pd.read_csv('HumanObserved-Dataset/HumanObserved-Features-Data/HumanObserved-Features-Data.csv')
    diffn_pairs_df = pd.read_csv('HumanObserved-Dataset/HumanObserved-Features-Data/diffn_pairs.csv')    
    same_pairs_df = pd.read_csv('HumanObserved-Dataset/HumanObserved-Features-Data/same_pairs.csv')
    features_df.drop(['Unnamed: 0'],axis=1,inplace=True)
    return features_df, same_pairs_df, diffn_pairs_df

In [48]:
def getGSCData_From_Files():
    features_df = pd.read_csv('GSC-Dataset/GSC-Features-Data/GSC-Features.csv')
    diffn_pairs_df = pd.read_csv('GSC-Dataset/GSC-Features-Data/diffn_pairs.csv')
    same_pairs_df = pd.read_csv('GSC-Dataset/GSC-Features-Data/same_pairs.csv')
    return features_df, same_pairs_df, diffn_pairs_df

In [49]:
def getHumanObservedData_Concatenated():
    features_df, same_pairs_df, diffn_pairs_df = getHumanObservedData_From_Files()    
    concat_same_df = pd.DataFrame()

    concat_same_df['img_id_A']=same_pairs_df.img_id_A
    concat_same_df['img_id_B']=same_pairs_df.img_id_B
    concat_same_df=pd.merge(concat_same_df,features_df,left_on='img_id_A', right_on='img_id')
    concat_same_df=pd.merge(concat_same_df,features_df,left_on='img_id_B', right_on='img_id',suffixes=('_a', '_b'))
    concat_same_df.drop(['img_id_a','img_id_b'],axis=1,inplace=True)
    #concat_same_df=pd.merge(concat_same_df,same_pairs_df)
    concat_same_df['target']=same_pairs_df.target
    
    concat_diffn_df=pd.DataFrame()

    concat_diffn_df['img_id_A']=diffn_pairs_df.img_id_A
    concat_diffn_df['img_id_B']=diffn_pairs_df.img_id_B
    concat_diffn_df=pd.merge(concat_diffn_df,features_df,left_on='img_id_A', right_on='img_id')
    concat_diffn_df=pd.merge(concat_diffn_df,features_df,left_on='img_id_B', right_on='img_id',suffixes=('_a', '_b'))
    concat_diffn_df.drop(['img_id_a','img_id_b'],axis=1,inplace=True)
    #concat_diffn_df=pd.merge(concat_diffn_df,diffn_pairs_df)
    concat_diffn_df['target'] = diffn_pairs_df.target

    
    concat_full_df=concat_same_df.append(concat_diffn_df.iloc[0:800,:])
    concat_full_df = concat_full_df.sample(frac=1).reset_index(drop=True)
    
    X_train_and_cv_concat, X_test_concat, y_train_and_cv_concat, y_test_concat = train_test_split(concat_full_df.iloc[:,2:-1], concat_full_df['target'].values, test_size=0.2)
    X_train_concat, X_cv_concat, y_train_concat, y_cv_concat = train_test_split(X_train_and_cv_concat, y_train_and_cv_concat, test_size=0.25)

    return X_train_concat, y_train_concat,  X_cv_concat, y_cv_concat, X_test_concat, y_test_concat
    

In [50]:
def getHumanObservedData_Subtracted():
    features_df, same_pairs_df, diffn_pairs_df = getHumanObservedData_From_Files()   
    subtraction_same_df=pd.DataFrame()

    subtraction_same_df['img_id_A']=same_pairs_df.img_id_A
    subtraction_same_df['img_id_B']=same_pairs_df.img_id_B
    subtraction_same_df=pd.merge(subtraction_same_df,features_df,left_on='img_id_A', right_on='img_id')
    subtraction_same_df.drop(['img_id'],axis=1,inplace=True)
    subtraction_same_df.iloc[:,1:].subtract(features_df.iloc[:,1:])
    subtraction_same_df=pd.merge(subtraction_same_df,same_pairs_df)
    
    subtraction_diffn_df=pd.DataFrame()

    subtraction_diffn_df['img_id_A']=diffn_pairs_df.img_id_A
    subtraction_diffn_df['img_id_B']=diffn_pairs_df.img_id_B
    subtraction_diffn_df=pd.merge(subtraction_diffn_df,features_df,left_on='img_id_A', right_on='img_id')
    subtraction_diffn_df.drop(['img_id'],axis=1,inplace=True)
    subtraction_diffn_df.iloc[:,1:].subtract(features_df.iloc[:,1:])
    subtraction_diffn_df=pd.merge(subtraction_diffn_df,diffn_pairs_df)
    
    subtraction_full_df=subtraction_same_df.append(subtraction_diffn_df.iloc[0:800,:])
    subtraction_full_df = subtraction_full_df.sample(frac=1).reset_index(drop=True)
    
    X_train_and_cv_subtract, X_test_subtract, y_train_and_cv_subtract, y_test_subtract = train_test_split(subtraction_full_df.iloc[:,2:-1], subtraction_full_df['target'].values, test_size=0.2)
    X_train_subtract, X_cv_subtract, y_train_subtract, y_cv_subtract = train_test_split(X_train_and_cv_subtract, y_train_and_cv_subtract, test_size=0.25)

    return X_train_subtract,  y_train_subtract, X_cv_subtract, y_cv_subtract, X_test_subtract, y_test_subtract

In [51]:
def getGSCData_Concatenated():
    features_df, same_pairs_df, diffn_pairs_df = getGSCData_From_Files()  
    concat_same_df=pd.DataFrame()

    concat_same_df['img_id_A']=same_pairs_df.img_id_A
    concat_same_df['img_id_B']=same_pairs_df.img_id_B
    concat_same_df=pd.merge(concat_same_df,features_df,left_on='img_id_A', right_on='img_id')
    concat_same_df=pd.merge(concat_same_df,features_df,left_on='img_id_B', right_on='img_id',suffixes=('_a', '_b'))
    concat_same_df.drop(['img_id_a','img_id_b'],axis=1,inplace=True)
    concat_same_df=pd.merge(concat_same_df,same_pairs_df)
    
    concat_diffn_df=pd.DataFrame()

    concat_diffn_df['img_id_A']=diffn_pairs_df.img_id_A
    concat_diffn_df['img_id_B']=diffn_pairs_df.img_id_B
    concat_diffn_df=pd.merge(concat_diffn_df,features_df,left_on='img_id_A', right_on='img_id')
    concat_diffn_df=pd.merge(concat_diffn_df,features_df,left_on='img_id_B', right_on='img_id',suffixes=('_a', '_b'))
    concat_diffn_df.drop(['img_id_a','img_id_b'],axis=1,inplace=True)
    concat_diffn_df=pd.merge(concat_diffn_df,diffn_pairs_df)
    
    concat_full_df=concat_same_df.append(concat_diffn_df.iloc[0:72000,:])
    
    concat_full_df = concat_full_df.sample(frac=1).reset_index(drop=True)
    
    X_train_and_cv_concat, X_test_concat, y_train_and_cv_concat, y_test_concat = train_test_split(concat_full_df.iloc[:,2:-1], concat_full_df['target'].values, test_size=0.2)
    X_train_concat, X_cv_concat, y_train_concat, y_cv_concat = train_test_split(X_train_and_cv_concat, y_train_and_cv_concat, test_size=0.25)

    return X_train_concat, y_train_concat, X_cv_concat, y_cv_concat, X_test_concat, y_test_concat
    

In [52]:
def getGSCData_Subtracted():
    features_df, same_pairs_df, diffn_pairs_df = getGSCData_From_Files() 
    subtraction_same_df=pd.DataFrame()

    subtraction_same_df['img_id_A']=same_pairs_df.img_id_A
    subtraction_same_df['img_id_B']=same_pairs_df.img_id_B
    subtraction_same_df=pd.merge(subtraction_same_df,features_df,left_on='img_id_A', right_on='img_id')
    subtraction_same_df.drop(['img_id'],axis=1,inplace=True)
    subtraction_same_df.iloc[:,1:].subtract(features_df.iloc[:,1:])
    subtraction_same_df=pd.merge(subtraction_same_df,same_pairs_df)


    subtraction_diffn_df=pd.DataFrame()

    subtraction_diffn_df['img_id_A']=diffn_pairs_df.img_id_A
    subtraction_diffn_df['img_id_B']=diffn_pairs_df.img_id_B
    subtraction_diffn_df=pd.merge(subtraction_diffn_df,features_df,left_on='img_id_A', right_on='img_id')
    subtraction_diffn_df.drop(['img_id'],axis=1,inplace=True)
    subtraction_diffn_df.iloc[:,1:].subtract(features_df.iloc[:,1:])
    subtraction_diffn_df=pd.merge(subtraction_diffn_df,diffn_pairs_df)

    subtraction_full_df = subtraction_same_df.append(subtraction_diffn_df.iloc[0:72000,:])
    subtraction_full_df = subtraction_full_df.sample(frac=1).reset_index(drop=True)
    
    X_train_and_cv_subtract, X_test_subtract, y_train_and_cv_subtract, y_test_subtract = train_test_split(subtraction_full_df.iloc[:,2:-1], subtraction_full_df['target'].values, test_size=0.2)
    X_train_subtract, X_cv_subtract, y_train_subtract, y_cv_subtract = train_test_split(X_train_and_cv_subtract, y_train_and_cv_subtract, test_size=0.25)
    
    return X_train_subtract,  y_train_subtract, X_cv_subtract, y_cv_subtract, X_test_subtract, y_test_subtract    

In [53]:
def insert_intercept(X):
    intercept = np.ones((X.shape[0],1))
    return np.concatenate((intercept,X),axis=1)

def initialize_weights(X):
    w = np.zeros(X.shape[1])
    return w

def sigmoid(z):
    return 1 /(1 + np.exp(-z))

def logistic_loss(h, y):
        return (-y * np.log(h) - (1 - y) * np.log(1 - h)).mean()
    
def GetErms(VAL_TEST_OUT,ValDataAct):
    sum = 0.0
    t=0
    accuracy = 0.0
    counter = 0
    val = 0.0
    for i in range (0,len(VAL_TEST_OUT)):
        sum = sum + math.pow((ValDataAct[i] - VAL_TEST_OUT[i]),2)
        if(int(np.around(VAL_TEST_OUT[i], 0)) == ValDataAct[i]):
            counter+=1
    accuracy = (float((counter*100))/float(len(VAL_TEST_OUT)))
    return (str(accuracy) + ',' +  str(math.sqrt(sum/len(VAL_TEST_OUT))))

def predict(h, threshold):
    return (h >= threshold)
    
def accuracy(predicted,actual):
    correct = np.ones(predicted.shape[0])[predicted==actual]
    return (correct.sum()/predicted.shape[0])

def gradient_descent(X,y,w,alpha=0.01,reg_param=0,iterations=10000):
    loss=[]
    erms=[]
    for i in tqdm(range(0,iterations)):
        z=np.dot(X,w)       
        diff_vector = sigmoid(z) - y
        delta_E = np.dot(X.T,diff_vector)/X.shape[0]        
        reg_term = np.dot(reg_param,w[1:])
        update_term_0 = -np.dot((alpha/X.shape[0]),delta_E[0])
        update_term = -np.dot((alpha/X.shape[0]),np.add(delta_E[1:],reg_term))
        w[0]=np.add(w[0],update_term[0])
        w[1:]=np.add(w[1:],update_term)        
        update_term = alpha*delta_E
        w-=update_term
    return w





In [54]:
def LogisticRegression(X,y,learning_rate=0.01,la=0,iterate=1000000):
    w = initialize_weights(X)
    W_trained = gradient_descent(X,y,w,alpha=learning_rate,reg_param=la,iterations=iterate)
    return W_trained
    

In [55]:
def get_logisticRegression_metrics(X,y,W_trained):
    h = sigmoid(np.dot(X,W_trained))  # Replace W_Now by W_concat_trained
    predictions = predict(h, 0.5)
    erms = GetErms(h,y).split(',')[1]
    loss_logistic = logistic_loss(h,y)
    acc = accuracy(predictions,y)
    return erms,loss_logistic,acc

def GetErms(VAL_TEST_OUT,ValDataAct):
    sum = 0.0
    t=0
    accuracy = 0.0
    counter = 0
    val = 0.0 
    for i in range (0,len(VAL_TEST_OUT)):
        sum = sum + math.pow((ValDataAct[i] - VAL_TEST_OUT[i]),2)
        if(int(np.around(VAL_TEST_OUT[i], 0)) == ValDataAct[i]):
            counter+=1
    accuracy = (float((counter*100))/float(len(VAL_TEST_OUT)))
    ##print ("Accuracy Generated..")
    ##print ("Validation E_RMS : " + str(math.sqrt(sum/len(VAL_TEST_OUT))))
    return (str(accuracy) + ',' +  str(math.sqrt(sum/len(VAL_TEST_OUT))))

### Human Observed Data Set

#### Logistic Regression Model:

##### Concatenated

In [56]:
X_train_hod_con,y_train_hod_con,X_cv_hod_con,y_cv_hod_con,X_test_hod_con,y_test_hod_con = getHumanObservedData_Concatenated()

X_train_hod_con = insert_intercept(X_train_hod_con)
X_cv_hod_con = insert_intercept(X_cv_hod_con)
X_test_hod_con = insert_intercept(X_test_hod_con)

In [57]:
W_trained_hod_con = LogisticRegression(X_train_hod_con,y_train_hod_con)

100%|██████████| 1000000/1000000 [00:43<00:00, 23182.50it/s]


In [58]:
train_loss,train_loss_logistic,train_accuracy = get_logisticRegression_metrics(X_train_hod_con,y_train_hod_con,W_trained_hod_con)
cv_loss,cv_loss_logistic,cv_accuracy = get_logisticRegression_metrics(X_cv_hod_con,y_cv_hod_con,W_trained_hod_con)
test_loss,test_loss_logistic,test_accuracy = get_logisticRegression_metrics(X_test_hod_con,y_test_hod_con,W_trained_hod_con)

print(f'Logistic Regression Loss On Training Data: {train_loss_logistic}')
print(f'ERMS Loss On Training Data: {train_loss}\nAccuracy On Training Data: {train_accuracy}')
print(f'Loss On CV Data: {cv_loss}\nAccuracy On CV Data: {cv_accuracy}')
print(f'Loss On Test Data: {test_loss}\nAccuracy On Test Data: {test_accuracy}')



Logistic Regression Loss On Training Data: 0.36788413820929255
ERMS Loss On Training Data: 0.33806033157156323
Accuracy On Training Data: 0.8542976939203354
Loss On CV Data: 0.33400884972241557
Accuracy On CV Data: 0.8710691823899371
Loss On Test Data: 0.31899720154649436
Accuracy On Test Data: 0.890282131661442


##### Subtracted

In [80]:
X_train_hod_sub,y_train_hod_sub,X_cv_hod_sub,y_cv_hod_sub,X_test_hod_sub,y_test_hod_sub = getHumanObservedData_Subtracted()

X_train_hod_sub = insert_intercept(X_train_hod_sub)
X_cv_hod_sub  = insert_intercept(X_cv_hod_sub)
X_test_hod_sub  = insert_intercept(X_test_hod_sub)


In [81]:
W_trained_hod_sub = LogisticRegression(X_train_hod_sub,y_train_hod_sub)

100%|██████████| 1000000/1000000 [00:38<00:00, 25949.38it/s]


In [82]:
train_loss,train_loss_logistic,train_accuracy = get_logisticRegression_metrics(X_train_hod_sub,y_train_hod_sub,W_trained_hod_sub)
cv_loss,cv_loss_logistic,cv_accuracy = get_logisticRegression_metrics(X_cv_hod_sub,y_cv_hod_sub,W_trained_hod_sub)
test_loss,test_loss_logistic,test_accuracy = get_logisticRegression_metrics(X_test_hod_sub,y_test_hod_sub,W_trained_hod_sub)

print(f'Logistic Regression Loss On Training Data: {train_loss_logistic}')
print(f'Loss On Training Data: {train_loss}\nAccuracy On Training Data: {train_accuracy}')
print(f'Loss On CV Data: {cv_loss}\nAccuracy On CV Data: {cv_accuracy}')
print(f'Loss On Test Data: {test_loss}\nAccuracy On Test Data: {test_accuracy}')

Logistic Regression Loss On Training Data: nan
Loss On Training Data: 0.21913370043476285
Accuracy On Training Data: 0.9433962264150944
Loss On CV Data: 0.20955312505760454
Accuracy On CV Data: 0.949685534591195
Loss On Test Data: 0.20193281019609147
Accuracy On Test Data: 0.9498432601880877


  del sys.path[0]
  del sys.path[0]


#### Linear Regression Model:

In [62]:
def GenerateBigSigma(Data, MuMatrix,TrainingPercent,IsSynthetic=False):
    BigSigma    = np.zeros((len(Data),len(Data)))
    DataT       = np.transpose(Data)
    TrainingLen = math.ceil(len(DataT)*(TrainingPercent*0.01))        
    varVect     = []
    for i in range(0,len(DataT[0])):
        vct = []
        for j in range(0,int(TrainingLen)):
            vct.append(Data[i][j])    
        varVect.append(np.var(vct))
    
    for j in range(len(Data)):
        BigSigma[j][j] = varVect[j]
    if IsSynthetic == True:
        BigSigma = np.dot(3,BigSigma)
    else:
        BigSigma = np.dot(200,BigSigma)
    ##print ("BigSigma Generated..")
    return BigSigma

def GetScalar(DataRow,MuRow, BigSigInv):  
    R = np.subtract(DataRow,MuRow)
    T = np.dot(BigSigInv,np.transpose(R))  
    L = np.dot(R,T)
    return L

def GetRadialBasisOut(DataRow,MuRow, BigSigInv):    
    phi_x = math.exp(-0.5*GetScalar(DataRow,MuRow,BigSigInv))
    return phi_x

def GetPhiMatrix(Data, MuMatrix, BigSigma, TrainingPercent = 100):
    DataT = np.transpose(Data)
    TrainingLen = math.ceil(len(DataT)*(TrainingPercent*0.01))         
    PHI = np.zeros((int(TrainingLen),len(MuMatrix))) 
    BigSigInv = np.linalg.pinv(BigSigma)
    for  C in range(0,len(MuMatrix)):
        for R in range(0,int(TrainingLen)):
            PHI[R][C] = GetRadialBasisOut(DataT[R], MuMatrix[C], BigSigInv)
    #print ("PHI Generated..")
    return PHI

def GetValTest(VAL_PHI,W):
    Y = np.dot(W,np.transpose(VAL_PHI))
    ##print ("Test Out Generated..")
    return Y



In [63]:
def gradient_descent_lin_reg(TRAINING_PHI,y,m,alpha=0.01,reg_param=2,iterations=10000):

    W_Now        = np.zeros((m))
    La           = reg_param
    learningRate = alpha
    print(TRAINING_PHI.shape)
    for i in tqdm(range(0,iterations)):
        #print ('---------Iteration: ' + str(i) + '--------------') 
        Delta_E_D     = -np.dot(TRAINING_PHI.T,(y - np.dot(TRAINING_PHI,W_Now))) # Computes the vectorised form of partial derivative of cost function with respect to the weights.  
        La_Delta_E_W  = np.dot(La,W_Now) #Computes the partial derivate of regularization term
        Delta_E       = np.add(Delta_E_D,La_Delta_E_W)  # Sums up above two derivatives to give the complete derivate of a regularized cost function.
        Delta_W       = -np.dot(learningRate,Delta_E) # Multiplies the derivative with learning rate
        W_T_Next      = W_Now + (Delta_W/y.shape[0]) #Updates the weights by adding the above calculated derivative.
        W_Now         = W_T_Next
    return W_Now

In [67]:
def print_metrics_lin_reg(PHI,y,W,name):
    Prediction    = GetValTest(PHI,W) 
    Erms          = GetErms(Prediction,y)
    L_Erms        = float(Erms.split(',')[1])
    print("E_rms " + name + ": " + str(np.around(L_Erms,5)))
    return

In [68]:
def LinearRegression(X_train,y_train,X_cv,y_cv,X_test,y_test,m=10,learning_rate=0.01,la=2,iterate=1000000):
    
    kmeans = KMeans(n_clusters=m, random_state=0).fit(X_train) # Cluster the data to create centroids (mean values) which will be used to compute gaussian kernals 
    Mu = kmeans.cluster_centers_ #matrix containing the mean values of features for every data point in each cluster
    print(Mu.shape)
    BigSigma     = GenerateBigSigma(X_train.T, Mu, 100) # compute the covariance Matrix
    TRAINING_PHI = GetPhiMatrix(X_train.T, Mu, BigSigma, 100) # Compute the Gaussian radial basis for training data.
    TEST_PHI     = GetPhiMatrix(X_test.T, Mu, BigSigma, 100) 
    VAL_PHI      = GetPhiMatrix(X_cv.T, Mu, BigSigma, 100)
    W_trained    = gradient_descent_lin_reg(TRAINING_PHI,y_train,m,alpha=learning_rate,reg_param=la,iterations=iterate)
   
    train_pred = np.dot(TRAINING_PHI,W_trained)
    cv_pred = np.dot(VAL_PHI,W_trained)
    test_pred = np.dot(TEST_PHI,W_trained)
    
    print("Train Accuracy: "+str(accuracy(train_pred>=0.5,y_train)))
    print("CV Accuracy: "+str(accuracy(cv_pred>=0.5,y_cv)))
    print("Test Accuracy: "+str(accuracy(test_pred>=0.5,y_test)))
             
    print_metrics_lin_reg(TRAINING_PHI,y_train,W_trained,"TRAINING")
    print_metrics_lin_reg(VAL_PHI,y_cv,W_trained,"VALIDATION")
    print_metrics_lin_reg(TEST_PHI,y_test,W_trained,"TEST")
    return W_trained

##### Concatenate Data

In [69]:
w_c = LinearRegression(X_train_hod_con[:,1:],y_train_hod_con,X_cv_hod_con[:,1:],y_cv_hod_con,X_test_hod_con[:,1:],y_test_hod_con)


  0%|          | 3605/1000000 [00:00<00:27, 36042.82it/s]

(10, 18)
(954, 10)


100%|██████████| 1000000/1000000 [00:22<00:00, 43656.82it/s]

Train Accuracy: 0.6750524109014675
CV Accuracy: 0.6383647798742138
Test Accuracy: 0.7053291536050157
E_rms TRAINING: 0.49051
E_rms VALIDATION: 0.49174
E_rms TEST: 0.48871





##### Subtracted Data

In [70]:
w_s = LinearRegression(X_train_hod_sub[:,1:],y_train_hod_sub,X_cv_hod_sub[:,1:],y_cv_hod_sub,X_test_hod_sub[:,1:],y_test_hod_sub)


  0%|          | 0/1000000 [00:00<?, ?it/s]

(10, 9)
(954, 10)


100%|██████████| 1000000/1000000 [00:22<00:00, 44529.68it/s]

Train Accuracy: 0.8427672955974843
CV Accuracy: 0.8113207547169812
Test Accuracy: 0.8369905956112853
E_rms TRAINING: 0.46094
E_rms VALIDATION: 0.46586
E_rms TEST: 0.46029





### GSC Data Set

#### Logistic Regression Model:

##### Concatenated

In [71]:
X_train_gsc_con,y_train_gsc_con,X_cv_gsc_con,y_cv_gsc_con,X_test_gsc_con,y_test_gsc_con = getGSCData_Concatenated()

X_train_gsc_con = insert_intercept(X_train_gsc_con)
X_cv_gsc_con = insert_intercept(X_cv_gsc_con)
X_test_gsc_con = insert_intercept(X_test_gsc_con)

In [72]:
W_trained_gsc_con = LogisticRegression(X_train_gsc_con,y_train_gsc_con,iterate=10000)

100%|██████████| 10000/10000 [33:54<00:00, 17.15it/s]    


In [73]:
train_loss, train_loss_logistic,train_accuracy = get_logisticRegression_metrics(X_train_gsc_con,y_train_gsc_con,W_trained_gsc_con)
cv_loss,cv_loss_logistic,cv_accuracy = get_logisticRegression_metrics(X_cv_gsc_con,y_cv_gsc_con,W_trained_gsc_con)
test_loss,test_loss_logistic,test_accuracy = get_logisticRegression_metrics(X_test_gsc_con,y_test_gsc_con,W_trained_gsc_con)

print(f'Logistic Regression Loss On Training Data: {train_loss_logistic}')
print(f'Loss On Training Data: {train_loss}\nAccuracy On Training Data: {train_accuracy}')
print(f'Loss On CV Data: {cv_loss}\nAccuracy On CV Data: {cv_accuracy}')
print(f'Loss On Test Data: {test_loss}\nAccuracy On Test Data: {test_accuracy}')

Logistic Regression Loss On Training Data: 0.35762210364924907
Loss On Training Data: 0.3275012269540261
Accuracy On Training Data: 0.8722682830534848
Loss On CV Data: 0.3305439936666008
Accuracy On CV Data: 0.8683898836480178
Loss On Test Data: 0.3290761184026051
Accuracy On Test Data: 0.8703800466785104


##### Subtracted

In [74]:
X_train_gsc_sub,y_train_gsc_sub,X_cv_gsc_sub,y_cv_gsc_sub,X_test_gsc_sub,y_test_gsc_sub = getGSCData_Subtracted()

X_train_gsc_sub = insert_intercept(X_train_gsc_sub)
X_cv_gsc_sub  = insert_intercept(X_cv_gsc_sub)
X_test_gsc_sub  = insert_intercept(X_test_gsc_sub)

In [75]:
X_train_gsc_sub

array([[1., 0., 1., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 1., 1., ..., 0., 0., 0.],
       ...,
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 1., ..., 0., 0., 0.],
       [1., 1., 1., ..., 0., 0., 0.]])

In [76]:
W_trained_gsc_sub = LogisticRegression(X_train_gsc_sub,y_train_gsc_sub,iterate=10000)

100%|██████████| 10000/10000 [11:21<00:00, 14.68it/s]  


In [77]:
train_loss,train_loss_logistic,train_accuracy = get_logisticRegression_metrics(X_train_gsc_sub,y_train_gsc_sub,W_trained_gsc_sub)
cv_loss,cv_loss_logistic,cv_accuracy = get_logisticRegression_metrics(X_cv_gsc_sub,y_cv_gsc_sub,W_trained_gsc_sub)
test_loss,test_Loss_logistic,test_accuracy = get_logisticRegression_metrics(X_test_gsc_sub,y_test_gsc_sub,W_trained_gsc_sub)

print(f'Logistic Regression Loss On Training Data: {train_loss_logistic}')
print(f'Loss On Training Data: {train_loss}\nAccuracy On Training Data: {train_accuracy}')
print(f'Loss On CV Data: {cv_loss}\nAccuracy On CV Data: {cv_accuracy}')
print(f'Loss On Test Data: {test_loss}\nAccuracy On Test Data: {test_accuracy}')

Logistic Regression Loss On Training Data: 0.35709949920804535
Loss On Training Data: 0.32449612369421893
Accuracy On Training Data: 0.8888734062565318
Loss On CV Data: 0.32802699333681967
Accuracy On CV Data: 0.8861213683550477
Loss On Test Data: 0.3262736664936357
Accuracy On Test Data: 0.8868220294701641


#### Linear Regression Model

##### Concatenated

In [78]:
w_c_gsc = LinearRegression(X_train_gsc_con[:,1:],y_train_gsc_con,X_cv_gsc_con[:,1:],y_cv_gsc_con,X_test_gsc_con[:,1:],y_test_gsc_con,iterate=10000)


(10, 1024)


  1%|          | 111/10000 [00:00<00:08, 1102.78it/s]

(86118, 10)


100%|██████████| 10000/10000 [00:09<00:00, 1105.68it/s]


Train Accuracy: 0.5192294293875844
CV Accuracy: 0.5176269769386191
Test Accuracy: 0.5193855157278713
E_rms TRAINING: 0.55057
E_rms VALIDATION: 0.55195
E_rms TEST: 0.55079


##### Subtracted

In [79]:
w_s_gsc = LinearRegression(X_train_gsc_sub[:,1:],y_train_gsc_sub,X_cv_gsc_sub[:,1:],y_cv_gsc_sub,X_test_gsc_sub[:,1:],y_test_gsc_sub,iterate=10000)


(10, 512)


  2%|▏         | 228/10000 [00:00<00:08, 1145.87it/s]

(86118, 10)


100%|██████████| 10000/10000 [00:08<00:00, 1117.52it/s]


Train Accuracy: 0.5545646670846978
CV Accuracy: 0.5523583919738034
Test Accuracy: 0.556588985264918
E_rms TRAINING: 0.51973
E_rms VALIDATION: 0.52044
E_rms TEST: 0.51994
