In [18]:
print()
print("*********************************")
print("* Logistic Regression (RoBERTa) *")
print("*********************************")


*********************************
* Logistic Regression (RoBERTa) *
*********************************


In [3]:
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt
import os
import math

In [4]:
train=pd.read_csv(f'data/roberta/roberta.train.csv')
train.iloc[:,-1:]=train.iloc[:,-1:].replace(0,-1)

In [5]:
test=pd.read_csv(f'data/roberta/roberta.test.csv')
test.iloc[:,-1:]=test.iloc[:,-1:].replace(0,-1)

In [6]:
x_train=train.iloc[:,:-1]
y_train=train.iloc[:,-1]
x_test=test.iloc[:,:-1]
y_test=test.iloc[:,-1]

In [7]:
def get_w_b(size):
    np.random.seed(42)
    w=np.random.normal(-0.01,0.01,size)
    w=np.array(w)
    b=np.random.normal(-0.01,0.01)
    return w,b

In [8]:
def predict(x,weights,bias):
    y_pred = []
    for i in x.values: 
        if (np.dot(weights.transpose(), i)+bias) <= 0:
            pred = -1 
        else:
            pred = 1
        y_pred.append(pred)
    return y_pred

In [9]:
def accuracy(y_pred,y):
    count=0
    for i in range(len(y)):
        if(y_pred[i]==y[i]):
            count+=1
    return count/len(y)

In [10]:
def fit(epochs,x_train,y_train,lr,w,b,c,x_developmental=None,y_developmental=None,flag=False):
    initial_lr=lr
    ilist=[]
    devlist=[]
    weight_dict={}
    bias_dict={}
    accuracy_dict={}
    updates=0
    index = np.arange(x_train.shape[0])
    for i in range(epochs):
        lr=initial_lr/(1+i)
        np.random.seed(i)
        np.random.shuffle(index)
        for j in index:
            x=x_train.iloc[j]
            y=y_train[j]
            if(y*(np.dot(w.transpose(),x)))<=1:
                w=w*(1-lr)+lr*c*y*x
                b=b*(1-lr)+lr*c*y
                updates+=1
            else:
                w=w*(1-lr)
                b=b*(1-lr)
                updates+=1
            weights=w
            bias=b
            learning_rate=lr
        if(flag):
            y_pred=predict(x_developmental,weights,bias)
            ilist.append(i)
            devacc=accuracy(y_pred,y_developmental.values.tolist())
            weight_dict[i+1]=w
            bias_dict[i+1]=b
            accuracy_dict[i+1]=devacc
            devlist.append(devacc)
    if(flag):
        return weights,bias,learning_rate,updates,weight_dict,bias_dict,accuracy_dict,ilist,devlist
        
    return weights,bias,learning_rate

In [11]:
length=int(x_train.shape[0] * 0.8)
new_x_train=x_train.head(length)
new_y_train=y_train.head(length)
x_val=x_train.tail(x_train.shape[0]-length)
y_val=y_train.tail(x_train.shape[0]-length)

In [12]:
def k_cross_validation(learning_rates,tradeoffs):
    epochs=20
    val_accuracy={}
    target=new_y_train
    features=new_x_train
    test_label=y_val
    test_features=x_val
    for lr in learning_rates:
        for c in tradeoffs:
            print(lr,c)
            w,b=get_w_b(features.shape[1])
            weights,bias,f_lr=fit(epochs,new_x_train,new_y_train,lr,w,b,c)
            y_pred_val=predict(x_val,weights,bias)
            validation_accuracy=accuracy(y_pred_val,y_val.values.tolist())
            print(validation_accuracy)
            val_accuracy[(lr,c)]=validation_accuracy

    return val_accuracy

In [13]:
learning_rates=[1,0.1,0.01,0.001,0.0001]
tradeoffs=[0.1,1,10,100,1000]

val_accuracy=k_cross_validation(learning_rates,tradeoffs)

1 0.1
0.5021156558533145
1 1
0.5021156558533145
1 10
0.5021156558533145
1 100
0.5021156558533145
1 1000
0.5021156558533145
0.1 0.1
0.4978843441466855
0.1 1
0.5021156558533145
0.1 10
0.5021156558533145
0.1 100
0.5021156558533145
0.1 1000
0.5021156558533145
0.01 0.1
0.4978843441466855
0.01 1
0.4978843441466855
0.01 10
0.5021156558533145
0.01 100
0.5021156558533145
0.01 1000
0.5021156558533145
0.001 0.1
0.5021156558533145
0.001 1
0.5021156558533145
0.001 10
0.4978843441466855
0.001 100
0.5035260930888575
0.001 1000
0.5126939351198871
0.0001 0.1
0.5021156558533145
0.0001 1
0.5021156558533145
0.0001 10
0.5021156558533145
0.0001 100
0.5063469675599436
0.0001 1000
0.7983074753173484


In [14]:
print("Validation accuracies using Logistic Regression:")
print()
for i in learning_rates:
    for j in tradeoffs:
        print("Crossvalidation for (learning rate, tradeoff):(",i,",",j,")")
        print("Test accuracy:", val_accuracy[(i,j)])
        print()

best_accuracy=-1
for i in learning_rates:
    for j in tradeoffs:
        if best_accuracy<val_accuracy[(i,j)]:
            best_accuracy=val_accuracy[(i,j)]
            best_lr=i
            best_c=j

print("Best learning rate using Logistic Regression:", best_lr)
print("Best tradeoff using Logistic Regression:", best_c)
print("Best validation accuracy using Logistic Regression:", best_accuracy)
print()

Validation accuracies using Logistic Regression:

Crossvalidation for (learning rate, tradeoff):( 1 , 0.1 )
Test accuracy: 0.5021156558533145

Crossvalidation for (learning rate, tradeoff):( 1 , 1 )
Test accuracy: 0.5021156558533145

Crossvalidation for (learning rate, tradeoff):( 1 , 10 )
Test accuracy: 0.5021156558533145

Crossvalidation for (learning rate, tradeoff):( 1 , 100 )
Test accuracy: 0.5021156558533145

Crossvalidation for (learning rate, tradeoff):( 1 , 1000 )
Test accuracy: 0.5021156558533145

Crossvalidation for (learning rate, tradeoff):( 0.1 , 0.1 )
Test accuracy: 0.4978843441466855

Crossvalidation for (learning rate, tradeoff):( 0.1 , 1 )
Test accuracy: 0.5021156558533145

Crossvalidation for (learning rate, tradeoff):( 0.1 , 10 )
Test accuracy: 0.5021156558533145

Crossvalidation for (learning rate, tradeoff):( 0.1 , 100 )
Test accuracy: 0.5021156558533145

Crossvalidation for (learning rate, tradeoff):( 0.1 , 1000 )
Test accuracy: 0.5021156558533145

Crossvalidatio

In [17]:
w,b=get_w_b(x_train.shape[1])
epochs=20
weights,bias,lr,updates,weight_dict,bias_dict,accuracy_dict,x_axis,y_axis=fit(epochs,x_train,y_train,best_lr,w,b,best_c,x_val,y_val,True)
best_epoch = max(accuracy_dict, key=lambda x: accuracy_dict[x])
best_weights=weight_dict[best_epoch]
best_bias=bias_dict[best_epoch]
print()
print("Best epoch number with best development accuracy using Logistic Regression:",best_epoch)
y_pred=predict(x_train,best_weights,best_bias)
print("Train set accuracy using Logistic Regression:",accuracy(y_pred,y_train.values.tolist()))
y_pred=predict(x_test,best_weights,best_bias)
print("Test set accuracy using Logistic Regression:",accuracy(y_pred,y_test.values.tolist()))


Best epoch number with best development accuracy using Logistic Regression: 16
Train set accuracy using Logistic Regression: 0.8030474040632054
Test set accuracy using Logistic Regression: 0.7965766951942067


In [16]:
eval_set=pd.read_csv(f'data/roberta/roberta.eval.anon.csv')
eval_x_train=eval_set.iloc[:,:-1]

y_pred=predict(eval_x_train,best_weights,best_bias)
for i in range(len(y_pred)):
    if y_pred[i]==-1:
        y_pred[i]=0
result=pd.DataFrame(y_pred,columns=['label'],dtype=int)
result.index.name='example_id'
result.to_csv('lr_roberta_results.csv')