In [1]:
print()
print("****************************************")
print("* Average Perceptron (RoBERTa + TFIDF) *")
print("****************************************")


****************************************
* Average Perceptron (RoBERTa + TFIDF) *
****************************************


In [1]:
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt
import os
import csv

In [2]:
train=pd.read_csv(f'data/roberta/roberta.train.csv')
new_train=pd.read_csv(f'data/tfidf/tfidf.train.csv')
train.iloc[:,-1:]=train.iloc[:,-1:].replace(0,-1)
new_train.iloc[:,-1:]=new_train.iloc[:,-1:].replace(0,-1)

In [3]:
test=pd.read_csv(f'data/roberta/roberta.test.csv')
new_test=pd.read_csv(f'data/tfidf/tfidf.test.csv')
test.iloc[:,-1:]=test.iloc[:,-1:].replace(0,-1)
new_test.iloc[:,-1:]=new_test.iloc[:,-1:].replace(0,-1)

In [4]:
x_train=train.iloc[:,:-1]
y_train=train.iloc[:,-1]
x_test=test.iloc[:,:-1]
y_test=test.iloc[:,-1]
new_x_train=new_train.iloc[:,:-1]
new_y_train=new_train.iloc[:,-1]
new_x_test=new_test.iloc[:,:-1]
new_y_test=new_test.iloc[:,-1]

In [5]:
for i in range(x_train.shape[1]):
    col = x_train.columns[i]
    x_train.rename(columns = {col:'roberta_'+col}, inplace = True)
    
for i in range(new_x_train.shape[1]):
    col = new_x_train.columns[i]
    new_x_train.rename(columns = {col:'tfidf_'+col}, inplace = True)
    
col = new_x_train[new_x_train.columns]
x_train = x_train.join(col)

for i in range(x_test.shape[1]):
    col = x_test.columns[i]
    x_test.rename(columns = {col:'roberta_'+col}, inplace = True)
    
for i in range(new_x_test.shape[1]):
    col = new_x_test.columns[i]
    new_x_test.rename(columns = {col:'tfidf_'+col}, inplace = True)
    
col = new_x_test[new_x_test.columns]
x_test = x_test.join(col)

In [6]:
def get_w_b(size):
    np.random.seed(42)
    w=np.random.normal(-0.01,0.01,size)
    w=np.array(w)
    b=np.random.normal(-0.01,0.01)
    a_w=w
    a_b=b
    return w,b,a_w,a_b

In [7]:
def predict(x,weights,bias):
    y_pred = []
    for i in x.values: 
        if (np.dot(weights.transpose(), i)+bias) < 0:
            pred = -1 
        else:
            pred = 1
        y_pred.append(pred)
    return y_pred

In [8]:
def accuracy(y_pred,y):
    count=0
    for i in range(len(y)):
        if(y_pred[i]==y[i]):
            count+=1
    return count/len(y)

In [9]:
def fit(epochs,x_train,y_train,lr,w,b,a_w,a_b,x_developmental=None,y_developmental=None,flag=False):
    ilist=[]
    devlist=[]
    weight_dict={}
    accuracy_dict={}
    bias_dict={}
    updates=0
    a_updates=0
    index = np.arange(x_train.shape[0])
    for i in range(epochs):
        np.random.seed(i)
        np.random.shuffle(index)
        for j in index:
            x=x_train.iloc[j]
            y=y_train[j]
            if(y*(np.dot(w.transpose(),x)+b))<0:
                w=w+lr*y*x
                b=b+lr*y
                updates+=1
            a_w+=w
            a_b+=b
            a_updates+=1
            weights=w
            bias=b
        if(flag):
            y_pred=predict(x_developmental,a_w,a_b)
            ilist.append(i)
            devacc=accuracy(y_pred,y_developmental.values.tolist())
            weight_dict[i+1]=a_w
            accuracy_dict[i+1]=devacc
            bias_dict[i+1]=a_b
            devlist.append(devacc)
    if(flag):
        return weights,bias,a_w,a_b,updates,a_updates,weight_dict,bias_dict,accuracy_dict,ilist,devlist
    return weights,bias,a_w,a_b

In [10]:
length=int(x_train.shape[0] * 0.8)
new_x_train=x_train.head(length)
new_y_train=y_train.head(length)
x_val=x_train.tail(x_train.shape[0]-length)
y_val=y_train.tail(x_train.shape[0]-length)

In [15]:
learning_rates=[0.01,0.1,1]
epochs=30
val_accuracy={}
for lr in learning_rates:
    print(lr)
    w,b,a_w,a_b=get_w_b(new_x_train.shape[1])
    weights,bias,a_w,a_b=fit(epochs,new_x_train,new_y_train,lr,w,b,a_w,a_b)
    y_pred_val=predict(x_val,a_w,a_b)
    validation_accuracy=accuracy(y_pred_val,y_val.values.tolist())
    val_accuracy[lr]=validation_accuracy
    print('done')
print(val_accuracy)

0.01
done
0.1
done
1
done
{0.01: 0.8349788434414669, 0.1: 0.8321579689703809, 1: 0.8335684062059239}


In [16]:
best_lr = max(val_accuracy, key=lambda x: val_accuracy[x])
print("Best learning rate:", best_lr)
w,b,a_w,a_b=get_w_b(x_train.shape[1])
epochs=30
weights,bias,a_w,a_b,updates,a_updates,weight_dict,bias_dict,accuracy_dict,x_axis,y_axis=fit(epochs,x_train,y_train,best_lr,w,b,a_w,a_b,x_val,y_val,True)
best_epoch = max(accuracy_dict, key=lambda x: accuracy_dict[x])
best_weights=weight_dict[best_epoch]
best_bias=bias_dict[best_epoch]
print("Best epoch number with best development accuracy using averaged perceptron:",best_epoch)
y_pred=predict(x_train,best_weights,best_bias)
print("Train set accuracy using averaged perceptron:",round(accuracy(y_pred,y_train.values.tolist()),6))
y_pred=predict(x_test,best_weights,best_bias)
print("Test set accuracy using averaged perceptron:",round(accuracy(y_pred,y_test.values.tolist()),6))
print("Number of updates on training set using averaged perceptron:",updates)

Best learning rate: 0.01
Best epoch number with best development accuracy using averaged perceptron: 30
Train set accuracy using averaged perceptron: 0.86329
Test set accuracy using averaged perceptron: 0.811718
Number of updates on training set using averaged perceptron: 62041


In [17]:
eval_set1=pd.read_csv(f'data/roberta/roberta.eval.anon.csv')
eval_set2=pd.read_csv(f'data/tfidf/tfidf.eval.anon.csv')

eval_x_train1=eval_set1.iloc[:,:-1]
eval_x_train2=eval_set2.iloc[:,:-1]

for i in range(eval_x_train1.shape[1]):
    col = eval_x_train1.columns[i]
    eval_x_train1.rename(columns = {col:'roberta_'+col}, inplace = True)
    
for i in range(eval_x_train2.shape[1]):
    col = eval_x_train2.columns[i]
    eval_x_train2.rename(columns = {col:'tfidf_'+col}, inplace = True)
    
col = eval_x_train2[eval_x_train2.columns]
eval_x_train1 = eval_x_train1.join(col)

y_pred=predict(eval_x_train1,best_weights,best_bias)
for i in range(len(y_pred)):
    if y_pred[i]==-1:
        y_pred[i]=0
result=pd.DataFrame(y_pred,columns=['label'],dtype=int)
result.index.name='example_id'
result.to_csv('perceptron_roberta_tfidf_results.csv')