In [2]:
print()
print("****************************************")
print("* Boosted Perceptron (RoBERTa + TFIDF) *")
print("****************************************")


****************************************
* Boosted Perceptron (RoBERTa + TFIDF) *
****************************************


In [1]:
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt
import os
import csv
import math

In [2]:
train=pd.read_csv(f'data/roberta/roberta.train.csv')
new_train=pd.read_csv(f'data/tfidf/tfidf.train.csv')
train.iloc[:,-1:]=train.iloc[:,-1:].replace(0,-1)
new_train.iloc[:,-1:]=new_train.iloc[:,-1:].replace(0,-1)

In [3]:
test=pd.read_csv(f'data/roberta/roberta.test.csv')
new_test=pd.read_csv(f'data/tfidf/tfidf.test.csv')
test.iloc[:,-1:]=test.iloc[:,-1:].replace(0,-1)
new_test.iloc[:,-1:]=new_test.iloc[:,-1:].replace(0,-1)

In [4]:
x_train=train.iloc[:,:-1]
y_train=train.iloc[:,-1]
x_test=test.iloc[:,:-1]
y_test=test.iloc[:,-1]
new_x_train=new_train.iloc[:,:-1]
new_y_train=new_train.iloc[:,-1]
new_x_test=new_test.iloc[:,:-1]
new_y_test=new_test.iloc[:,-1]

In [5]:
for i in range(x_train.shape[1]):
    col = x_train.columns[i]
    x_train.rename(columns = {col:'roberta_'+col}, inplace = True)
    
for i in range(new_x_train.shape[1]):
    col = new_x_train.columns[i]
    new_x_train.rename(columns = {col:'tfidf_'+col}, inplace = True)
    
col = new_x_train[new_x_train.columns]
x_train = x_train.join(col)

for i in range(x_test.shape[1]):
    col = x_test.columns[i]
    x_test.rename(columns = {col:'roberta_'+col}, inplace = True)
    
for i in range(new_x_test.shape[1]):
    col = new_x_test.columns[i]
    new_x_test.rename(columns = {col:'tfidf_'+col}, inplace = True)
    
col = new_x_test[new_x_test.columns]
x_test = x_test.join(col)

In [6]:
def get_w_b(size):
    np.random.seed(42)
    w=np.random.normal(-0.01,0.01,size)
    w=np.array(w)
    b=np.random.normal(-0.01,0.01)
    return w,b

In [7]:
def predict(x,weights,bias):
    y_pred = []
    for i in x.values: 
        if (np.dot(weights.transpose(), i)+bias) < 0:
            pred = -1 
        else:
            pred = 1
        y_pred.append(pred)
    return y_pred

In [8]:
def accuracy(y_pred,y):
    count=0
    for i in range(len(y)):
        if(y_pred[i]==y[i]):
            count+=1
    return count/len(y)

In [9]:
def fit(epochs,x_train,y_train,lr,w,b):
    ilist=[]
    devlist=[]
    weight_dict={}
    accuracy_dict={}
    bias_dict={}
    index = np.arange(x_train.shape[0])
    for i in range(epochs):
        np.random.seed(i)
        np.random.shuffle(index)
        for j in index:
            x=x_train.iloc[j]
            y=y_train[j]
            if(y*(np.dot(w.transpose(),x)+b))<0:
                w=w+lr*y*x
                b=b+lr*y
            weights=w
            bias=b
    return weights,bias

In [10]:
def calculate_error(y_pred,y,d_list):
    error=0
    for i in range(len(y_pred)):
        if(y_pred[i]!=y[i]):
            error+=d_list[i]
    return error

In [11]:
def update_d_list(d_list,alpha,y_pred,y_train):
    for i in range(len(y_pred)):
        if(y_pred[i]==y_train[i]):
            d_list[i]=d_list[i]*math.exp(-1*alpha)
        else:
            d_list[i]=d_list[i]*math.exp(alpha)
    z=np.sum(d_list)
    for i in d_list:
        i=i/z
    return d_list

In [12]:
def boost(epochs,x_train,y_train,lr,w,b):
    size=x_train.shape[0]
    d_list=np.ones(size)/size
    alpha_list=[]
    weight_list=[]
    bias_list=[]
    for i in range(epochs):
        f_w,f_b=fit(20,x_train,y_train,lr,w,b)
        y_pred=predict(x_train,f_w,f_b)
        error=calculate_error(y_pred,y_train.values.tolist(),d_list)
        alpha=0.5*np.log((1-error)/error)
        alpha_list.append(alpha)
        d_list=update_d_list(d_list,alpha,y_pred,y_train)
        weight_list.append(f_w)
        bias_list.append(f_b)
    return alpha_list,weight_list,bias_list

In [13]:
def predict_boosting(alpha_list,weight_list,bias_list,x):
    y_pred=[]
    for i in range(x.shape[0]):
        pred=0
        for j in range(len(alpha_list)):
            pred+=alpha_list[j]*(np.dot(weight_list[j].transpose(),x.iloc[i])+bias_list[j])
        if pred<0:
            y_pred.append(-1)
        else:
            y_pred.append(1)
    return y_pred

In [14]:
length=int(x_train.shape[0] * 0.8)
new_x_train=x_train.head(length)
new_y_train=y_train.head(length)
x_val=x_train.tail(x_train.shape[0]-length)
y_val=y_train.tail(x_train.shape[0]-length)

In [15]:
learning_rates=[0.01,0.1,1]
epochs=20
val_accuracy={}
for lr in learning_rates:
    print(lr)
    w,b=get_w_b(new_x_train.shape[1])
    alpha_list,weight_list,bias_list=boost(epochs,new_x_train,new_y_train,lr,w,b)
    y_pred_val=predict_boosting(alpha_list,weight_list,bias_list,x_val)
    validation_accuracy=accuracy(y_pred_val,y_val.values.tolist())
    val_accuracy[lr]=validation_accuracy
print(val_accuracy)

0.01
0.1
1
{0.01: 0.8363892806770099, 0.1: 0.8293370944992948, 1: 0.8356840620592384}


In [16]:
best_lr = max(val_accuracy, key=lambda x: val_accuracy[x])
print("Best learning rate:", best_lr)
w,b=get_w_b(x_train.shape[1])
epochs=20
alpha_list,weight_list,bias_list=boost(epochs,x_train,y_train,best_lr,w,b)
y_pred=predict_boosting(alpha_list,weight_list,bias_list,x_train)
print("Train set accuracy using averaged perceptron:",round(accuracy(y_pred,y_train.values.tolist()),6))
y_pred=predict_boosting(alpha_list,weight_list,bias_list,x_test)
print("Test set accuracy using averaged perceptron:",round(accuracy(y_pred,y_test.values.tolist()),6))

Best learning rate: 0.01
Train set accuracy using averaged perceptron: 0.873448
Test set accuracy using averaged perceptron: 0.818302


In [17]:
eval_set1=pd.read_csv(f'data/roberta/roberta.eval.anon.csv')
eval_set2=pd.read_csv(f'data/tfidf/tfidf.eval.anon.csv')

eval_x_train1=eval_set1.iloc[:,:-1]
eval_x_train2=eval_set2.iloc[:,:-1]

for i in range(eval_x_train1.shape[1]):
    col = eval_x_train1.columns[i]
    eval_x_train1.rename(columns = {col:'roberta_'+col}, inplace = True)
    
for i in range(eval_x_train2.shape[1]):
    col = eval_x_train2.columns[i]
    eval_x_train2.rename(columns = {col:'tfidf_'+col}, inplace = True)
    
col = eval_x_train2[eval_x_train2.columns]
eval_x_train1 = eval_x_train1.join(col)

y_pred=predict_boosting(alpha_list,weight_list,bias_list,eval_x_train1)
for i in range(len(y_pred)):
    if y_pred[i]==-1:
        y_pred[i]=0
result=pd.DataFrame(y_pred,columns=['label'],dtype=int)
result.index.name='example_id'
result.to_csv('boosting_perceptron_roberta_tfidf_results.csv')