## Data Preparation (for RoBERTa)

In [0]:
import csv

def _read_sentence_label_rows(csv_path, text_col=6, label_col=4):
  """Return ``[sentence, label]`` pairs read from the CSV file at ``csv_path``.

  The first record is treated as the header and skipped. A record is dropped
  when it has no ``text_col`` field (blank or truncated line) or when its
  sentence contains a line break (malformed example).

  Args:
    csv_path: Path of the comma-separated input file.
    text_col: Zero-based index of the field holding the sentence.
    label_col: Zero-based index of the field holding the label.

  Returns:
    A list of ``[sentence, label]`` lists, in file order.
  """
  rows = []
  # newline='' hands raw line endings to the csv module, as its docs require;
  # embedded line breaks may then show up as "\r" as well as "\n".
  with open(csv_path, 'r', newline='') as file:
    csv_reader = csv.reader(file, delimiter=',')
    # Skip the header before filtering so a real example is never dropped in its place.
    next(csv_reader, None)
    for row in csv_reader:
      if len(row) <= max(text_col, label_col):
        continue  # blank or truncated record
      sentence = row[text_col]
      if "\n" in sentence or "\r" in sentence:
        continue  # remove malformed example
      rows.append([sentence, row[label_col]])
  return rows


def _write_sentence_label_tsv(tsv_path, rows):
  """Write ``rows`` to ``tsv_path`` as tab-separated values under a ``sentence``/``label`` header."""
  # newline='' stops the platform from expanding the writer's "\r\n" terminator again.
  with open(tsv_path, 'w', newline='') as file:
    tsv_writer = csv.writer(file, delimiter='\t')
    tsv_writer.writerow(['sentence', 'label'])
    tsv_writer.writerows(rows)


train = _read_sentence_label_rows('train.csv')
_write_sentence_label_tsv('train.tsv', train)

dev = _read_sentence_label_rows('dev.csv')
_write_sentence_label_tsv('dev.tsv', dev)

## Data Preparation (for Decision Tree)

In [0]:
import csv

def _load_text_and_labels(csv_path, text_col=6, label_col=4):
  """Read the CSV file at ``csv_path`` into parallel lists of texts and labels.

  The first record is treated as the header and skipped. A record is dropped
  when it has no ``text_col`` field (blank or truncated line) or when its text
  contains a line break (malformed example).

  Args:
    csv_path: Path of the comma-separated input file.
    text_col: Zero-based index of the field holding the text.
    label_col: Zero-based index of the field holding the label.

  Returns:
    A ``(texts, labels)`` tuple of equally long lists, in file order.
  """
  texts = []
  labels = []
  # newline='' hands raw line endings to the csv module, as its docs require;
  # embedded line breaks may then show up as "\r" as well as "\n".
  with open(csv_path, 'r', newline='') as file:
    csv_reader = csv.reader(file, delimiter=',')
    # Skip the header before filtering so a real example is never dropped in its place.
    next(csv_reader, None)
    for row in csv_reader:
      if len(row) <= max(text_col, label_col):
        continue  # blank or truncated record
      text = row[text_col]
      if "\n" in text or "\r" in text:
        continue  # remove malformed example
      labels.append(row[label_col])
      texts.append(text)
  return texts, labels


train_text, train_label = _load_text_and_labels('train.csv')
dev_text, dev_label = _load_text_and_labels('dev.csv')

In [0]:
def preprocess_data(data):
    """Tokenize every string in ``data`` by splitting it on single spaces.

    Consecutive spaces produce empty-string tokens and an empty string yields
    ``[""]``, because the split uses an explicit ``" "`` separator.

    Args:
        data: Iterable of strings.

    Returns:
        A list with one token list per input string, in input order.
    """
    return [sentence.split(" ") for sentence in data]

# Tokenize the training and dev texts with the same space-splitting routine.
train_data, dev_data = map(preprocess_data, (train_text, dev_text))

In [0]:
import numpy as np

class Vectorizer():
    """Binary bag-of-words encoder restricted to the most frequent tokens.

    ``fit`` learns the vocabulary from tokenized lines; ``transform`` turns
    tokenized lines into a 0/1 matrix marking which vocabulary tokens occur
    in each line. ``fit`` must be called before ``transform``.

    Attributes are ``None`` until ``fit`` has run:
        vocab_list: Vocabulary tokens, most frequent first.
        token_to_index: Mapping from each vocabulary token to its column.
    """

    def __init__(self, max_features):
        """Store ``max_features``, the maximum number of tokens kept in the vocabulary."""
        self.max_features = max_features
        self.vocab_list = None
        self.token_to_index = None

    def fit(self, dataset):
        """Build the vocabulary from ``dataset``.

        Args:
            dataset: Iterable of lines, each an iterable of hashable tokens.
        """
        counts = {}
        for line in dataset:
            for word in line:
                counts[word] = counts.get(word, 0) + 1

        # sorted() is stable, so equally frequent tokens keep first-seen order.
        ranked = sorted(counts, key=counts.get, reverse=True)
        self.vocab_list = ranked[:self.max_features]
        self.token_to_index = {token: column for column, token in enumerate(self.vocab_list)}

    def transform(self, dataset):
        """Encode ``dataset`` as a binary presence matrix.

        Args:
            dataset: Sized collection of lines, each an iterable of tokens.

        Returns:
            A NumPy array of shape ``(len(dataset), len(self.vocab_list))``
            whose entry ``[i][j]`` is 1 when vocabulary token ``j`` occurs in
            line ``i`` and 0 otherwise.
        """
        data_matrix = np.zeros((len(dataset), len(self.vocab_list)))

        for i, line in enumerate(dataset):
            # Look each token up directly rather than scanning the line once
            # per vocabulary entry; tokens outside the vocabulary are ignored.
            for word in line:
                column = self.token_to_index.get(word)
                if column is not None:
                    data_matrix[i, column] = 1

        return data_matrix

In [0]:
# Upper bound on the vocabulary size, i.e. on the number of feature columns.
max_features = 2000

# Learn the vocabulary from the training tokens only, then encode both splits with it.
vectorizer = Vectorizer(max_features=max_features)
vectorizer.fit(train_data)
X_train, X_dev = (vectorizer.transform(split) for split in (train_data, dev_data))

# Labels become NumPy arrays so they can be passed alongside the feature matrices.
y_train, y_dev = (np.array(labels) for labels in (train_label, dev_label))

## Model Preparation

In [0]:
from sklearn.tree import DecisionTreeClassifier

# Fit a decision tree with a fixed random_state on the training features,
# then predict labels for the dev features.
clf = DecisionTreeClassifier(random_state=0).fit(X_train, y_train)
pred = clf.predict(X_dev)

In [37]:
def accuracy_score(y_true, y_pred):
    """Return the fraction of positions where ``y_pred`` equals ``y_true``.

    Args:
        y_true: Indexable, sized collection of reference labels.
        y_pred: Indexable, sized collection of predicted labels.

    Returns:
        The number of matching positions divided by ``len(y_true)``, as a float.

    Raises:
        ValueError: If the two collections differ in length.
        ZeroDivisionError: If both collections are empty.
    """
    total = len(y_true)
    # Reject mismatched inputs up front; otherwise surplus predictions would be
    # silently ignored and the score would look valid.
    if len(y_pred) != total:
        raise ValueError(
            "y_true and y_pred must have the same length, got "
            f"{total} and {len(y_pred)}"
        )

    correct = sum(1 for i in range(total) if y_true[i] == y_pred[i])
    return correct / total

# `pred` holds the predictions for X_dev, so score it against the dev labels.
print(accuracy_score(y_dev, pred))

0.8224369326724954
