<a href="https://colab.research.google.com/github/Yussefayman/NLP/blob/main/BagOfWordsClassifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import random
import tqdm

In [2]:
def extract_features(x: str) -> dict[str,float]:
  """Build a bag-of-words feature vector for one piece of text.

  Args:
    x: Text whose tokens are separated by whitespace.

  Returns:
    A dict mapping each distinct token to the number of times it occurs
    in ``x``, stored as a float. Empty or all-whitespace text yields an
    empty dict.
  """
  features: dict[str, float] = {}
  # split() with no argument collapses runs of whitespace, so repeated or
  # trailing spaces never produce a spurious empty-string feature.
  for token in x.split():
    features[token] = features.get(token, 0.0) + 1.0
  return features

In [3]:
# Global weight table, keyed by feature name. run_classifier reads it and the
# training loop updates it in place; it starts out empty.
feature_weights: dict[str, float] = dict()

In [4]:
def read_data(filename: str) -> tuple[list[str],list[int]]:
  """Read a labelled text dataset with one ``label ||| text`` example per line.

  Args:
    filename: Path of the file to read (decoded as UTF-8).

  Returns:
    A pair ``(x_data, y_data)`` of parallel lists: the text of every example
    and its integer label.

  Raises:
    ValueError: If a non-blank line has no ``' ||| '`` separator or its label
      is not an integer.
  """
  x_data = []
  y_data = []
  # Explicit encoding so the result does not depend on the platform default.
  with open(filename, 'r', encoding='utf-8') as f:
    for line in f:
      line = line.strip()
      # Tolerate blank lines (e.g. a trailing newline at the end of the file).
      if not line:
        continue
      # Split on the first separator only, so text that itself contains
      # ' ||| ' stays intact instead of breaking the unpacking.
      label, text = line.split(' ||| ', 1)
      x_data.append(text)
      y_data.append(int(label))
  return x_data, y_data


In [6]:
# Load the training split and the development split as (texts, labels) pairs.
x_train, y_train = read_data(filename='data/train.txt')
x_dev, y_dev = read_data(filename='data/dev.txt')

In [8]:
# Show the first training example as a (text, label) pair.
(x_train[0], y_train[0])

("The Rock is destined to be the 21st Century 's new `` Conan '' and that he 's going to make a splash even greater than Arnold Schwarzenegger , Jean-Claud Van Damme or Steven Segal .",
 1)

In [11]:
def run_classifier(features: dict[str, float]) -> int:
  """Classify a feature vector by the sign of its weighted score.

  The score is the sum over all features of value * weight, where weights
  come from the global ``feature_weights`` table and a feature without an
  entry there counts as weight 0.

  Returns:
    1 if the score is positive, -1 if it is negative, 0 otherwise.
  """
  total = 0
  for name in features:
    total += features[name] * feature_weights.get(name, 0)
  # Sign of the score: the two comparisons are bools, so the difference
  # is the int 1, -1 or 0.
  return (total > 0) - (total < 0)

In [20]:
# Error-driven training: go through the training set `epochs` times and adjust
# the global feature_weights whenever the current prediction is wrong.
epochs = 5
for epoch in range(1,epochs+1):
  # Visit the examples in a fresh random order every epoch.
  data_ids = list(range(len(x_train)))
  random.shuffle(data_ids)
  for data_id in tqdm.tqdm(data_ids,desc=f'Epoch {epoch}'):
    x = x_train[data_id]
    y = y_train[data_id]
    # Examples labelled 0 are not used for weight updates.
    if y == 0:
      continue
    features = extract_features(x)
    predicted_y = run_classifier(features)
    if predicted_y != y:
      # Move the weight of every active feature towards the true label,
      # in proportion to the feature's value.
      for feature, feature_value in features.items():
        feature_weights[feature] = feature_weights.get(feature, 0) + y * feature_value

Epoch 1: 100%|██████████| 8544/8544 [00:00<00:00, 88562.35it/s]
Epoch 2: 100%|██████████| 8544/8544 [00:00<00:00, 88849.55it/s]
Epoch 3: 100%|██████████| 8544/8544 [00:00<00:00, 91001.77it/s]
Epoch 4: 100%|██████████| 8544/8544 [00:00<00:00, 90332.41it/s]
Epoch 5: 100%|██████████| 8544/8544 [00:00<00:00, 82882.64it/s]


In [21]:
def calculate_accuracy(x_data: list[str], y_data: list[int]) -> float:
    """Return the fraction of examples the classifier labels correctly.

    Each text is turned into features with ``extract_features`` and classified
    with ``run_classifier``; the prediction is compared with the true label.

    Args:
        x_data: Texts to classify.
        y_data: True labels, parallel to ``x_data``.

    Returns:
        Accuracy in the range [0, 1]. If there are no examples the result is
        0.0 rather than a ZeroDivisionError.
    """
    total_number = 0
    correct_number = 0
    for x, y in zip(x_data, y_data):
        total_number += 1
        if run_classifier(extract_features(x)) == y:
            correct_number += 1
    # Guard the division so an empty dataset does not crash the evaluation.
    if total_number == 0:
        return 0.0
    return correct_number / total_number

In [22]:
# Count how many dev examples carry each label, to see the class balance.
label_count = {}
for y in y_dev:
    label_count[y] = label_count.get(y, 0) + 1
print(label_count)

{1: 444, 0: 229, -1: 428}


In [23]:
# Accuracy on the data the weights were trained on, and on the held-out dev split.
train_accuracy = calculate_accuracy(x_data=x_train, y_data=y_train)
test_accuracy = calculate_accuracy(x_data=x_dev, y_data=y_dev)
print(f'Train accuracy: {train_accuracy}')
print(f'Dev/test accuracy: {test_accuracy}')

Train accuracy: 0.7846441947565543
Dev/test accuracy: 0.5894641235240691


In [24]:
def find_errors(x_data, y_data, num_examples=5):
    """Print a random sample of misclassified examples.

    Every text in ``x_data`` is classified with
    ``run_classifier(extract_features(x))``; examples whose prediction differs
    from the true label are errors.

    Args:
        x_data: Texts to classify.
        y_data: True labels, parallel to ``x_data``.
        num_examples: Maximum number of errors to print (default 5).

    Prints up to ``num_examples`` distinct errors, each with its text, true
    label and predicted label. Prints nothing if there are no errors.
    """
    error_ids = []
    y_preds = []
    for i, (x, y) in enumerate(zip(x_data, y_data)):
        y_preds.append(run_classifier(extract_features(x)))
        if y != y_preds[-1]:
            error_ids.append(i)
    # Sample without replacement so the same error is never shown twice, and
    # cap the sample size so few (or zero) errors cannot raise.
    sample_size = max(0, min(num_examples, len(error_ids)))
    for my_id in random.sample(error_ids, sample_size):
        x, y, y_pred = x_data[my_id], y_data[my_id], y_preds[my_id]
        print(f'{x}\ntrue label: {y}\npredicted label: {y_pred}\n')

In [25]:
# Inspect a few misclassified dev examples.
find_errors(x_data=x_dev, y_data=y_dev)


Sometimes it feels as if it might have been made in the '70s or '80s , and starred Chevy Chase and Goldie Hawn .
true label: 0
predicted label: -1

Yes , Ballistic is silly .
true label: 0
predicted label: -1

Exists then as an occasionally insightful acting exercise .
true label: 0
predicted label: -1

I 'm sure if you 're a Hartley fan , you might enjoy yourself ... Me , I did n't care for it .
true label: 0
predicted label: 1

It has the ability to offend and put off everyone , but it holds you with its outrageousness .
true label: 0
predicted label: 1

