<a href="https://colab.research.google.com/github/Yussefayman/NLP/blob/main/RuleBasedSentimentClassifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
def extract_features(x:str) -> dict[str,float]:
  features = {}
  x_split = x.split(' ')
  # Count the number of 'Good Words' and 'Bad Words' in a text
  good_words = ['love','good','nice','great','enjoy','enjoyed']
  bad_words = ['hate','bad','terrible','disappointing','sad','lost']
  for x_word in x_split:
    if x_word in good_words:
      features['good_word_count'] = features.get('good_word_count',0) + 1
    if x_word in bad_words:
      features['bad_word_count'] = features.get('bad_word_count',0) + 1

  features['bias'] = 1
  return features

feature_weights = {'good_word_count': 1.0, 'bad_word_count': -1.0, 'bias': 0.5}


In [3]:
def read_data(filename:str) ->tuple[list[str],list[int]]:
  x_data = []
  y_data = []
  with open(filename,'r') as f:
    for line in f:
      label, text = line.strip().split(' ||| ')
      x_data.append(text)
      y_data.append(int(label))

  return x_data, y_data

In [6]:
x_train, y_train = read_data('data/train.txt')
x_test, y_test = read_data('data/dev.txt')

In [8]:
x_train[0],y_train[0]

("The Rock is destined to be the 21st Century 's new `` Conan '' and that he 's going to make a splash even greater than Arnold Schwarzenegger , Jean-Claud Van Damme or Steven Segal .",
 1)

In [9]:
def run_classifier(x: str) -> int:
  score = 0
  for feat_name, feat_value in extract_features(x).items():
    score = score + feat_value * feature_weights.get(feat_name,0)
  if score > 0:
    return 1
  elif score < 0:
    return -1
  else:
    return 0

In [10]:
def calculate_accuracy(x_data: list[str], y_data: list[int]) -> float:
  total_number = 0
  correct_number = 0
  for x,y in zip(x_data,y_data):
    y_pred = run_classifier(x)
    total_number+=1
    if y == y_pred:
      correct_number +=1
  return correct_number / float(total_number)

In [11]:
label_count = {}
for y in y_test:
  if y not in label_count:
    label_count[y] = 0
  label_count[y]+=1
print(label_count)

{1: 444, 0: 229, -1: 428}


In [12]:
train_accuracy = calculate_accuracy(x_train, y_train)
test_accuracy = calculate_accuracy(x_test, y_test)
print(f'Train accuracy: {train_accuracy}')
print(f'Dev/test accuracy: {test_accuracy}')

Train accuracy: 0.43433988764044945
Dev/test accuracy: 0.4223433242506812


In [13]:
import random
def find_errors(x_data,y_data):
  error_ids = []
  y_preds = []
  for i, (x,y) in enumerate(zip(x_data,y_data)):
    y_preds.append(run_classifier(x))
    if y!=y_preds[-1]:
      error_ids.append(i)
  for _ in range(5):
    my_id = random.choice(error_ids)
    x,y,y_pred = x_data[my_id], y_data[my_id], y_preds[my_id]
    print(f'{x}\ntrue label: {y}\npredicted label: {y_pred}\n')


In [14]:
find_errors(x_train, y_train)


Richard Pryor mined his personal horrors and came up with a treasure chest of material , but Lawrence gives us mostly fool 's gold .
true label: -1
predicted label: 1

I liked the original short story but this movie , even at an hour and twenty-some minutes , it 's too long and it goes nowhere .
true label: -1
predicted label: 1

An admitted egomaniac , Evans is no Hollywood villain , and yet this grating showcase almost makes you wish he 'd gone the way of Don Simpson .
true label: -1
predicted label: 1

The whole affair is as predictable as can be .
true label: 0
predicted label: 1

Watching Harris ham it up while physically and emotionally disintegrating over the course of the movie has a certain poignancy in light of his recent death , but Boyd 's film offers little else of consequence .
true label: 0
predicted label: 1

