In [1]:
from google.colab import drive
import os
import pandas as pd
import numpy as np

#Load the dataset
drive.mount('/content/drive',force_remount=True)
file_path = "/content/drive/My Drive/Problem_Statements/datasets/alphabets_28x28.csv"
# Function to convert label column
def convert_label(x):
    if isinstance(x, str) and len(x) == 1 and x.isalpha():
        return ord(x) - ord('A')
    else:
        return np.nan
# Function to convert other columns
def convert_other_columns(x):
    try:
        return int(x)
    except ValueError:
        return np.nan
# Read the CSV file into a DataFrame
df = pd.read_csv(file_path, converters={
    'label': convert_label,
    **{col: convert_other_columns for col in pd.read_csv(file_path, nrows=1).columns if col != 'label'}
})
#print(df.head())
print(df.shape)
# Remove unwanted rows
df = df.dropna()
# Remove duplicate rows based on all columns
df = df.drop_duplicates()
print(df.shape)

Mounted at /content/drive
(372451, 785)
(201019, 785)


In [2]:
# Spilting of data into training, testing and validation dataset (equally distributed)
num_labels = 26
# Splitting ratios (adjust as needed)
train_ratio = 0.8
val_ratio = 0.1  # Remaining 0.1 goes to test set automatically
# Initialize empty lists for train, test, and val sets
train_indices = []
val_indices = []
# Split data by label and then split each label's data
for label in range(num_labels):
    label_df = df[df['label'] == label]
    # Randomly select indices for training
    train_idx = np.random.choice(label_df.index, size=int(0.8*len(label_df)), replace=False)
    train_indices.extend(train_idx)
    # Remove selected indices from label_df
    label_df = label_df.drop(train_idx)
    # Randomly select indices for validation
    val_idx = np.random.choice(label_df.index, size=int(0.5*len(label_df)), replace=False)
    val_indices.extend(val_idx)
    # Remaining indices go to test automatically
# Create train, test, val DataFrames
train_df = df.loc[train_indices]
val_df = df.loc[val_indices]
test_df = df.drop(train_indices).drop(val_indices)
print(train_df.shape,test_df.shape,val_df.shape)

(160807, 785) (20112, 785) (20100, 785)


In [3]:
# Spilting the dataset into input and output labels
X_train = train_df.drop('label',axis=1)
y_train = train_df['label']
X_test = test_df.drop('label',axis=1)
y_test = test_df['label']
X_val = val_df.drop('label',axis=1)
y_val = val_df['label']
print(X_train.shape,y_train.shape,X_test.shape,y_test.shape,X_val.shape,y_val.shape)

(160807, 784) (160807,) (20112, 784) (20112,) (20100, 784) (20100,)


In [4]:
import tensorflow as tf
X_train = tf.keras.utils.normalize(X_train,axis=1)
X_test = tf.keras.utils.normalize(X_test,axis=1)
X_val = tf.keras.utils.normalize(X_val,axis=1)
print(set(y_train),len(set(y_train)))
print(X_train.shape,y_train.shape,X_test.shape,y_test.shape,X_val.shape,y_val.shape)

{0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0} 26
(160807, 784) (160807,) (20112, 784) (20112,) (20100, 784) (20100,)


In [5]:
model = tf.keras.models.Sequential()
# no need for flattening as it is already flattened
model.add(tf.keras.layers.Dense(512,activation='relu',input_shape = (784,)))
model.add(tf.keras.layers.Dropout(0.5))
model.add(tf.keras.layers.Dense(512,activation='relu'))
model.add(tf.keras.layers.Dropout(0.5))
model.add(tf.keras.layers.Dense(26,activation='softmax'))
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
# Adding early stopping with patience
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',  # Monitors the validation loss
    patience=5,  # Number of epochs with no improvement after which training will be stopped
    restore_best_weights=True  # Restore model weights from the epoch with the best value of the monitored quantity
)
# Training the model with early stopping
model.fit(X_train, y_train, epochs=10, validation_data=(X_val, y_val), callbacks=[early_stopping])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7c915447ab60>

In [6]:
#Evaluating the model
test_loss, test_accuracy = model.evaluate(X_test, y_test)
print("Test Loss:", test_loss)
print("Test Accuracy:", test_accuracy)

Test Loss: 0.09116826206445694
Test Accuracy: 0.9736475944519043


In [7]:
import matplotlib.pyplot as plt
# Functions to display image if required
def display(im_data):
  dpi = 80
  height, width = im_data.shape
  figsize = width / float(dpi), height / float(dpi)
  fig = plt.figure(figsize=figsize)
  ax = fig.add_axes([0, 0, 1, 1])
  ax.axis('off')
  ax.imshow(im_data, cmap='gray')
  plt.show()
def display2(im_data):
  dpi = 80
  height, width, depth = im_data.shape
  figsize = width / float(dpi), height / float(dpi)
  fig = plt.figure(figsize=figsize)
  ax = fig.add_axes([0, 0, 1, 1])
  ax.axis('off')
  ax.imshow(im_data, cmap='gray')
  plt.show()

In [8]:
import cv2 as cv
# Pre-processing the image as required by the dataset
def preprocess_image(image_path):
    img = cv.imread(image_path)
    #print(image_path)
    #display2(img)
    #print(img.shape)
    img = cv.cvtColor(img, cv.COLOR_BGR2GRAY)
    #print(img.shape)
    #display(img)
    thresh, im_bw = cv.threshold(img, 0, 255, cv.THRESH_BINARY | cv.THRESH_OTSU)
    #print(im_bw.shape)
    #display(im_bw)
    return im_bw

In [9]:
# Function to define if the contour is valid or not
def is_valid_contour(contour):
    x, y, w, h = cv.boundingRect(contour)
    aspect_ratio = w / float(h)
    if w < 7 or h < 7:  # Minimum size
        return False
    if w > 100 or h > 100:  # Maximum size
        return False
    return True
# Function which find valid contours in processed image and sorts it line wise
def get_contours_line_wise(im_bw):
  contours, _ = cv.findContours(im_bw, cv.RETR_EXTERNAL, cv.CHAIN_APPROX_SIMPLE)
  # Filter out useless contours
  contours = [c for c in contours if is_valid_contour(c)]
  # Sort contours by their bounding rectangle's top-left corner coordinates (x, y)
  contours = sorted(contours, key=lambda c: (cv.boundingRect(c)[1], cv.boundingRect(c)[0]))
  # Group contours into lines based on y-coordinate with a threshold
  threshold = 10  # Set your threshold value
  lines = []
  current_line = []
  previous_y = -1
  for contour in contours:
      x, y, w, h = cv.boundingRect(contour)
      if previous_y == -1:
          previous_y = y
      # If the current contour is not on the same line as the previous one
      if abs(y - previous_y) > threshold:
          lines.append(current_line)
          current_line = []
      current_line.append(contour)
      previous_y = y
      previous_x = x
  # Don't forget to add the last line
  if current_line:
      lines.append(current_line)
  #print(len(lines))
  #img = cv.imread(image_path)
  #contour_img = cv.drawContours(img.copy(), contours, -1, (0, 255, 0), 2)
  #display2(contour_img)
  return lines

In [10]:
# Function to get letter-wise images from line-wise contours
def get_letters(lines):
  letters = []
  th_x = 25;
  max_y = 0;
  space_marker = np.zeros((28, 28))  # Placeholder for space
  for line in lines:
      previous_x = 0
      sorted_line = sorted(line, key=lambda c: cv.boundingRect(c)[0])
      for contour in sorted_line:
          x, y, w, h = cv.boundingRect(contour)
          if (x - previous_x) > th_x:
              letters.append(space_marker)
          # Get the bounding rectangle of the contour.
          previous_x = x + w  # Update previous_x to the end of the current bounding box
          letter = im_bw[y:y + h, x:x + w]
          letter = cv.copyMakeBorder(letter, 5, 5, 5, 5, cv.BORDER_CONSTANT, value=0)
          # Resize the letter to a standard size.
          letter = cv.resize(letter, (28, 28))
          # Add the letter to the list of identified letters.
          letters.append(letter)
      if (max_y - previous_x) > th_x:
          letters.append(space_marker)
      max_y = max(max_y, previous_x)
  #fig = plt.figure(figsize=(120, 1))
  #for i, letter in enumerate(letters):
    #plt.subplot(1, len(letters), i + 1)
    #plt.imshow(letter, cmap='gray')
    #plt.axis('off')
  #plt.show()
  #print(len(letters))
  return letters

In [11]:
# Function to predict the sentence letter by letter using list of letter images
def create_sentence(letters):
  sentence = ""
  # Process each letter and predict using the model
  for letter in letters:
      if np.all(letter == 0):  # Check if it's a space marker
          sentence += " "
          continue
      # Normalize the letter
      letter = tf.keras.utils.normalize(letter, axis=1)
      # Reshape the letter for the model prediction
      ch = model.predict(letter.reshape(1, 784))  # Ensure the shape is correct for the model
      # Get the predicted character
      c = np.argmax(ch)
      sentence += chr(c + ord('A'))
  return sentence

In [12]:
#Loading the lines images
dir_path = "/content/drive/My Drive/Problem_Statements/datasets/target_images"
lines = os.listdir(dir_path)
sentences = {}
for line in lines:
    image_path = dir_path + "/" + line
    im_bw = preprocess_image(image_path)
    lines = get_contours_line_wise(im_bw)
    letters = get_letters(lines)
    sentence = create_sentence(letters)
    #print(sentence)
    sentences[line] = sentence



In [13]:
# Predicted Sentences from the model
for line in sentences.keys():
  print(line,sentences[line])

line_5.png YOUR ANALYSIS OF THE DDTA WRS ACCURATE AND UELL PRESENTED PROVIOING A CCEAR UNOERSTANDING OF TNE TRENDS ANO PATTERNS 
line_3.png E AM DELIGNTEO BY YOUR FRIENDLINESS ANO YOU ALWAYS MAKE EVERYONE FEEL WELCONE WHICH FOSTECS O SENSE OF COAQUNITY 
line_6.png TNE MEETING MINUTES YOU PREPARED UORE DETAILED AND WELL ORGANIZED ACCURATELY REFLEC ING THE DISCUSSZONS AND DECISIONS MADE 
line_1.png I AM REALLY ANNOYED BY YOUP CONSTANT COMPLAINING AAD YOU NEVER OFEER AN SOCUTIONS WHICH IS VERY UNHECPFUL AND NEGATSVS 
line_2.png IT IS ERUSTRATEOS TNAT YOU NEVER PAY ATTENTION DURING JISCUSSIONS ANO YOUR LACK OF FOCUS IS RLOLLY AFFECTING OUR PROGRESS 
line_4.png IT SS UONDERFUL TNAT YOU ALWEYS SHOW KZNDNESS ANO YOUR EMPATHY TOWARDS OTHECS IS TRULY HEARTWARMINS AND APPRECIATED 


In [14]:
#Loading the lines of Naive-Bayes Method
dir_path = "/content/drive/My Drive/Problem_Statements/datasets/sentiment_analysis_dataset.csv"
df_lines = pd.read_csv(dir_path)
print(len(df_lines))

30


In [15]:
N_lines = len(df_lines)
C = list(set(df_lines['sentiment'].tolist()))
prior = {}
print(C)
for c in C:
  df_sent = df_lines[df_lines['sentiment'] == c]
  prior[c] = len(df_sent)/N_lines
print(prior)

['Neutral', 'Angry', 'Happy']
{'Neutral': 0.3333333333333333, 'Angry': 0.3333333333333333, 'Happy': 0.3333333333333333}


In [16]:
lines = df_lines['line'].tolist()
sentiments = df_lines['sentiment'].tolist()
l = set()
d = {}
neg = ['NO','NEVER','NOT'] #negative words
for (line,sentiment) in zip(lines,sentiments):
  n = False # is negative word occured till now
  words = line.split()
  for word in words:
    if n:
      word = "NOT_" + word
    if word in neg:
      n = not n
    l.add(word)
    if word in d:
      d[word][sentiment] += 1
    else:
      d[word] = {'Angry':0,'Neutral':0,'Happy':0}
      d[word][sentiment] += 1
print(len(l))
print(l)
token = pd.DataFrame.from_dict(d, orient='index')
# Display the DataFrame
print(token)

308
{'COMMUNICATION', 'PRESSURE', 'ADMIRABLE', 'FRESH', 'HURTFUL', 'KEY', 'USEFUL', 'WHICH', 'UNNECESSARY', 'HARD', 'NOT_YOUR', 'THROUGH', 'NOT_OF', 'COMMITMENTS', 'STAY', 'PEOPLE', 'LOT', 'EVERYONE', 'ARRIVE', 'NOT_INVOLVED', 'EVENT', 'WORKING', 'NOT_IT', 'PRESENTATION', 'NOT_EFFORT', 'PREPARED', 'NOT_ON', 'WELLWRITTEN', 'ACCORDING', 'UNDER', 'NOT_CONVERSATION', 'CANNOT', 'NOT_THROUGH', 'NOT_THE', 'TRULY', 'MESSAGES', 'ANGRY', 'THOROUGHNESS', 'YOU', 'DIFFERENCE', 'WITH', 'LISTEN', 'APPRECIATION', 'CLEAR', 'BELIEVE', 'ABILITY', 'INFURIATES', 'UPLIFTING', 'NOT_AND', 'FRUSTRATED', 'ATTENTION', 'NOT_RESPOND', 'FOLLOWED', 'THE', 'OTHERS', 'MAKE', 'NOT_LACK', 'FOCUSED', 'WHILE', 'NOT_WHOLE', 'CAUSING', 'NOT_YOU', 'TABLE', 'ATTITUDE', 'NOT_RESPECT', 'EMAIL', 'EASIER', 'MAJOR', 'ATTENDED', 'NOT_OTHERS', 'SUCCESS', 'YESTERDAY', 'NOT_LOT', 'VALUABLE', 'COVERING', 'WORK', 'NOT_VERY', 'NOT_PREPARE', 'WHEN', 'NOT_MISTAKES', 'THOROUGH', 'EVERYTHING', 'EXCUSES', 'MAKES', 'PRESENTING', 'TO', 'GREAT',

In [17]:
den_likelihood = {'Angry':0,'Neutral':0,'Happy':0}
for word in l:
  for c in C:
    den_likelihood[c] += d[word][c]+1
print(den_likelihood)

{'Angry': 538, 'Neutral': 490, 'Happy': 509}


In [18]:
def predict(line_name):
  pred_line = sentences[line_name]
  #print(pred_line)
  pred_words = pred_line.split()
  n = False
  words_used = set()
  for word in pred_words:
    if n:
      word = "NOT_" + word
    if word in neg:
      n = not n
    if word in d:
      words_used.add(word)
  #print(words_used)
  ans = []
  for c in C:
    sum = prior[c]
    for word in words_used:
      if word in l:
        sum *= (d[word][c]+1)/den_likelihood[c]
    ans.append(sum)
  ans = np.array(ans)
  #print(line_name, ans)
  #print(C[ans.argmax()])
  return C[ans.argmax()]

In [19]:
#Loading the actual sentiment of the lines
dir_path = "/content/drive/My Drive/Problem_Statements/datasets/target_labels.csv"
df_sent = pd.read_csv(dir_path)
print(df_sent)
df_sent.insert(2,"predicted_sentiment",[predict(line_name) for line_name in df_sent['file']])
print(df_sent)

         file sentiment
0  line_1.png     Angry
1  line_2.png     Angry
2  line_3.png     Happy
3  line_4.png     Happy
4  line_5.png   Neutral
5  line_6.png   Neutral
         file sentiment predicted_sentiment
0  line_1.png     Angry               Angry
1  line_2.png     Angry               Angry
2  line_3.png     Happy               Happy
3  line_4.png     Happy               Happy
4  line_5.png   Neutral             Neutral
5  line_6.png   Neutral             Neutral


In [20]:
accuracy = (df_sent['sentiment'] == df_sent['predicted_sentiment']).sum()/len(df_sent)
print(accuracy*100,"%")

100.0 %


Even though the predicted lines are not that accurate, Naive Bayes Method is able to predict the sentiment of the line correctly.

* Tried adding a token of NOT_ at the beginning of the word after an occurrence of negative word in the sentence.
* We should stop adding NOT_ if we encounter any punctuation or another negative word.
* Since it is a single line, we stop adding only when another negative word encountered.

Adding this token didn't affect the final results in this case but this will be a good practice. I believe if any other dataset was given, the result may vary according to addition of token. Since, it will separate the negative meaning positive words and it will totally deny the existence of that positive word in the sentence.
