In [None]:
# Mount Google Drive at /content/drive; the data files read below live under it.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import numpy as np
import pandas as pd
import math 

In [None]:
# Training split: the "Tweet" and "Label" columns pulled out as NumPy arrays.
train_df = pd.read_csv("/content/drive/MyDrive/cs541/swad_train.csv")
train_text, train_labels = (
    train_df[column].to_numpy() for column in ("Tweet", "Label")
)

In [None]:
# Held-out split. The text and labels must be taken from test_df: reading them
# from train_df would make every downstream "test" metric a training metric.
test_df = pd.read_csv("/content/drive/MyDrive/cs541/swad_test.csv")
test_text = test_df["Tweet"].to_numpy()
test_labels = test_df["Label"].to_numpy()

In [None]:
def get_stopwords():
    """Return the set of lines found in the stop-word file on Drive.

    The file content is split on "\n" only, so a trailing newline in the file
    contributes an empty-string entry to the returned set.
    """
    with open('/content/drive/MyDrive/cs541/stopwords.txt') as handle:
        return set(handle.read().split("\n"))
# Load the stop-word set once; it is passed to pre_process() further down.
stop_words = get_stopwords()

In [None]:
def get_punctuation():
    """Return the set of lines found in the punctuation file on Drive.

    The file content is split on "\n" only, so a trailing newline in the file
    contributes an empty-string entry to the returned set.
    """
    with open("/content/drive/MyDrive/cs541/punctuations.txt") as handle:
        return set(handle.read().split("\n"))
# Load the punctuation set once; it is passed to pre_process() further down.
punctuations = get_punctuation()

In [None]:
def pre_process(corpus, stop_words, punctuations):
  """Remove stop words from each document and lower-case what remains.

  Every document is split on single spaces. Tokens found in ``punctuations``
  are kept verbatim with a " " entry inserted on each side; all other
  surviving tokens are lower-cased.

  Args:
    corpus: iterable of strings, one per document.
    stop_words: container of tokens to discard.
    punctuations: container of tokens to keep as-is (with padding).

  Returns:
    A list holding one space-joined string per input document. Because the
    padding entries are themselves joined with " ", a punctuation token ends
    up surrounded by runs of several spaces.
  """
  text = []
  for line in corpus:
    tokens = []
    for word in line.split(" "):
      lowered = word.lower()
      # Test both spellings: comparing only the raw token lets capitalised
      # stop words ("The", "IS") through, and they would then be lower-cased
      # straight into the output.
      if word in stop_words or lowered in stop_words:
        continue
      if word in punctuations:
        tokens.extend((" ", word, " "))
      else:
        tokens.append(lowered)
    text.append(" ".join(tokens))
  return text

In [None]:
# Apply the same stop-word / punctuation pre-processing to both splits.
train_corpus = pre_process(train_text, stop_words, punctuations)
test_corpus = pre_process(test_text, stop_words, punctuations)

In [None]:
def label_encoding(categorical_labels):
  """Map each label to an integer: "No" becomes 0, anything else becomes 1."""
  encoded = []
  for label in categorical_labels:
    if label == "No":
      encoded.append(0)
    else:
      encoded.append(1)
  return encoded

In [None]:
from numpy import matrix
import collections
import math
class MyVectorizer():
  """TF-IDF vectorizer that returns dense, L2-normalised rows (lists of floats).

  Documents are tokenised by splitting on single spaces and lower-casing;
  empty tokens (produced by consecutive spaces) are ignored. The weight of a
  term in a document is ``tf * idf`` where ``tf`` is the term's count in that
  document and ``idf = ln((1 + n_docs) / (1 + df)) + 1``. ``n_docs`` and the
  document frequencies ``df`` always come from the corpus that was passed to
  ``fit_transform``, also when vectorising new documents with ``transform``.
  """

  def __init__(self):
    # word -> column index; rebuilt on every fit_transform() call.
    self.unique_idx = {}
    # word -> number of fitted documents containing the word at least once.
    self.doc_freq = collections.defaultdict(int)
    # Vocabulary learned by fit_transform().
    self.unique_words = set()
    # Number of fitted documents; transform() needs it for the IDF term.
    self.n_docs = 0

  def _tokenize(self, line):
    """Split on single spaces, lower-case, and drop empty tokens."""
    return [word.lower() for word in line.split(" ") if word]

  def get_unique_words(self, corpus):
    """Return the set of distinct (lower-cased, non-empty) tokens in corpus."""
    return {word for line in corpus for word in self._tokenize(line)}

  def idf(self, doc_freq, word, no_docs):
    """Smoothed inverse document frequency of ``word``.

    Args:
      doc_freq: mapping word -> number of documents containing it.
      word: the term to score.
      no_docs: total number of documents the frequencies were counted over.

    Returns:
      ``ln((1 + no_docs) / (1 + doc_freq[word])) + 1``
    """
    idf_value = math.log((1 + no_docs) / (1 + doc_freq[word])) + 1
    return idf_value

  def _tfidf_rows(self, corpus):
    """Build L2-normalised TF-IDF rows for corpus using the fitted statistics."""
    width = len(self.unique_idx)
    rows = []
    for line in corpus:
      row = [0.0] * width
      # Words outside the fitted vocabulary have no column and are skipped.
      counts = collections.Counter(
          word for word in self._tokenize(line) if word in self.unique_idx)
      # One multiplication per distinct word, so the weight is tf * idf and
      # not tf * idf ** tf.
      for word, term_freq in counts.items():
        row[self.unique_idx[word]] = term_freq * self.idf(
            self.doc_freq, word, self.n_docs)
      rows.append(row)
    return self.l2_normalize(rows)

  def fit_transform(self, corpus):
    """Learn vocabulary and document frequencies from corpus, then vectorise it.

    Any state left over from a previous fit is discarded.

    Args:
      corpus: iterable of strings, one per document.

    Returns:
      A list of rows (one per document), each a list of floats with one entry
      per vocabulary word, scaled to unit L2 norm (all-zero rows stay zero).
    """
    corpus = list(corpus)
    self.unique_words = self.get_unique_words(corpus)
    # Sort so column order is reproducible; set iteration order is not.
    self.unique_idx = {
        word: idx for idx, word in enumerate(sorted(self.unique_words))}

    self.doc_freq = collections.defaultdict(int)
    for line in corpus:
      # set(): a document contributes at most 1 to a word's document frequency.
      for word in set(self._tokenize(line)):
        self.doc_freq[word] += 1
    self.n_docs = len(corpus)

    return self._tfidf_rows(corpus)

  def l2_normalize(self, vectors):
    """Scale every vector to unit Euclidean length.

    Args:
      vectors: sequence of numeric sequences.

    Returns:
      A new list of lists of the same shape. A vector whose norm is zero is
      returned as all zeros instead of triggering a division by zero.
    """
    vect_normalised = []
    for vec in vectors:
      vec_mag = math.sqrt(sum(x * x for x in vec))
      if vec_mag:
        vect_normalised.append([x / vec_mag for x in vec])
      else:
        vect_normalised.append([0.0 for _ in vec])
    return vect_normalised

  def transform(self, test_corpus):
    """Vectorise new documents with the vocabulary and IDF learned in fit_transform.

    Term frequencies are counted in each new document; document frequencies
    and the document count are the fitted ones. Unknown words are ignored.

    Args:
      test_corpus: iterable of strings, one per document.

    Returns:
      A list of L2-normalised rows in the same column layout as fit_transform.
    """
    return self._tfidf_rows(test_corpus)

  
      

In [None]:
# Fit the vectorizer on the training corpus and encode the training labels.
vectorizer = MyVectorizer()
x_train = vectorizer.fit_transform(train_corpus)
y_train = label_encoding(train_labels)
# Convert the Python lists to NumPy arrays (the model calls .shape and .dot on them).
X_train = np.array(x_train)
y_train = np.array(y_train)

In [None]:
# Vectorise the test corpus with the already-fitted vectorizer and encode its labels.
x_test = vectorizer.transform(test_corpus)
y_test = label_encoding(test_labels)
# Convert the Python lists to NumPy arrays, mirroring the training matrices.
X_test = np.array(x_test)
y_test = np.array(y_test)

In [None]:
class LR_Model2():
  """Binary logistic regression trained with full-batch gradient descent."""

  def __init__(self):
    # Placeholders; fit_train() replaces both with the learned parameters.
    self.W = 0
    self.b = 0

  def _sigmoid(self, X):
    """Return sigmoid(X . W + b) for the current parameters."""
    return 1 / (1 + np.exp(-(X.dot(self.W) + self.b)))

  def weight_updates(self, lr):
    """Run one gradient-descent step on the data stored by fit_train().

    Args:
      lr: learning rate applied to both the weight and the bias gradient.
    """
    probs = self._sigmoid(self.X)
    residual = np.reshape(probs - self.y.T, self.n_samples)
    dW = np.dot(self.X.T, residual) / self.n_samples
    # Both gradients are means over the samples; dividing the bias gradient by
    # the number of features would scale the bias step by n_samples / n_features.
    dB = np.sum(residual) / self.n_samples

    self.W = self.W - lr * dW
    self.b = self.b - lr * dB
    return

  def fit_train(self, X, y, epochs, lr):
    """Fit the model from zero-initialised parameters.

    Args:
      X: array of shape (n_samples, n_features).
      y: array of n_samples labels (0 or 1).
      epochs: number of full-batch gradient steps to run.
      lr: learning rate.
    """
    self.n_samples, self.n_features = X.shape
    self.W = np.zeros(self.n_features)
    self.b = 0
    self.X = X
    self.y = y

    for _ in range(epochs):
      self.weight_updates(lr)
    return

  def pred(self, X):
    """Return 1 where the predicted probability exceeds 0.5, otherwise 0."""
    Z = self._sigmoid(X)
    Yhat = np.where(Z > 0.5, 1, 0)
    return Yhat

In [None]:
# Train the logistic-regression model: 20 epochs, learning rate 0.01.
lr_model = LR_Model2()
lr_model.fit_train(X_train, y_train, 20, 0.01)

In [None]:
# Predict on the test data and report how many predictions match the labels.
y_pred = lr_model.pred(X_test)
corr_cnt = sum(1 for predicted, actual in zip(y_pred, y_test) if predicted == actual)
acc = (corr_cnt / len(y_test)) * 100
print(f"Correctly predicted: {corr_cnt}")
print(f"Accuracy is: {acc}")


Correctly predicted: 2307
Accuracy is: 64.27974366118696
