<a href="https://colab.research.google.com/github/YagyanshB/SemEval-Task6-CS408/blob/main/Logistic_Regression_Baseline_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Logistic Regression :

---
For our exploratory venture, we have implemented Logistic Regression as one of our **Baseline Models**. The implementation has been provided below.

It is strongly recommended to follow the steps as outlined apart from when uploading files to your local environment or directory. Please ensure that the file paths have been named in the right manner.


# Importing Required Libraries:

---
For the purpose of proper execution, it is imperative to upload the right set of packages and modules.


In [1]:
import regex as re
import numpy as np
import pandas as pd
from numpy import array
from numpy import argmax
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
import pickle
from pickle import *

# Importing the OLID 2019 Dataset:

---



In [2]:
# We mount our Google Drive within Google Colab. Since I have already uploaded my files on my Google Drive
# this task becomes fairly convenient for myself. If running the program, please be sure to mount the 
# dataset on your google drive as well. 

from google.colab import drive
drive.mount('/content/drive')

train_file = 'drive/My Drive/olid-training-v1.new.tsv'

test_file_a = 'drive/My Drive/testset-levela.tsv' 
test_labels_a = 'drive/My Drive/labels-levela.csv' 

test_labels_b = 'drive/My Drive/labels-levelb.csv' 
test_file_b = 'drive/My Drive/testset-levelb.tsv' 

test_file_c = 'drive/My Drive/testset-levelc.tsv' 
test_labels_c = 'drive/My Drive/labels-levelc.csv' 

# Within the Code below, we re run the code to ensure that our files have been
# mounted in the right manner. It is imperative to have the link to the directory
# sent correctly; else the files won't be loaded.

Mounted at /content/drive


# Implementation of Sub-Task A:

---



In [3]:
def read_lines(file):
  # We want to create a list of sentence from our OLID 2019 dataset. Each sentence should be a string.

  f = open(file).read().strip()
  sentences = f.split('\n\n')
  return sentences

In [4]:
def split_lines(lines, column_names):
    for line in lines:
      rows = line.split('\n')
      line = [dict(zip(column_names, row.split('\t'))) for row in rows]
    return line

In [5]:
def read_lines2(file):
    f = open(file).read().strip()
    return f

In [6]:
def split_lines2(lines, column_names):
    rows = lines.split('\n')
    line = [dict(zip(column_names, row.split(','))) for row in rows]

    return line 

In [7]:
def one_hot_encode(list_of_tweets, token_index):
    max_length = len(token_index)
    results = np.zeros(shape = (len(list_of_tweets), max(token_index.values()) +1))

    for i, tweet in enumerate(list_of_tweets):
        for word in tweet.split():
            index = token_index.get(word)
            results[i, index] = 1.
    return results

In [8]:
def to_one_hot(labels, token_index):
    results = np.zeros((len(labels), max(token_index.values()) +1))
    for i, label in enumerate(labels):
      index = token_index.get(label)
      results[i, index] = 1.
    return results

In [9]:
train_file = 'drive/My Drive/olid-training-v1.new.tsv'

test_file = 'drive/My Drive/testset-levela.tsv' 
test_file_b = 'drive/My Drive/testset-levelb.tsv' 
test_file_c = 'drive/My Drive/testset-levelc.tsv' 

test_labels_a = 'drive/My Drive/labels-levela.csv' 
test_labels_b = 'drive/My Drive/labels-levelb.csv' 
test_labels_c = 'drive/My Drive/labels-levelc.csv' 

In [10]:
column_names = ['id', 'tweet', 'subtask_a', 'subtask_b', 'subtask_c']
column_names_test = ['id', 'tweet']

In [11]:
lines = read_lines(train_file)
formatted_corpus = split_lines(lines, column_names)
formatted_corpus = formatted_corpus[1:]

In [12]:
list_of_tweets = []
y_a = []
y_b = []
y_c = []

list_of_off_tweets = []
list_of_tin_tweets = []

In [13]:
label_index = {}

In [14]:
for tweet in formatted_corpus:
    sentence = tweet['tweet'].lower()
    list_of_tweets.append(sentence)
    if tweet['subtask_a'] == 'OFF':
        y_a.append(1)
        list_of_off_tweets.append(sentence)
        if tweet['subtask_b'] == 'UNT':
            y_b.append(0)
        elif tweet['subtask_b'] == 'TIN':
            y_b.append(1)
            list_of_tin_tweets.append(sentence)
            if tweet['subtask_c'] == 'GRP':
                y_c.append('GRP')
            elif tweet['subtask_c'] == 'IND':
                y_c.append('IND')
            elif tweet['subtask_c'] == 'OTH':
                y_c.append('OTH')
            if tweet['subtask_c'] not in label_index:
                label_index[tweet['subtask_c']] = len(label_index)
    else:
        y_a.append(0)

In [15]:
token_index = {}
for tweet in list_of_tweets:
    for word in tweet.split():
        if word not in token_index:
            token_index[word] = len(token_index)

In [16]:
lines = read_lines(test_file)
formatted_corpus_test_a = split_lines(lines, column_names_test)
formatted_corpus_test_a = formatted_corpus_test_a[1:]

In [17]:
lines = read_lines2(test_labels_a)
formatted_corpus_labels_a = split_lines2(lines,['id', 'label'])

In [18]:
list_of_tweets_test_a = []
y_true_a = []

In [19]:
for tweet in formatted_corpus_test_a:
    sentence = tweet['tweet'].lower()
    list_of_tweets_test_a.append(sentence)

In [20]:
for tweet in list_of_tweets_test_a:
    for word in tweet.split():
        if word not in token_index:
            token_index[word] = len(token_index)

In [21]:
for tweet in formatted_corpus_labels_a:
    if tweet['label'] == 'NOT':
        y_true_a.append(0)
    else:
        y_true_a.append(1)

In [22]:
X_a = one_hot_encode(list_of_tweets, token_index)
X_test_a = one_hot_encode(list_of_tweets_test_a, token_index)

In [23]:
classifier = LogisticRegression(penalty = 'l2', dual = True, solver = 'liblinear', verbose = 1)

In [24]:
model = classifier.fit(X_a, y_a)
y_pred_a = classifier.predict(X_test_a)

[LibLinear]

In [74]:
print("Classification Report for Sub-Task A:")
print(metrics.classification_report(y_true_a, y_pred_a, target_names = ['NOT', 'OFF'], digits = 4))

Classification Report for Sub-Task A:
              precision    recall  f1-score   support

         NOT     0.7972    0.9258    0.8567       620
         OFF     0.6714    0.3917    0.4947       240

    accuracy                         0.7767       860
   macro avg     0.7343    0.6587    0.6757       860
weighted avg     0.7621    0.7767    0.7557       860



In [26]:
print("Confusion Matrix for Subtask A:")
print(metrics.confusion_matrix(y_true_a, y_pred_a))

Confusion Matrix for Subtask A:
[[574  46]
 [146  94]]


In [27]:
!pip install PrettyTable



In [28]:
from prettytable import PrettyTable
x = PrettyTable()

x.field_names = ["Method", "Macro F1-Score", "Accuraccy"]
x.add_row(["Logistic Regression", "0.6757","0.7767"])

print("Results for Subtask A:")
print(x)

Results for Subtask A:
+---------------------+----------------+-----------+
|        Method       | Macro F1-Score | Accuraccy |
+---------------------+----------------+-----------+
| Logistic Regression |     0.6757     |   0.7767  |
+---------------------+----------------+-----------+


# Implementation of Sub-Task B:

---



In [29]:
lines = read_lines(test_file_b)
formatted_corpus_test_b = split_lines(lines, column_names_test)
formatted_corpus_test_b = formatted_corpus_test_b[1:]

In [30]:
lines = read_lines2(test_labels_b)
formatted_corpus_labels_b = split_lines2(lines, ['id', 'label'])

In [31]:
list_of_tweets_test_b = []
y_true_b = []

In [32]:
for tweet in formatted_corpus_test_b:
    sentence = tweet['tweet'].lower()
    list_of_tweets_test_b.append(sentence)

In [33]:
for tweet in list_of_tweets_test_b:
    for word in tweet.split():
        if word not in token_index:
           token_index[word] = len(token_index)

In [34]:
for tweet in formatted_corpus_labels_b:
    if tweet['label'] == 'UNT':
       y_true_b.append(0)
    elif tweet['label'] == 'TIN':
       y_true_b.append(1)

In [35]:
X_b = one_hot_encode(list_of_tweets, token_index)
X_test_b = one_hot_encode(list_of_tweets_test_b, token_index)

In [36]:
import pandas as pd

df = pd.read_csv('drive/My Drive/olid-training-v1.new.tsv', sep = '\t', nrows=30000)
df['subtask_b'].value_counts()

TIN    8299
UNT    4941
Name: subtask_b, dtype: int64

In [37]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [38]:
vectorizer = TfidfVectorizer(max_features=1000)
vectors = vectorizer.fit_transform(df.tweet)
words_df = pd.DataFrame(vectors.toarray(), columns=vectorizer.get_feature_names())
words_df.head()

Unnamed: 0,10,100,15,20,2020,2a,2nd,30,able,abortion,about,absolutely,abuse,account,accuser,act,actions,actual,actually,after,again,against,agenda,ago,agree,ain,all,allegations,allow,allowed,almost,alone,along,already,also,alt,always,am,amazing,amendment,...,year,years,yes,yet,you,young,your,yourself,zero,äç,äô,äôd,äôll,äôm,äôre,äôs,äôt,äôve,äù,ôäô,ôçô,üá,üèª,üèº,üëä,üëç,üëè,üí,üíï,üíú,üî,üò,üòä,üòå,üòç,üòé,üòò,üòü,üôè,üôñ
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.098538,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.238527,0.0,0.0,0.0,0.0,0.0,0.0,0.452324,0.0,0.0,0.616912,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.377564,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [39]:
X = words_df
y = df['subtask_b']

In [40]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression

In [41]:
%%time
# Create and train a logistic regression
logreg = LogisticRegression(C=1e9, solver='lbfgs', max_iter= 1000)
logreg.fit(X, y)

CPU times: user 9.97 s, sys: 2.15 s, total: 12.1 s
Wall time: 3.15 s


In [42]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y)

In [43]:
print("Training logistic regression")
logreg.fit(X_train, y_train)

Training logistic regression


LogisticRegression(C=1000000000.0, class_weight=None, dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=1000, multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [44]:
from sklearn.metrics import confusion_matrix

In [45]:
y_true = y_test
y_pred = logreg.predict(X_test)
matrix = confusion_matrix(y_true, y_pred)

label_names = pd.Series(['TIN', 'UNT'])
pd.DataFrame(matrix,
     columns='Predicted ' + label_names,
     index='Is ' + label_names)

Unnamed: 0,Predicted TIN,Predicted UNT
Is TIN,1630,450
Is UNT,892,338


In [46]:
y_true = y_test
y_pred = logreg.predict(X_test)
matrix = confusion_matrix(y_true, y_pred)

label_names = pd.Series(['TIN', 'UNT'])
pd.DataFrame(matrix,
     columns='Predicted ' + label_names,
     index='Is ' + label_names).div(matrix.sum(axis=1), axis=0)


Unnamed: 0,Predicted TIN,Predicted UNT
Is TIN,0.783654,0.216346
Is UNT,0.725203,0.274797


In [47]:
import sklearn.metrics as metrics

In [71]:
print("Classification Report for Sub-Task B:")
print(metrics.classification_report(y_true, y_pred))

Classification Report for Sub-Task B:
              precision    recall  f1-score   support

         TIN       0.65      0.78      0.71      2080
         UNT       0.43      0.27      0.33      1230

    accuracy                           0.59      3310
   macro avg       0.54      0.53      0.52      3310
weighted avg       0.57      0.59      0.57      3310



In [52]:
from prettytable import PrettyTable

y = PrettyTable()
y.field_names = ['Method', 'Macro F1-Score', 'Accuraccy']
y.add_row(["Logistic Regression", "0.52", "0.59"])

In [53]:
print('Results for Subtask B:')
print(y)

Results for Subtask B:
+---------------------+----------------+-----------+
|        Method       | Macro F1-Score | Accuraccy |
+---------------------+----------------+-----------+
| Logistic Regression |      0.52      |    0.59   |
+---------------------+----------------+-----------+


# Implementation of Sub-Task C:

---



In [54]:
lines = read_lines(test_file_c)
formatted_corpus_test_c = split_lines(lines, column_names_test)
formatted_corpus_test_c = formatted_corpus_test_c[1:]

In [55]:
lines = read_lines2(test_labels_c)
formatted_corpus_labels = split_lines2(lines,['id', 'label'])

list_of_tweets_test_c = []
y_true_c = []

In [56]:
for tweet in formatted_corpus_test_c:
    sentence = tweet['tweet'].lower()
    list_of_tweets_test_c.append(sentence)

In [57]:
for tweet in list_of_tweets_test_c:
    for word in tweet.split():
        if word not in token_index:
            token_index[word]= len(token_index)

In [59]:
for tweet in formatted_corpus_labels:
    if tweet['label']=='GRP':
        y_true_c.append('GRP')
    elif tweet['label']=='IND':
        y_true_c.append('IND')
    elif tweet['label']=='OTH':
        y_true_c.append('OTH')

In [60]:
one_hot_train_lables = to_one_hot(y_c, label_index)
one_hot_test_labels = to_one_hot(y_true_c, label_index)

In [61]:
X_c = one_hot_encode(list_of_tin_tweets, token_index)
X_test_c = one_hot_encode(list_of_tweets_test_c, token_index)

In [68]:
model = classifier.fit(X_c, y_c)
y_pred_c = classifier.predict(X_test_c)

[LibLinear]

In [67]:
print("Classification Report for Sub-Task C:")
print(metrics.classification_report(y_true_c, y_pred_c, digits = 4))

Classification Report for Sub-Task C:
              precision    recall  f1-score   support

         GRP     0.6533    0.6282    0.6405        78
         IND     0.6471    0.8800    0.7458       100
         OTH     0.5000    0.0286    0.0541        35

    accuracy                         0.6479       213
   macro avg     0.6001    0.5123    0.4801       213
weighted avg     0.6252    0.6479    0.5936       213



In [69]:
print("Confusion Matrix for Sub-Task C")
print(metrics.confusion_matrix(y_true_c, y_pred_c))

Confusion Matrix for Sub-Task C
[[49 28  1]
 [12 88  0]
 [14 20  1]]


In [73]:
from prettytable import PrettyTable

z = PrettyTable()
z.field_names = ['Method', 'Macro F1-Score', 'Accuraccy']
z.add_row(["Logistic Regression", "0.48", "0.64"])

print(z)

+---------------------+----------------+-----------+
|        Method       | Macro F1-Score | Accuraccy |
+---------------------+----------------+-----------+
| Logistic Regression |      0.48      |    0.64   |
+---------------------+----------------+-----------+
