# Install and import libraries

In [1]:
!pip install --upgrade nltk



In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import re
import string

# Load Data

In [2]:
data_df = pd.read_json('/content/drive/MyDrive/NLP/CA2_Codes/sarcasm.json', lines=True)

In [3]:
data_df.head()

Unnamed: 0,is_sarcastic,headline,article_link
0,1,thirtysomething scientists unveil doomsday clo...,https://www.theonion.com/thirtysomething-scien...
1,0,dem rep. totally nails why congress is falling...,https://www.huffingtonpost.com/entry/donna-edw...
2,0,eat your veggies: 9 deliciously different recipes,https://www.huffingtonpost.com/entry/eat-your-...
3,1,inclement weather prevents liar from getting t...,https://local.theonion.com/inclement-weather-p...
4,1,mother comes pretty close to using word 'strea...,https://www.theonion.com/mother-comes-pretty-c...


In [4]:
data_df['is_sarcastic'].value_counts()

0    14985
1    13634
Name: is_sarcastic, dtype: int64

# Load Glove

In [6]:
!wget https://nlp.stanford.edu/data/glove.6B.zip

--2024-04-02 23:16:57--  https://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2024-04-02 23:16:57--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


2024-04-02 23:19:37 (5.16 MB/s) - ‘glove.6B.zip’ saved [862182613/862182613]



In [7]:
!unzip glove.6B.zip

Archive:  glove.6B.zip
  inflating: glove.6B.50d.txt        
  inflating: glove.6B.100d.txt       
  inflating: glove.6B.200d.txt       
  inflating: glove.6B.300d.txt       


In [5]:
Dim = 200

In [6]:
words = dict()
filename = f'glove.6B.{Dim}d.txt'

with open(filename, 'r') as f:
  for line in f.readlines():
    line = line.split(' ')

    try:
      words[line[0]] = np.array(line[1:], dtype=float)
    except:
      continue

In [7]:
len(words)

400000

# Preprocess Data and Create Embedding

In [8]:
import nltk

nltk.download('wordnet')
nltk.download('punkt')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [9]:
X, y = data_df.headline.to_list(), data_df.is_sarcastic.to_numpy()

In [10]:
def preprocessing(text):
    # make text lowercase
    text = str(text).lower()

    # remove punctuations
    english_punctuations = string.punctuation
    translator = str.maketrans('', '', english_punctuations)
    return text.translate(translator)

In [11]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

def message_to_token_list(s):
  tokens = word_tokenize(s)
  lemmatized_tokens = [lemmatizer.lemmatize(t) for t in tokens]
  useful_tokens = [t for t in lemmatized_tokens if t in words]

  return useful_tokens

In [12]:
# create embedding matrix
def message_to_word_vectors(message, word_dict=words):
  tokens = message_to_token_list(message)

  vectors = []

  for token in tokens:
    if token not in word_dict:
      continue

    vectors.append(word_dict[token])

  return np.array(vectors, dtype=float)

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [14]:
X_train = [preprocessing(text) for text in X_train]
X_train = [message_to_word_vectors(text) for text in X_train]

X_test = [preprocessing(text) for text in X_test]
X_test = [message_to_word_vectors(text) for text in X_test]

In [15]:
from copy import deepcopy

# add apdding where is needed
def pad_X(X, desired_sequence_length=50):
  X_copy = deepcopy(X)

  for i, x in enumerate(X):
    x_seq_len = x.shape[0]
    if x_seq_len>desired_sequence_length:
        X_copy[i] = x[:desired_sequence_length]
        continue

    sequence_length_difference = desired_sequence_length - x_seq_len

    pad = np.zeros(shape=(sequence_length_difference, Dim))

    if x_seq_len != 0:
      X_copy[i] = np.concatenate([x, pad])
    else:
      X_copy[i] = pad

  return np.array(X_copy).astype(float)

In [17]:
length = 50
X_train = pad_X(X_train, length)
X_test = pad_X(X_test)

X_train.shape

(22895, 50, 200)

In [19]:
d2_X_train = X_train.reshape((X_train.shape[0],length*Dim))
d2_X_test = X_test.reshape((X_test.shape[0],length*Dim))

# Create and Train Model

In [20]:
clf = LogisticRegression(random_state=0).fit(d2_X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [21]:
y_pred = clf.predict(d2_X_test)

In [22]:
from sklearn.metrics import classification_report

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.78      0.78      0.78      2995
           1       0.76      0.76      0.76      2729

    accuracy                           0.77      5724
   macro avg       0.77      0.77      0.77      5724
weighted avg       0.77      0.77      0.77      5724

