In [None]:
# pandas to open data files & processing it.
import pandas as pd
# to see all columns
pd.set_option('display.max_columns', None)
# To see whole text
pd.set_option('max_colwidth', -1)

# numpy for numeric data processing
import numpy as np

# keras for deep learning model creation
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, Flatten, Bidirectional, GlobalMaxPool1D
from keras.models import Model
from keras.utils import plot_model

# to fix random seeds
import random
import tensorflow as tf
import torch
import os

# Regular Expression for text cleaning
import re

# to track the progress - progress bar
from tqdm.notebook import tqdm

In [None]:
df = pd.read_csv('file:///Users/DeLaLuna/Downloads/train-balanced-sarcasm.csv')


In [None]:
df.drop(['author', 'subreddit', 'score', 'ups', 'downs', 'date', 'created_utc', 'parent_comment'], axis=1, inplace=True)

df.dropna(inplace=True)
df.head()

In [6]:

label_counts = df['label'].value_counts()
print(label_counts)


0    505405
1    505368
Name: label, dtype: int64


In [7]:
df.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 1010773 entries, 0 to 1010825
Data columns (total 2 columns):
 #   Column   Non-Null Count    Dtype 
---  ------   --------------    ----- 
 0   label    1010773 non-null  int64 
 1   comment  1010773 non-null  object
dtypes: int64(1), object(1)
memory usage: 23.1+ MB


In [8]:
mispell_dict = {"ain't": "is not", "cannot": "can not", "aren't": "are not", "can't": "can not", "'cause": "because", "could've": "could have", "couldn't": "could not", "didn't": "did not",
                "doesn't": "does not",
                "don't": "do not", "hadn't": "had not", "hasn't": "has not", "haven't": "have not", "he'd": "he would", "he'll": "he will", "he's": "he is", "how'd": "how did",
                "how'd'y": "how do you", "how'll": "how will", "how's": "how is", "I'd": "I would", "I'd've": "I would have", "I'll": "I will", "I'll've": "I will have", "I'm": "I am",
                "I've": "I have", "i'd": "i would", "i'd've": "i would have", "i'll": "i will", "i'll've": "i will have", "i'm": "i am", "i've": "i have", "isn't": "is not", "it'd": "it would",
                "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have", "it's": "it is", "let's": "let us", "ma'am": "madam", "mayn't": "may not", "might've": "might have",
                "mightn't": "might not", "mightn't've": "might not have", "must've": "must have", "mustn't": "must not", "mustn't've": "must not have", "needn't": "need not",
                "needn't've": "need not have", "o'clock": "of the clock", "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not", "sha'n't": "shall not",
                "shan't've": "shall not have", "she'd": "she would", "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have", "she's": "she is",
                "should've": "should have", "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have", "so's": "so as", "this's": "this is", "that'd": "that would",
                "that'd've": "that would have", "that's": "that is", "there'd": "there would", "there'd've": "there would have", "there's": "there is", "here's": "here is", "they'd": "they would",
                "they'd've": "they would have", "they'll": "they will", "they'll've": "they will have", "they're": "they are", "they've": "they have", "to've": "to have", "wasn't": "was not",
                "we'd": "we would", "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", "we're": "we are", "we've": "we have", "weren't": "were not",
                "what'll": "what will", "what'll've": "what will have", "what're": "what are", "what's": "what is", "what've": "what have", "when's": "when is", "when've": "when have",
                "where'd": "where did", "where's": "where is", "where've": "where have", "who'll": "who will", "who'll've": "who will have", "who's": "who is", "who've": "who have",
                "why's": "why is", "why've": "why have", "will've": "will have", "won't": "will not", "wont": "will not", "won't've": "will not have", "would've": "would have",
                "wouldn't": "would not",
                "wouldn't've": "would not have", "y'all": "you all", "y'all'd": "you all would", "y'all'd've": "you all would have", "y'all're": "you all are", "y'all've": "you all have",
                "you'd": "you would", "you'd've": "you would have", "you'll": "you will", "you'll've": "you will have", "you're": "you are", "you've": "you have", 'colour': 'color',
                'centre': 'center', 'favourite': 'favorite', 'travelling': 'traveling', 'counselling': 'counseling', 'theatre': 'theater', 'cancelled': 'canceled', 'labour': 'labor',
                'organisation': 'organization', 'wwii': 'world war 2', 'citicise': 'criticize', 'youtu ': 'youtube ', 'Qoura': 'Quora', 'sallary': 'salary', 'Whta': 'What',
                'narcisist': 'narcissist', 'howdo': 'how do', 'whatare': 'what are', 'howcan': 'how can', 'howmuch': 'how much', 'howmany': 'how many', 'whydo': 'why do', 'doI': 'do I',
                'theBest': 'the best', 'howdoes': 'how does', 'Etherium': 'Ethereum',
                'narcissit': 'narcissist', 'bigdata': 'big data', '2k17': '2017', '2k18': '2018', 'qouta': 'quota', 'exboyfriend': 'ex boyfriend', 'airhostess': 'air hostess', "whst": 'what',
                'watsapp': 'whatsapp', 'demonitisation': 'demonetization', 'demonitization': 'demonetization', 'demonetisation': 'demonetization'}

mispell_dict = {k.lower(): v.lower() for k, v in mispell_dict.items()}

In [9]:
def preprocessing_text(s):
    # making our string lowercase & removing extra spaces
    s = str(s).lower().strip()
    
    # remove contractions.
    s = " ".join([mispell_dict[word] if word in mispell_dict.keys() else word for word in s.split()])
    
    # removing \n
    s = re.sub('\n', '', s)
    
    # put spaces before & after punctuations to make words seprate. Like "king?" to "king", "?".
    s = re.sub(r"([?!,+=—&%\'\";:¿।।।|\(\){}\[\]//])", r" \1 ", s)
    
    # Remove more than 2 continues spaces with 1 space.
    s = re.sub('[ ]{2,}', ' ', s).strip()
    
    return s

In [10]:
df['comment'] = df['comment'].apply(preprocessing_text)

In [11]:
# Step 1: Concatenate all text into a single string
text_combined = df['comment'].str.cat(sep=' ')

# Step 2: Tokenize the combined text string into individual words
words_seperated = text_combined.split()

# Step 3: Count the number of unique words
unique_word_count = len(set(words_seperated))

print("Number of unique words:", unique_word_count)


Number of unique words: 271808


In [21]:
# total unique words we are going to use.
TOTAL_WORDS = 271808

# max number of words one sentence can have
MAX_LEN = 50

# width of of 1D embedding vector
EMBEDDING_SIZE = 300

In [13]:
%%time
tokenizer = Tokenizer(num_words=TOTAL_WORDS)
tokenizer.fit_on_texts(list(df['comment']))

train_data = tokenizer.texts_to_sequences(df['comment'])
train_data = pad_sequences(train_data, maxlen = MAX_LEN)
target = df['label']

CPU times: user 40.4 s, sys: 493 ms, total: 40.9 s
Wall time: 41.2 s


In [33]:
#########%%time
###EMBEDDING_FILE = '/Users/DeLaLuna/Downloads/fasttext-crawl-300d-2m/crawl-300d-2M.vec'


def get_coefs(word, *arr): return word, np.asarray(arr, dtype='float32')

embeddings_index = dict(get_coefs(*o.rstrip().rsplit(' ')) for o in tqdm(open(EMBEDDING_FILE)))

word_index = tokenizer.word_index
nb_words = min(TOTAL_WORDS, len(word_index))
embedding_matrix = np.zeros((nb_words, EMBEDDING_SIZE))

FileNotFoundError: [Errno 2] No such file or directory: '/Users/DeLaLuna/Downloads/fasttext-crawl-300d-2m/crawl-300d-2M.vec'

In [14]:
import fasttext


In [35]:
##########EMBEDDING_FILE = '/Users/DeLaLuna/Downloads/fasttext-crawl-300d-2m/crawl-300d-2M.vec'

# Define the function to get word coefficients from the embedding file
###def get_coefs(word, *arr):
    return word, np.asarray(arr, dtype='float32')

# Load the embedding file and create the embeddings_index dictionary
with open(EMBEDDING_FILE, 'r') as file:
    embeddings_index = dict(get_coefs(*o.rstrip().rsplit(' ')) for o in tqdm(file))

# Get the word index from the tokenizer
word_index = tokenizer.word_index

# Determine the number of words and size of the embedding matrix
nb_words = min(TOTAL_WORDS, len(word_index))
embedding_matrix = np.zeros((nb_words, EMBEDDING_SIZE))


FileNotFoundError: [Errno 2] No such file or directory: '/Users/DeLaLuna/Downloads/fasttext-crawl-300d-2m/crawl-300d-2M.vec'

In [1]:
########import io

####def load_vectors(fname):
    fin = io.open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore')
    n, d = map(int, fin.readline().split())
    data = {}
    for line in fin:
        tokens = line.rstrip().split(' ')
        data[tokens[0]] = map(float, tokens[1:])
    return data


In [15]:
from tqdm import tqdm

unique_words = set()

# Iterate over each comment and split into words
for comment in tqdm(df['comment'], desc='Loading unique words', unit=' comments'):
    words = comment.split()
    unique_words.update(words)

# Display the total number of unique words
print("Total unique words:", len(unique_words))


Loading unique words: 100%|█| 1010773/1010773 [00:02<00:00, 385426.82 comments/s

Total unique words: 271808





In [16]:
unique_words

{'fullest.',
 'crashed..',
 'freetime.',
 'hotfixing',
 'xeons.',
 'loooolll',
 'weefee',
 'moderate-carb',
 'non-creationists',
 'blinding',
 'no-contact',
 '*sexist.',
 'money...hmm...',
 'felwort',
 'inproving',
 'tasered',
 'maaaagic',
 'marco_rubio',
 'pen**.',
 'bun',
 'byak',
 'f14-t',
 'oled',
 'sh*t',
 'cordarrelle',
 'breaking*',
 'monologue',
 '15626',
 'braves.',
 'super7',
 'xbox...then',
 'reconnects',
 'sophomore',
 'shmenocide.',
 'heterosexual.',
 'gulliani',
 'statefarm.',
 'knaus.',
 'duchene.',
 'tcs',
 'holoflag',
 'obsessing',
 'momz',
 'wavelike',
 'everybody.',
 'rondae',
 'turned*',
 'hose.',
 'leppa',
 'hurricane',
 '*muslim*.',
 'borowiecki',
 '00001011',
 'cig',
 'easterns',
 'jason..',
 'tastiera',
 'infp.',
 'mono-red',
 'dui.',
 'saints.',
 'online...',
 'ricola',
 'luisteren.',
 '*child*',
 'mencionan',
 'connects...',
 'outlandishly',
 'tom_robinson',
 'upskirts',
 'lionhead',
 'garrus.',
 'jetter',
 'non-cryptocurrency',
 'fps...i',
 '*downvotes',
 'sw

In [18]:

import gensim.downloader as api
model = api.load('word2vec-google-news-300')


In [17]:
# Process the 'comment' column in your custom DataFrame
comments = df['comment'].tolist()

# Example: Find similar words to a given word
similar_words = model.most_similar('king')
print(similar_words)

# Example: Perform vector arithmetic
result = model.most_similar(positive=['king', 'woman'], negative=['man'])
print(result)

# Example: Access the vector representation of a word
vector = model['word']
print(vector)

[('kings', 0.7138046622276306), ('queen', 0.6510956287384033), ('monarch', 0.6413194537162781), ('crown_prince', 0.6204220056533813), ('prince', 0.6159993410110474), ('sultan', 0.5864824056625366), ('ruler', 0.5797567367553711), ('princes', 0.5646552443504333), ('Prince_Paras', 0.543294370174408), ('throne', 0.5422105193138123)]
[('queen', 0.7118191719055176), ('monarch', 0.6189674735069275), ('princess', 0.5902431011199951), ('crown_prince', 0.5499460697174072), ('prince', 0.5377321839332581), ('kings', 0.5236844420433044), ('Queen_Consort', 0.5235945582389832), ('queens', 0.5181134343147278), ('sultan', 0.5098593831062317), ('monarchy', 0.5087412595748901)]
[ 3.59375000e-01  4.15039062e-02  9.03320312e-02  5.46875000e-02
 -1.47460938e-01  4.76074219e-02 -8.49609375e-02 -2.04101562e-01
  3.10546875e-01 -1.05590820e-02 -6.15234375e-02 -1.55273438e-01
 -1.52343750e-01  8.54492188e-02 -2.70996094e-02  3.84765625e-01
  4.78515625e-02  2.58789062e-02  4.49218750e-02 -2.79296875e-01
  9.094

In [None]:
####unique_words = set(df['comment'].str.split().sum())  # Assuming 'comment' is the column containing the text


In [19]:
from tqdm import tqdm

oov_words = []

# Iterate over each word in unique_words and check if it is not present in the model
for word in tqdm(unique_words, desc='Processing words'):
    if word not in model:
        oov_words.append(word)

# Display the total number of OOV words
print("Total OOV words:", len(oov_words))


Processing words: 100%|██████████████| 271808/271808 [00:02<00:00, 92182.17it/s]

Total OOV words: 206852





In [18]:
#vectorise the comments
###comment_vectors = [model[comment_word] for comment_word in comments]


KeyError: "Key 'nc and nh.' not present"

In [23]:
###########import numpy as np

# Determine the vocab_size and embedding_dim
vocab_size = (TOTAL_WORDS + 1)  # Add 1 for the OOV token
embedding_dim = 300  # Example dimensionality, replace with your desired value

# Create the embedding matrix
embedding_matrix = np.random.rand(vocab_size, embedding_dim)


In [25]:
#############for i, word in enumerate(oov_words):
    embedding_matrix[word_index[word]] = np.random.rand(embedding_dim + 1)


NameError: name 'word_index' is not defined

In [26]:
from keras.preprocessing.text import Tokenizer

# Initialize the tokenizer
tokenizer = Tokenizer(num_words=TOTAL_WORDS)
tokenizer.fit_on_texts(list(df['comment']))

# Create the word_index dictionary
word_index = tokenizer.word_index

# Print the number of unique words in the tokenizer
vocab_size = len(word_index)
print("Vocabulary size:", vocab_size)


Vocabulary size: 166508


In [27]:
import numpy as np

# Create the embedding matrix
embedding_matrix = np.random.rand(vocab_size + 1, embedding_dim)

# Initialize the OOV word vectors
for word, i in word_index.items():
    if word in oov_words:
        embedding_matrix[i] = np.random.rand(embedding_dim)


In [29]:
max_length = MAX_LEN


In [28]:
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense

# Define the model
model = Sequential()
model.add(Embedding(vocab_size + 1, embedding_dim, weights=[embedding_matrix], input_length=max_length, trainable=True))
model.add(LSTM(128))
model.add(Dense(1, activation='sigmoid'))

# Compile and train the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(train_data, target, batch_size=32, epochs=10, validation_split=0.2)


NameError: name 'max_length' is not defined

In [31]:
import numpy as np
from tqdm import tqdm


# Define the model architecture and set the embedding layer with the embedding matrix
model = Sequential()
model.add(Embedding(vocab_size + 1, embedding_dim, weights=[embedding_matrix], input_length=max_length, trainable=True))
model.add(LSTM(128))
model.add(Dense(1, activation='sigmoid'))


In [None]:
from tqdm import tqdm

# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Define the number of epochs
epochs = 10
batch_size = 32

# Train the model with tqdm progress bar
for epoch in tqdm(range(epochs)):
    # Shuffle the training data
    indices = np.arange(len(train_data))
    np.random.shuffle(indices)
    shuffled_train_data = train_data[indices]
    shuffled_target = target.iloc[indices]  # Corrected indexing for target
    
    # Iterate over mini-batches and update the model
    for i in range(0, len(train_data), batch_size):
        batch_train_data = shuffled_train_data[i:i+batch_size]
        batch_target = shuffled_target[i:i+batch_size]
        
        # Train on the batch
        model.train_on_batch(batch_train_data, batch_target)

# Evaluate the model
loss, accuracy = model.evaluate(train_data, target)
print('Loss:', loss)
print('Accuracy:', accuracy)


  0%|                                                    | 0/10 [00:00<?, ?it/s]