# Import packages and Libraries

In [1]:
import os
import time
import pandas as pd
import numpy as np
import pickle as pkl

import re
import nltk
import string
from nltk.tokenize.simple import *
from nltk.corpus import stopwords
from itertools import groupby

from sklearn import model_selection, preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.utils import class_weight
# from sklearn.metrics import accuracy_score, confusion_matrix, cohen_kappa_score, precision_score, matthews_corrcoef, roc_auc_score, balanced_accuracy_score, recall_score, f1_score, make_scorer

import fasttext
import fasttext.util

from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer


from tqdm import tqdm
tqdm().pandas()

import warnings
warnings.filterwarnings('ignore')
stop = stopwords.words('english')

!wget https://dl.fbaipublicfiles.com/fasttext/vectors-english/crawl-300d-2M-subword.zip
!unzip crawl-300d-2M-subword.zip
pretrained = fasttext.FastText.load_model('crawl-300d-2M-subword.bin')

0it [00:00, ?it/s]


--2023-02-28 18:52:52--  https://dl.fbaipublicfiles.com/fasttext/vectors-english/crawl-300d-2M-subword.zip
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 104.22.75.142, 172.67.9.4, 104.22.74.142, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|104.22.75.142|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5828358084 (5.4G) [application/zip]
Saving to: ‘crawl-300d-2M-subword.zip’


2023-02-28 18:57:20 (20.8 MB/s) - ‘crawl-300d-2M-subword.zip’ saved [5828358084/5828358084]

Archive:  crawl-300d-2M-subword.zip
  inflating: crawl-300d-2M-subword.vec  
  inflating: crawl-300d-2M-subword.bin  




# Import data

In [2]:
# Tweets come from a CSV file, headlines from a JSON-lines file
# (one JSON object per line, hence `lines=True`).
tweets_path = '../input/isarcasm-tweets-rephrase/iSarcasm.csv'
headlines_path = "../input/news-headlines-dataset-for-sarcasm-detection/Sarcasm_Headlines_Dataset_v2.json"

data = pd.read_csv(tweets_path)
news = pd.read_json(headlines_path, lines=True)

### Handling null values

In [3]:
# Rows with no tweet text are dropped outright.
data = data.dropna(subset=['tweet'])

# Missing values in the sub-type flag columns are filled with 0.
subtype_columns = ['sarcasm','irony','satire','understatement','overstatement','rhetorical_question']
data[subtype_columns] = data[subtype_columns].fillna(0)

# Missing rephrases are filled with the placeholder string 'NA'.
data['rephrase'] = data['rephrase'].fillna('NA')

In [4]:
# Print, for each dataset, how many rows match each label condition.
separator = '----------------------------------------------------------------------------------'

tweet_rows = [
    (data.sarcastic == 0, 'tweets that are not sarcastic'),
    (data.sarcastic == 1, 'tweets that are sarcastic'),
    (data.sarcasm == 1, 'tweets with pure sarcasm'),
    (data.irony == 1, 'tweets that are ironic'),
    (data.satire == 1, 'tweets with satire'),
    (data.understatement == 1, 'tweets that are understatement'),
    (data.overstatement == 1, 'tweets that are overstatement'),
    (data.rhetorical_question == 1, 'tweets that are rhetorical question'),
]
news_rows = [
    (news.is_sarcastic == 0, 'news that are not sarcastic'),
    (news.is_sarcastic == 1, 'news that are sarcastic'),
]

for heading, rows in (('Tweet data', tweet_rows), ('News Headlines data', news_rows)):
    print(separator)
    print(heading)
    print(separator)
    for mask, description in rows:
        # Number of True entries in the mask == number of matching rows.
        print('there are ', int(mask.sum()), description)

----------------------------------------------------------------------------------
Tweet data
----------------------------------------------------------------------------------
there are  2600 tweets that are not sarcastic
there are  867 tweets that are sarcastic
there are  713 tweets with pure sarcasm
there are  155 tweets that are ironic
there are  25 tweets with satire
there are  10 tweets that are understatement
there are  40 tweets that are overstatement
there are  101 tweets that are rhetorical question
----------------------------------------------------------------------------------
News Headlines data
----------------------------------------------------------------------------------
there are  14985 news that are not sarcastic
there are  13634 news that are sarcastic


# Data cleansing

In [5]:
# Quote characters (straight and typographic) stripped from the text.
quotes = list('"“”‘’\'')

# Contractions that are expanded explicitly before the generic "n't" rule,
# because that rule alone would not produce these expansions.
words_to_replace = dict([
    ("won't", "will not"),
    ("can't", "can not"),
])

def replace_words(x, dict):
    '''
    Apply every substring replacement of a mapping to a string, in mapping order.
    @param x: (str) text to process
    @param dict: (mapping) substring -> replacement
    @return: (str) text with all replacements applied
    '''
    result = x
    for target in dict:
        result = result.replace(target, dict[target])
    return result

# Compiled once at definition time instead of on every call.
# Each entry is a narrow block or single code point: one wide range such as
# U+24C2..U+1F251 would also cover CJK, Hangul and many other scripts and
# silently delete ordinary text.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # flags (regional indicators)
    "\U0001F900-\U0001F9FF"  # supplemental symbols & pictographs
    "\U0001FA70-\U0001FAFF"  # symbols & pictographs extended-A
    "\U0001F170-\U0001F251"  # enclosed alphanumeric / ideographic supplement
    "\U00002600-\U000026FF"  # miscellaneous symbols
    "\U00002702-\U000027B0"  # dingbats
    "\U00002B00-\U00002BFF"  # miscellaneous symbols and arrows
    "\U0001F004\U0001F0CF"   # mahjong red dragon, playing-card joker
    "\U000024C2"             # circled M
    "\U0000FE0F"             # variation selector-16 (emoji presentation)
    "\U00003030\U0000303D\U00003297\U00003299"  # CJK-block emoji symbols
    "]+",
    flags=re.UNICODE,
)


def remove_emoji(x):
    '''
    Function to remove emojis, symbols and pictograms etc from text.
    Characters outside the emoji/symbol blocks listed in _EMOJI_PATTERN
    (including non-Latin scripts) are left untouched.
    @param x: (str) sentences
    @return: (str) clean text
    '''
    return _EMOJI_PATTERN.sub('', x)

def pos_tagging(x):
    '''
    Split text into sentences, tokenise each sentence into words and
    part-of-speech tag them with nltk.
    @param x: (str) text to tag
    @return: (list) the per-sentence nltk.pos_tag results flattened into one list
    '''
    tagged_tokens = []
    for sentence in nltk.sent_tokenize(x):
        tagged_tokens.extend(nltk.pos_tag(nltk.word_tokenize(sentence)))
    return tagged_tokens

def extract_pos(x):
    '''
    Collect the second element of every item in a sequence.
    @param x: (list) indexable items, e.g. the output of pos_tagging
    @return: (list) element [1] of each item, in order
    '''
    tags = []
    for pair in x:
        tags.append(pair[1])
    return tags

def extract_pos_pattern(x):
    '''
    Collapse runs of consecutive equal items and join what is left with spaces.
    @param x: (list) sequence of strings, e.g. the output of extract_pos
    @return: (str) space-separated items with consecutive duplicates merged
    '''
    collapsed = []
    for tag, _run in groupby(x):
        collapsed.append(tag)
    return " ".join(collapsed)

def chunking(x):
    '''
    Run nltk's named-entity chunker on POS-tagged tokens.
    @param x: (list) tagged tokens, e.g. the output of pos_tagging
    @return: (list) single-element list holding the nltk.chunk.ne_chunk result
    '''
    return [nltk.chunk.ne_chunk(x)]

In [6]:
def data_cleansing(df,text_field):
    '''
    Build cleaned-text and part-of-speech columns from a raw text column.

    Columns added to the returned frame:
      tags, hashtags, url : comma-joined @mentions, #hashtags and http(s) URLs
                            found in the raw text
      clean_data          : lower-cased text with mentions, hashtags, URLs,
                            punctuation, quote characters and emojis removed
                            and "n't" contractions expanded
      clean_data_ws       : clean_data with the words of the module-level
                            `stop` list removed
      pos_tagged_data     : string form of the pos_tagging output, without the
                            surrounding brackets
      pos                 : space-separated POS tags
      pos_pattern         : POS tags with consecutive duplicates collapsed
      chunked_data        : output of chunking on the tagged tokens

    @param df: (DataFrame) frame containing `text_field`; it is NOT modified
    @param text_field: (str) name of the raw text column
    @return: (DataFrame) copy of `df` with the columns above added
    '''
    # Work on a copy: callers pass column slices such as news[['headline']],
    # and writing new columns into a slice is exactly what triggers pandas'
    # SettingWithCopyWarning and can leak changes into the caller's frame.
    df = df.copy()

    mention_re = re.compile(r'(@[^\s]+)')
    hashtag_re = re.compile(r'(#[^\s]+)')
    url_re = re.compile(r'(https?://[^\s]+)')
    # Built with `re` directly so the alternation is always treated as a
    # regex; Series.str.replace only does that when regex=True is in effect.
    quote_re = re.compile('|'.join(re.escape(q) for q in quotes))
    punctuation_table = str.maketrans('', '', string.punctuation)

    # Metadata is extracted from the raw (not lower-cased) text.
    df['tags'] = df[text_field].apply(lambda x: ','.join(mention_re.findall(str(x))))
    df['hashtags'] = df[text_field].apply(lambda x: ','.join(hashtag_re.findall(str(x))))
    df['url'] = df[text_field].apply(lambda x: ','.join(url_re.findall(str(x))))

    def _clean(text):
        # Full per-document cleaning pipeline; order matters (see comments).
        text = str(text)
        text = mention_re.sub('', text)
        text = hashtag_re.sub('', text)
        text = url_re.sub('', text)
        # Typographic apostrophes are normalised first, otherwise "don’t"
        # would skip contraction expansion and end up as "dont".
        text = text.replace('\u2019', "'").replace('\u2018', "'")
        # Contractions must be expanded before punctuation is stripped,
        # because stripping removes the apostrophe they are matched on.
        text = replace_words(text, words_to_replace)
        text = text.replace("n't", " not")
        text = text.translate(punctuation_table)
        text = quote_re.sub('', text)
        return remove_emoji(text)

    df['clean_data'] = df[text_field].str.lower().apply(_clean)

    # Set lookup instead of scanning the stop-word list for every token.
    stop_words = set(stop)
    df['clean_data_ws'] = df['clean_data'].apply(
        lambda x: ' '.join(word for word in x.split() if word not in stop_words))

    # Tag each document once and derive every POS column from that result;
    # the tagger is by far the most expensive step here.
    tagged = df['clean_data'].apply(pos_tagging)
    tag_lists = tagged.apply(extract_pos)

    df['pos_tagged_data'] = tagged.apply(lambda pairs: str(pairs).strip('[]'))
    df['pos'] = tag_lists.apply(lambda tags: " ".join(map(str, tags)))
    df['pos_pattern'] = tag_lists.apply(extract_pos_pattern)
    df['chunked_data'] = tagged.apply(chunking)
    return df


In [7]:
# Cleanse the headline text of the news dataset.
clean_news = data_cleansing(df=news[['headline']], text_field='headline')

In [8]:
# Cleanse the tweet text of the tweet dataset.
clean_tweet = data_cleansing(df=data[['tweet']], text_field='tweet')

In [9]:
# Keep the cleaned-text / POS columns and re-attach the label columns
# from the source frames (index-aligned join).
feature_columns = ['clean_data','clean_data_ws','pos_tagged_data','pos']
tweet_label_columns = ['sarcastic','sarcasm','irony','satire','understatement','overstatement','rhetorical_question']
news_label_columns = ['is_sarcastic']

tweet_df = clean_tweet[feature_columns].join(data[tweet_label_columns])
news_df = clean_news[feature_columns].join(news[news_label_columns])

In [10]:
# Write both prepared frames to CSV in the working directory.
for frame, csv_name in ((tweet_df, 'tweet_data.csv'), (news_df, 'news_data.csv')):
    frame.to_csv(csv_name)

# Create vectors and embedded texts

In [11]:
def _dump_sequence(path, objects):
    # Pickle each object back-to-back into one file; readers must call
    # pickle.load once per object, in the same order.
    with open(path, 'wb') as f:
        for obj in objects:
            pkl.dump(obj, f)


def create_vectors_embeddings(df, TEXT, LABEL, prefix, max_len=300, embedding_dim=300):
    '''
    Split a cleaned dataset, build bag-of-words / TF-IDF / padded-sequence
    features and an embedding matrix, and pickle everything to disk.

    Files written (each holds several objects pickled one after another):
      <prefix>_x_values.pkl   : train count, tfidf, tfidf-ngram, tfidf-char,
                                sequences, then the same five for validation
      <prefix>_y_values.pkl   : train_y_sw, train_y, valid_y_sw, valid_y
      <prefix>_word_index_labels_weights.pkl : word_index, labels, class_weights
      <prefix>_embedding_matrix.pkl          : embedding_matrix

    @param df: (DataFrame) must contain columns TEXT, TEXT + '_ws' and LABEL
    @param TEXT: (str) name of the cleaned text column
    @param LABEL: (str) name of the target column
    @param prefix: (str) prefix of the output pickle file names
    @param max_len: (int) length every token sequence is padded/truncated to
    @param embedding_dim: (int) width of the embedding matrix; must equal the
                          vector size returned by the module-level `pretrained`
                          model
    @return: None
    '''
    text_ws = TEXT + '_ws'

    # One stratified split shared by both text views, so the stop-word-free
    # features and the sequences for the embeddings always describe the same rows.
    (train_x_sw, valid_x_sw,
     train_x, valid_x,
     y_train, y_valid) = model_selection.train_test_split(
        df[text_ws], df[TEXT], df[LABEL],
        random_state=42, stratify=df[LABEL], test_size=0.2)

    # Fit the encoder once on all labels and only *transform* the splits:
    # refitting per split would assign different codes if a class were
    # missing from one of them.
    encoder = preprocessing.LabelEncoder()
    encoder.fit(df[LABEL])
    train_y = encoder.transform(y_train)
    valid_y = encoder.transform(y_valid)
    # Both views share one split, so their targets are identical; both names
    # are kept because the y-values pickle stores four entries.
    train_y_sw, valid_y_sw = train_y, valid_y

    class_weights = class_weight.compute_class_weight(class_weight = 'balanced', classes = np.unique(y_train), y = y_train)
    print(*[f'Class weight: {round(i[0],4)}\tclass: {i[1]}' for i in zip(class_weights, np.unique(y_train))], sep='\n')

    # Report whether the dataset is balanced: minority/majority ratio,
    # with 1:10 as the balanced / imbalanced limit.
    label_counts = df[LABEL].value_counts()
    ratio = label_counts.min() / label_counts.max()
    if ratio > 0.1:
        print(f"\nThe dataset is balanced (ratio={round(ratio, 3)})")
    else:
        print(f"\nThe dataset is imbalanced (ratio={round(ratio, 3)})")
        # TODO: resampling for imbalanced data (e.g. imblearn ADASYN) is not implemented yet.

    # Original label values ordered by their encoded value.
    labels = encoder.classes_.tolist()

    # NOTE(review): as before, the vectorizers and the tokenizer are fitted on
    # the whole frame (train + validation), so validation text influences the
    # vocabulary and IDF weights. Fit on the training split only if a strict
    # hold-out is required.

    # Every vectorizer below is fitted on the stop-word-free column, which is
    # the column it then transforms.

    # create a count vectorizer object
    count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}')
    count_vect.fit(df[text_ws])
    xtrain_count = count_vect.transform(train_x_sw)
    xvalid_count = count_vect.transform(valid_x_sw)

    # word level tf-idf
    tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=10000)
    tfidf_vect.fit(df[text_ws])
    xtrain_tfidf = tfidf_vect.transform(train_x_sw)
    xvalid_tfidf = tfidf_vect.transform(valid_x_sw)
    print("word level tf-idf done")

    # ngram level tf-idf
    tfidf_vect_ngram = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=10000)
    tfidf_vect_ngram.fit(df[text_ws])
    xtrain_tfidf_ngram = tfidf_vect_ngram.transform(train_x_sw)
    xvalid_tfidf_ngram = tfidf_vect_ngram.transform(valid_x_sw)
    print("ngram level tf-idf done")

    # characters level tf-idf
    tfidf_vect_ngram_chars = TfidfVectorizer(analyzer='char', ngram_range=(2,3), max_features=10000)
    tfidf_vect_ngram_chars.fit(df[text_ws])
    xtrain_tfidf_ngram_chars = tfidf_vect_ngram_chars.transform(train_x_sw)
    xvalid_tfidf_ngram_chars = tfidf_vect_ngram_chars.transform(valid_x_sw)
    print("characters level tf-idf done")

    # create a tokenizer (fitted on the text that still contains stop words,
    # which is the text the sequences are built from)
    token = Tokenizer()
    token.fit_on_texts(df[TEXT])
    word_index = token.word_index

    # convert text to sequence of tokens and pad them to ensure equal length vectors
    train_seq_x = sequence.pad_sequences(token.texts_to_sequences(train_x), maxlen=max_len)
    valid_seq_x = sequence.pad_sequences(token.texts_to_sequences(valid_x), maxlen=max_len)

    # create token-embedding mapping; the matrix has len(word_index) + 1 rows
    # and row 0 is never assigned, so it stays all-zero.
    embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))
    for word, i in tqdm(word_index.items()):
        embedding_vector = pretrained.get_word_vector(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

    _dump_sequence(prefix+'_x_values.pkl', [
        xtrain_count, xtrain_tfidf, xtrain_tfidf_ngram, xtrain_tfidf_ngram_chars, train_seq_x,
        xvalid_count, xvalid_tfidf, xvalid_tfidf_ngram, xvalid_tfidf_ngram_chars, valid_seq_x,
    ])
    _dump_sequence(prefix+'_y_values.pkl', [train_y_sw, train_y, valid_y_sw, valid_y])
    _dump_sequence(prefix+'_word_index_labels_weights.pkl', [word_index, labels, class_weights])
    _dump_sequence(prefix+'_embedding_matrix.pkl', [embedding_matrix])

In [12]:
# Build and pickle the features for the tweet dataset (files prefixed 'tweet').
create_vectors_embeddings(df=tweet_df, TEXT='clean_data', LABEL='sarcastic', prefix='tweet')

Class weight: 0.6666	class: 0
Class weight: 2.0007	class: 1

The dataset is balanced (ratio=0.333)
word level tf-idf done
ngram level tf-idf done
characters level tf-idf done


100%|██████████| 9567/9567 [00:00<00:00, 59793.90it/s]


In [13]:
# Build and pickle the features for the news dataset (files prefixed 'news').
create_vectors_embeddings(df=news_df, TEXT='clean_data', LABEL='is_sarcastic', prefix='news')

Class weight: 0.9549	class: 0
Class weight: 1.0496	class: 1

The dataset is balanced (ratio=0.91)
word level tf-idf done
ngram level tf-idf done
characters level tf-idf done


100%|██████████| 29599/29599 [00:00<00:00, 56802.58it/s]


In [14]:
# Delete the downloaded fastText files from the working directory.
# The names are relative, matching how the download cell wrote them, rather
# than assuming the working directory is /kaggle/working. Files that are
# already gone are skipped so the cell can be re-run without raising.
for suffix in ('bin', 'vec', 'zip'):
    artefact = f'crawl-300d-2M-subword.{suffix}'
    if os.path.exists(artefact):
        os.remove(artefact)