### This will include more analysis on top of previous Srcasm Detection.
### Which will inlcude Word3vec,MLP models

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
import gensim.downloader

from sklearn.neural_network import MLPClassifier

In [2]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\acer\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\acer\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [4]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.naive_bayes import MultinomialNB

In [5]:
file_name='train-balanced-sarcasm.csv'

In [6]:
train_df = pd.read_csv(file_name)

In [7]:
train_df.dropna(axis=0,subset=['comment'],inplace=True)

In [8]:
train_df[train_df.isnull().any(axis=1)]['comment']

Series([], Name: comment, dtype: object)

In [9]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
def preprocess(row):
    row = row.lower()
    row = re.sub(r'[^\w\s]', '', row)
    words = row.split()
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    return ' '.join(words) if words else 'empty'

        

In [10]:
def get_features(X_train,train_data=False):
    X=[]
    for row in X_train:
        row=preprocess(row)
        X.append(row)
    if(train_data==True):
        X_tfidf=tfidf.fit_transform(X)
    else:
        X_tfidf=tfidf.transform(X)
    return X_tfidf
    
        

In [11]:
def train_model(X_train,y_train,model):
    X=get_features(X_train,train_data=True)
    model.fit(X,y_train)
    return model

In [12]:
X_train,X_test,y_train,y_pred=train_test_split(train_df['comment'],train_df['label'],test_size=0.3,train_size=0.7)
tfidf = TfidfVectorizer(stop_words='english',ngram_range=(1, 2))
LR_model=LogisticRegression(random_state=42,max_iter=1000)
model=train_model(X_train,y_train,LR_model)

In [13]:
def model_predict(trained_model,X_test,y_test):
    X=get_features(X_test,train_data=False)
    predictions=trained_model.predict(X)
    accuracy = accuracy_score(y_test, predictions)
    precision = precision_score(y_test, predictions)
    recall = recall_score(y_test, predictions)
    f1 = f1_score(y_test, predictions)
    cm=confusion_matrix(y_test,predictions)
    # Print metrics for analysis
    print(f"Accuracy: {accuracy:.2f}")
    print(f"Precision: {precision:.2f}")
    print(f"Recall: {recall:.2f}")
    print(f"F1 Score: {f1:.2f}")
    print(cm)

In [14]:
model_predict(model,X_test,y_pred)

Accuracy: 0.68
Precision: 0.69
Recall: 0.67
F1 Score: 0.68
[[105114  46295]
 [ 49653 102170]]


In [15]:
#X_train,X_test,y_train,y_pred=train_test_split(train_df['comment'],train_df['label'],test_size=0.3,train_size=0.7)
#tfidf = TfidfVectorizer(stop_words='english')
nb=MultinomialNB()
model=train_model(X_train,y_train,nb)

In [16]:
model_predict(model,X_test,y_pred)

Accuracy: 0.68
Precision: 0.67
Recall: 0.69
F1 Score: 0.68
[[ 99964  51445]
 [ 46703 105120]]


## Working On LR with Word 2 vec

In [17]:
import gensim.downloader

In [18]:
word2Vec = gensim.downloader.load('glove-twitter-200')

In [19]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess(row):
    row = row.lower()
    row = re.sub(r'[^\w\s]', '', row)
    words = row.split()
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    return ' '.join(words) if words else 'empty'

        

In [20]:
def create_vector_with_word2vec(texts):
    training_vectors=[]
    for row in texts:
        row = preprocess(row)  # Preprocessing remains consistent
        words = row.split(" ")
        vector_list = []
        for word in words:
            if word in word2Vec:  # Ensure word is in the GloVe model
                vector_list.append(word2Vec[word])
        if vector_list:
            training_vectors.append(np.max(vector_list, axis=0))  # Simple average across vectors
        else:
            training_vectors.append(np.zeros(word2Vec.vector_size))# Return zeros if no vectors found
    return training_vectors

In [21]:
def train_LR_with_word2vec(X_train,y_train,LR_model):
    return LR_model.fit(X_train,y_train)
    

In [27]:
X_train,X_test,y_train,y_test=train_test_split(train_df['comment'],train_df['label'],test_size=0.2,train_size=0.8)
LR_model=LogisticRegression(random_state=42,max_iter=1000,class_weight='balanced',C=1)
training_vectors=create_vector_with_word2vec(X_train)
model=train_LR_with_word2vec(training_vectors,y_train,LR_model)

In [28]:
def model_predict(trained_model,X_test,y_test):
    def get_text_vector(text):
        text = preprocess(text)
        words = text.split(" ")
        vector_list = []
        for word in words:
            if word in word2Vec:
                vector_list.append(word2Vec[word])
        if vector_list:
            return np.mean(vector_list, axis=0)
        else:
            return np.zeros(word2Vec.vector_size)  # Zero vector if no valid words
    X_test_vec=[]
    for row in X_test:
        X_test_vec.append(get_text_vector(row))
    predictions=trained_model.predict(X_test_vec)
    accuracy = accuracy_score(y_test, predictions)
    precision = precision_score(y_test, predictions)
    recall = recall_score(y_test, predictions)
    f1 = f1_score(y_test, predictions)
    cm=confusion_matrix(y_test,predictions)
    # Print metrics for analysis
    print(f"Accuracy: {accuracy:.2f}")
    print(f"Precision: {precision:.2f}")
    print(f"Recall: {recall:.2f}")
    print(f"F1 Score: {f1:.2f}")
    print(cm)
    tn, fp, fn, tp = confusion_matrix(y_test,predictions).ravel()
    print(tn, fp, fn, tp)

In [29]:
model_predict(model,X_test,y_test)

Accuracy: 0.56
Precision: 0.69
Recall: 0.21
F1 Score: 0.32
[[91212  9555]
 [80129 21259]]
91212 9555 80129 21259


In [31]:
#MLP

In [32]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess(row):
    row = row.lower()
    row = re.sub(r'[^\w\s]', '', row)
    words = row.split()
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    return ' '.join(words) if words else 'empty'

        

In [33]:
# Function to get the average vector for a set of reviews
def get_average_vector(texts,label):    
    training_vectors=[]
    count_label=[]
    for row in texts:
        row = preprocess(row)  # Preprocessing remains consistent
        words = row.split(" ")
        vector_list = []
        for word in words:
            if word in word2Vec:  # Ensure word is in the GloVe model
                vector_list.append(word2Vec[word])
        if vector_list:
            training_vectors.append(np.mean(vector_list, axis=0))  # Simple average across vectors
        else:
            training_vectors.append(np.zeros(word2Vec.vector_size))# Return zeros if no vectors found
        count_label.append(label)
    return training_vectors,count_label

In [40]:
def train_MLP_model_average(train_df):
    #word2Vec = gensim.downloader.load('glove-twitter-25')
    positive_labels=train_df[train_df['label']==1]['comment']
    negative_labels=train_df[train_df['label']==0]['comment']
    positive_vector,positive_label=get_average_vector(positive_labels,1)
    negative_vector,negative_label=get_average_vector(negative_labels,0)
    X_train=positive_vector+negative_vector
    y_train=positive_label+negative_label
    mlp = MLPClassifier(hidden_layer_sizes=(100, 100, 100), max_iter=500, random_state=42)
    # Convert lists to numpy arrays for model training
    print(len(X_train),len(y_train))
    X_train = np.array(X_train)
    y_train = np.array(y_train)
    mlp.fit(X_train,y_train)
    return mlp

In [38]:
from sklearn.model_selection import train_test_split

train,test = train_test_split(train_df, test_size=0.2, random_state=42)

In [41]:
mlp=train_MLP_model_average(train)

808616 808616


In [42]:
def test_MLP_model(test, MLP_model, input_type='average'):
    '''df_test = pd.read_csv(path_to_test_file, sep='\t', header=None)
    df_test = df_test.rename(columns={0: 'text', 1: 'label'})'''

    # Helper function to choose between average and max pooling
    def get_text_vector(text):
        text = preprocess(text)
        words = text.split(" ")
        vector_list = []
        for word in words:
            if word in word2Vec:
                vector_list.append(word2Vec[word])
        if vector_list:
            if input_type == 'average':
                return np.mean(vector_list, axis=0)  # Average vector
        else:
            return np.zeros(word2Vec.vector_size)  # Zero vector if no valid words

    # Create vectors for the test set
    X_test = np.array([get_text_vector(row) for row in test['comment']])
    y_test = test['label'].values
    
    # Get predictions and probabilities
    y_pred = MLP_model.predict(X_test)
    y_prob = MLP_model.predict_proba(X_test)[:, 1]  # Probability of being positive

    # Add new columns to the test dataframe
    test['probability_positive'] = y_prob
    test['predicted_class'] = y_pred

    # Calculate metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='binary')
    recall = recall_score(y_test, y_pred, average='binary')
    conf_matrix = confusion_matrix(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='binary')

    # Print metrics
    print(f"Accuracy: {accuracy}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F1 Score: {f1}")
    print(f"Confusion Matrix:\n{conf_matrix}")


In [43]:
test_MLP_model(test,mlp) #twitter 200 without smote

Accuracy: 0.650495906606317
Precision: 0.6635338547533983
Recall: 0.6114950711397186
F1 Score: 0.6364525125292005
Confusion Matrix:
[[69655 31361]
 [39293 61846]]
