# NLP: Sarcasm Sentiment Analysis and Prediction


## Importing and Checking Dataset


In [1]:
import pandas as pd
import numpy as np 
import nltk 
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import re
import contractions
import string
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report
import textblob
from sklearn.feature_extraction.text import CountVectorizer
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\aviko\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\aviko\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\aviko\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\aviko\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [2]:
df = pd.read_csv("Sarcasm_Train_Dataset.csv")
df.head()

Unnamed: 0,headline,is_sarcastic
0,supreme court votes 7-2 to legalize all worldl...,1
1,hungover man horrified to learn he made dozens...,1
2,emily's list founder: women are the 'problem s...,0
3,send your kids back to school with confidence,0
4,watch: experts talk pesticides and health,0


### Checking the data(types and missing values)

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44262 entries, 0 to 44261
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   headline      44262 non-null  object
 1   is_sarcastic  44262 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 691.7+ KB


In [4]:
df.isna().sum()

headline        0
is_sarcastic    0
dtype: int64

In [5]:
df.isnull().sum()

headline        0
is_sarcastic    0
dtype: int64

No missing values were found.

## Normalized Corpus Function


In [6]:
stop_words = nltk.corpus.stopwords.words('english')
def normalize_document(doc):
    doc = re.sub(r'[^a-zA-Z0-9\s]', '', doc, re.I|re.A)
    doc = doc.lower()
    doc = doc.strip()
    doc = contractions.fix(doc)
    tokens = nltk.word_tokenize(doc)
    filtered_tokens = [token for token in tokens if token not in stop_words]
    doc = ' '.join(filtered_tokens)
    return doc
normalize_corpus = np.vectorize(normalize_document)

norm_corpus = normalize_corpus(list(df['headline']))
len(norm_corpus)

44262

Here I processed the text data, which involved making everything lowercase, stripping white space, removing stop words, and expanding contracted words. I do this so that this text data can be used as a normalized corpus for the unsupervised model.

## Building Model


### Splitting Data


In [7]:
X_train, X_test, y_train, y_test = train_test_split(df[['headline']], df['is_sarcastic'], test_size = 0.2, random_state=42)

print("The training set has {} rows and {} column.\nThe test set has {} rows and {} column.".
      format(X_train.shape[0], X_train.shape[1], X_test.shape[0], X_test.shape[1]))

The training set has 35409 rows and 1 column.
The test set has 8853 rows and 1 column.


Splitting the data for training and testing. 

### Count-Based Features


Using the basic NLP count-based features to analyze the data(like word count, character count, word density, etc.). 

In [8]:
import string

X_train['char_count'] = X_train['headline'].apply(len)
X_train['word_count'] = X_train['headline'].apply(lambda x: len(x.split()))
X_train['word_density'] = X_train['char_count'] / (X_train['word_count']+1)
X_train['punctuation_count'] = X_train['headline'].apply(lambda x: len("".join(_ for _ in x if _ in string.punctuation))) 
X_train['title_word_count'] = X_train['headline'].apply(lambda x: len([wrd for wrd in x.split() if wrd.istitle()]))
X_train['upper_case_word_count'] = X_train['headline'].apply(lambda x: len([wrd for wrd in x.split() if wrd.isupper()]))


X_test['char_count'] = X_test['headline'].apply(len)
X_test['word_count'] = X_test['headline'].apply(lambda x: len(x.split()))
X_test['word_density'] = X_test['char_count'] / (X_test['word_count']+1)
X_test['punctuation_count'] = X_test['headline'].apply(lambda x: len("".join(_ for _ in x if _ in string.punctuation))) 
X_test['title_word_count'] = X_test['headline'].apply(lambda x: len([wrd for wrd in x.split() if wrd.istitle()]))
X_test['upper_case_word_count'] = X_test['headline'].apply(lambda x: len([wrd for wrd in x.split() if wrd.isupper()]))

In [9]:
X_train.head()

Unnamed: 0,headline,char_count,word_count,word_density,punctuation_count,title_word_count,upper_case_word_count
977,rep. david cicilline: lgbt people are entitled...,65,10,5.909091,4,0,0
18954,inflating the russian threat,28,4,5.6,0,0,0
11070,former senator to run pot company,33,6,4.714286,0,0,0
34583,white liberals celebrating tomi lahren's daily...,83,12,6.384615,1,0,0
28075,marco rubio doesn't have a clue what 'oscars s...,60,11,5.0,3,0,0


### Sentiment Analysis

Using the Text Blob library to perform sentiment analysis. 

In [10]:
x_train_snt_obj = X_train['headline'].apply(lambda row: textblob.TextBlob(row).sentiment)
X_train['Polarity'] = [obj.polarity for obj in x_train_snt_obj.values]
X_train['Subjectivity'] = [obj.subjectivity for obj in x_train_snt_obj.values]

x_test_snt_obj = X_test['headline'].apply(lambda row: textblob.TextBlob(row).sentiment)
X_test['Polarity'] = [obj.polarity for obj in x_test_snt_obj.values]
X_test['Subjectivity'] = [obj.subjectivity for obj in x_test_snt_obj.values]

In [11]:
X_train.head()

Unnamed: 0,headline,char_count,word_count,word_density,punctuation_count,title_word_count,upper_case_word_count,Polarity,Subjectivity
977,rep. david cicilline: lgbt people are entitled...,65,10,5.909091,4,0,0,0.35,0.55
18954,inflating the russian threat,28,4,5.6,0,0,0,0.0,0.0
11070,former senator to run pot company,33,6,4.714286,0,0,0,0.0,0.0
34583,white liberals celebrating tomi lahren's daily...,83,12,6.384615,1,0,0,-0.066667,0.016667
28075,marco rubio doesn't have a clue what 'oscars s...,60,11,5.0,3,0,0,0.0,0.0


### Adding Bag of Words


The final step before finally building the model is to use Bag of Words, in order to turn the text data into numbers so that the prediction algorithm we use can understand what is being put in. For this, process the text to expand contractiongs, remove white space and unecessary characters, and introduce stemming. 

In [12]:
stop_words = nltk.corpus.stopwords.words('english')
stop_words.remove('no')
stop_words.remove('not')
stop_words.remove('but')
ps = nltk.porter.PorterStemmer()

def simple_text_preprocessor(document): 
    document = str(document).lower()
    document = contractions.fix(document)
    document = re.sub(r'[^a-zA-Z]',r' ', document)
    document = re.sub(r'nbsp', r'', document)
    document = re.sub(' +', ' ', document)
    document = ' '.join([ps.stem(word) for word in document.split()])
    document = ' '.join([word for word in document.split() if word not in stop_words])    
    return document
stp = np.vectorize(simple_text_preprocessor)

In [13]:
X_train['clean headline'] = stp(X_train['headline'].values)
X_test['clean headline'] = stp(X_test['headline'].values)

X_train.head()

Unnamed: 0,headline,char_count,word_count,word_density,punctuation_count,title_word_count,upper_case_word_count,Polarity,Subjectivity,clean headline
977,rep. david cicilline: lgbt people are entitled...,65,10,5.909091,4,0,0,0.35,0.55,rep david cicillin lgbt peopl entitl full equal
18954,inflating the russian threat,28,4,5.6,0,0,0,0.0,0.0,inflat russian threat
11070,former senator to run pot company,33,6,4.714286,0,0,0,0.0,0.0,former senat run pot compani
34583,white liberals celebrating tomi lahren's daily...,83,12,6.384615,1,0,0,-0.066667,0.016667,white liber celebr tomi lahren daili show inte...
28075,marco rubio doesn't have a clue what 'oscars s...,60,11,5.0,3,0,0,0.0,0.0,marco rubio doe not clue oscar white mean


In [14]:
X_train_metadata = X_train.drop(['headline', 'clean headline'], axis=1).reset_index(drop=True)
X_test_metadata = X_test.drop(['headline', 'clean headline'], axis=1).reset_index(drop=True)

X_train_metadata.head()

Unnamed: 0,char_count,word_count,word_density,punctuation_count,title_word_count,upper_case_word_count,Polarity,Subjectivity
0,65,10,5.909091,4,0,0,0.35,0.55
1,28,4,5.6,0,0,0,0.0,0.0
2,33,6,4.714286,0,0,0,0.0,0.0
3,83,12,6.384615,1,0,0,-0.066667,0.016667
4,60,11,5.0,3,0,0,0.0,0.0


In [15]:
cv = CountVectorizer(min_df=0.0, max_df=1.0, ngram_range=(1, 1))
X_traincv = cv.fit_transform(X_train['clean headline']).toarray()
X_traincv = pd.DataFrame(X_traincv, columns=cv.get_feature_names_out())

X_testcv = cv.transform(X_test['clean headline']).toarray()
X_testcv = pd.DataFrame(X_testcv, columns=cv.get_feature_names_out())
X_traincv.head()

Unnamed: 0,aaa,aaron,aarp,ab,abandon,abaya,abba,abbey,abbi,abc,...,zoo,zookeep,zooland,zoologist,zoom,zoroastrian,zsa,zucker,zuckerberg,zz
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [16]:
X_train_comb = pd.concat([X_train_metadata, X_traincv], axis=1)
X_test_comb = pd.concat([X_test_metadata, X_testcv], axis=1)

X_train_comb.head()

Unnamed: 0,char_count,word_count,word_density,punctuation_count,title_word_count,upper_case_word_count,Polarity,Subjectivity,aaa,aaron,...,zoo,zookeep,zooland,zoologist,zoom,zoroastrian,zsa,zucker,zuckerberg,zz
0,65,10,5.909091,4,0,0,0.35,0.55,0,0,...,0,0,0,0,0,0,0,0,0,0
1,28,4,5.6,0,0,0,0.0,0.0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,33,6,4.714286,0,0,0,0.0,0.0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,83,12,6.384615,1,0,0,-0.066667,0.016667,0,0,...,0,0,0,0,0,0,0,0,0,0
4,60,11,5.0,3,0,0,0.0,0.0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Building Logistic Model


Finally, this is the final prediction model.

In [18]:
lr = LogisticRegression(C=1, random_state=42, solver='liblinear')
lr.fit(X_train_comb, y_train)
predictions = lr.predict(X_test_comb)

### Model Evaluation


In [19]:
print(classification_report(y_test, predictions))
pd.DataFrame(confusion_matrix(y_test, predictions))

              precision    recall  f1-score   support

           0       0.87      0.90      0.88      4711
           1       0.88      0.84      0.86      4142

    accuracy                           0.87      8853
   macro avg       0.87      0.87      0.87      8853
weighted avg       0.87      0.87      0.87      8853



Unnamed: 0,0,1
0,4240,471
1,647,3495


Based off the F1-score, are model is pretty good at predicting sarcasm in headlines, although it is slightly better at finding non-sarcastic headlines than sarcastic headlines. 

## Submission With Testing Data

In [20]:
new_test_data = pd.read_csv("Sarcasm_Test_Dataset.csv")

In [21]:
norm_corpus = normalize_corpus(list(new_test_data['headline']))
new_test_data['char_count'] = new_test_data['headline'].apply(len)
new_test_data['word_count'] = new_test_data['headline'].apply(lambda x: len(x.split()))
new_test_data['word_density'] = new_test_data['char_count'] / (new_test_data['word_count']+1)
new_test_data['punctuation_count'] = new_test_data['headline'].apply(lambda x: len("".join(_ for _ in x if _ in string.punctuation))) 
new_test_data['title_word_count'] = new_test_data['headline'].apply(lambda x: len([wrd for wrd in x.split() if wrd.istitle()]))
new_test_data['upper_case_word_count'] = new_test_data['headline'].apply(lambda x: len([wrd for wrd in x.split() if wrd.isupper()]))

In [22]:
new_test_data.head()

Unnamed: 0,headline,char_count,word_count,word_density,punctuation_count,title_word_count,upper_case_word_count
0,area stand-up comedian questions the deal with...,65,9,6.5,2,0,0
1,dozens of glowing exit signs mercilessly taunt...,65,9,6.5,0,0,0
2,perfect response to heckler somewhere in prop ...,62,9,6.2,1,0,0
3,gop prays for ossoff lossoff,28,5,4.666667,0,0,0
4,trevor noah says the scary truth about trump's...,65,11,5.416667,1,0,0


In [23]:
new_test_data_snt_obj = new_test_data['headline'].apply(lambda row: textblob.TextBlob(row).sentiment)
new_test_data['Polarity'] = [obj.polarity for obj in new_test_data_snt_obj.values]
new_test_data['Subjectivity'] = [obj.subjectivity for obj in new_test_data_snt_obj.values]

In [24]:
new_test_data.head()

Unnamed: 0,headline,char_count,word_count,word_density,punctuation_count,title_word_count,upper_case_word_count,Polarity,Subjectivity
0,area stand-up comedian questions the deal with...,65,9,6.5,2,0,0,0.0,0.0
1,dozens of glowing exit signs mercilessly taunt...,65,9,6.5,0,0,0,-0.7,1.0
2,perfect response to heckler somewhere in prop ...,62,9,6.2,1,0,0,1.0,1.0
3,gop prays for ossoff lossoff,28,5,4.666667,0,0,0,0.0,0.0
4,trevor noah says the scary truth about trump's...,65,11,5.416667,1,0,0,0.0,0.8


In [25]:
new_test_data['clean headline'] = stp(new_test_data['headline'].values)

In [26]:
new_test_data.head()

Unnamed: 0,headline,char_count,word_count,word_density,punctuation_count,title_word_count,upper_case_word_count,Polarity,Subjectivity,clean headline
0,area stand-up comedian questions the deal with...,65,9,6.5,2,0,0,0.0,0.0,area stand comedian question deal drive thru w...
1,dozens of glowing exit signs mercilessly taunt...,65,9,6.5,0,0,0,-0.7,1.0,dozen glow exit sign mercilessli taunt multipl...
2,perfect response to heckler somewhere in prop ...,62,9,6.2,1,0,0,1.0,1.0,perfect respons heckler somewher prop comedian...
3,gop prays for ossoff lossoff,28,5,4.666667,0,0,0,0.0,0.0,gop pray ossoff lossoff
4,trevor noah says the scary truth about trump's...,65,11,5.416667,1,0,0,0.0,0.8,trevor noah say scari truth trump rumor love c...


In [27]:
new_test_metadata = new_test_data.drop(['headline', 'clean headline'], axis=1).reset_index(drop=True)

In [28]:
new_test_metadata.head()

Unnamed: 0,char_count,word_count,word_density,punctuation_count,title_word_count,upper_case_word_count,Polarity,Subjectivity
0,65,9,6.5,2,0,0,0.0,0.0
1,65,9,6.5,0,0,0,-0.7,1.0
2,62,9,6.2,1,0,0,1.0,1.0
3,28,5,4.666667,0,0,0,0.0,0.0
4,65,11,5.416667,1,0,0,0.0,0.8


In [29]:
new_test_data_cv = cv.transform(new_test_data['clean headline']).toarray()
new_test_data_cv = pd.DataFrame(new_test_data_cv, columns=cv.get_feature_names())



In [30]:
new_test_data_cv.head()

Unnamed: 0,aaa,aaron,aarp,ab,abandon,abaya,abba,abbey,abbi,abc,...,zoo,zookeep,zooland,zoologist,zoom,zoroastrian,zsa,zucker,zuckerberg,zz
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [31]:
new_test_data_comb = pd.concat([new_test_metadata, new_test_data_cv], axis=1)

In [32]:
new_test_data_comb.head()

Unnamed: 0,char_count,word_count,word_density,punctuation_count,title_word_count,upper_case_word_count,Polarity,Subjectivity,aaa,aaron,...,zoo,zookeep,zooland,zoologist,zoom,zoroastrian,zsa,zucker,zuckerberg,zz
0,65,9,6.5,2,0,0,0.0,0.0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,65,9,6.5,0,0,0,-0.7,1.0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,62,9,6.2,1,0,0,1.0,1.0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,28,5,4.666667,0,0,0,0.0,0.0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,65,11,5.416667,1,0,0,0.0,0.8,0,0,...,0,0,0,0,0,0,0,0,0,0


In [33]:
target = lr.predict(new_test_data_comb)

In [34]:
res = pd.DataFrame(target) 
res.index = new_test_data_comb.index 
res.columns = ["prediction"]
res.to_csv("sarcasm_prediction_results.csv", index = False)

Final model accuracy for the prediction test data was 87.60166275076811, which is around what I expected.