In [None]:
import pandas as pd
import CMUTweetTagger
from collections import OrderedDict, defaultdict, Counter
import pandas as pd
import csv
from nltk.tokenize import TweetTokenizer
from sklearn.feature_extraction.text import CountVectorizer
import re
import nltk
import numpy as np
from nltk.corpus import stopwords
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
import emot

In [None]:
#removing the stopwords
def remove_stopwords(text, is_lower_case=False):
    """Drop English stopwords from ``text``.

    The text is split with a ``TweetTokenizer``, every token is stripped of
    surrounding whitespace, and tokens found in NLTK's English stopword list
    are discarded.

    Args:
        text: The string to filter.
        is_lower_case: When true, tokens are compared to the stopword list
            verbatim; otherwise their lower-cased form is compared.

    Returns:
        The surviving tokens (in their original casing) joined by single
        spaces.
    """
    english_stopwords = nltk.corpus.stopwords.words('english')
    pieces = [piece.strip() for piece in TweetTokenizer().tokenize(text)]

    kept = []
    for piece in pieces:
        # Only fold case when the caller has not vouched for the input.
        probe = piece if is_lower_case else piece.lower()
        if probe not in english_stopwords:
            kept.append(piece)
    return ' '.join(kept)

In [None]:
def remove_url(sample):
    """Return ``sample`` with every ``http``-prefixed run of non-whitespace removed.

    Surrounding whitespace is left untouched, so a URL in the middle of a
    sentence leaves two adjacent spaces behind.
    """
    url_pattern = re.compile(r"http\S+")
    return url_pattern.sub("", sample)

In [None]:
def clean_df(file):
    """Load a tab-separated tweet file and return it cleaned.

    The file is read into the columns ``id``, ``polarity`` and ``tweet``.
    Exact duplicate rows are dropped, then every tweet has its stopwords
    removed (``remove_stopwords``), its URLs stripped (``remove_url``) and is
    finally lower-cased.

    Args:
        file: Path or buffer accepted by ``pandas.read_csv``.

    Returns:
        A DataFrame with the columns ``id``, ``polarity`` and ``tweet`` and a
        fresh 0..n-1 index.
    """
    data = pd.read_csv(file, sep='\t', names=["id", "polarity", "tweet"])
    data = data.drop_duplicates()
    data['tweet'] = data['tweet'].apply(remove_stopwords)
    data['tweet'] = data['tweet'].apply(remove_url)
    # Write the lower-cased text back to "tweet". This used to be assigned to
    # a misspelled "tweaet" column, which left "tweet" in its original casing
    # and added a stray column to the result.
    data['tweet'] = data['tweet'].str.lower()
    data = data.reset_index(drop=True)
    return data

In [None]:
def polarity(token):
    """Classify ``token`` as ``'positive'``, ``'negative'`` or ``'none'``.

    The score is taken from ``emoj_dict`` (entries are lists with at least
    three items, the score at index 1) and then from ``emoticon_dict``
    (entries are lists with at least two items, the score at index 1). When a
    token is present in both tables the emoticon score wins.

    For emoticons, the score is also compared with the score of the emoji the
    emoticon maps to; a mismatch is reported on stdout with the ``"wrong"``
    prefix.

    Args:
        token: The (already normalised) token to look up.

    Returns:
        ``'positive'`` for a score above zero, ``'negative'`` for a score
        below zero, ``'none'`` otherwise (including unknown tokens).
    """
    score = 0

    # Use .get() rather than indexing: the tables are defaultdicts, so
    # indexing would insert a new entry for every unknown token probed.
    emoji_entry = emoj_dict.get(token)
    if isinstance(emoji_entry, list) and len(emoji_entry) >= 3:
        score = emoji_entry[1]

    emoticon_entry = emoticon_dict.get(token)
    if isinstance(emoticon_entry, list) and len(emoticon_entry) >= 2:
        score = emoticon_entry[1]
        # Consistency check against the mapped emoji. Skip it when the emoji
        # is missing from the emoji table; indexing the default value used to
        # raise TypeError in that case.
        mapped_entry = emoj_dict.get(emoticon_entry[0])
        if isinstance(mapped_entry, list) and len(mapped_entry) >= 2:
            if score != mapped_entry[1]:
                print("wrong", emoticon_entry[0])

    if score > 0:
        return 'positive'
    if score < 0:
        return 'negative'
    return 'none'

In [None]:
def check_score(token):
    """Return the sentiment score recorded for ``token``.

    ``emoj_dict`` is consulted first (entries are lists with at least three
    items), then ``emoticon_dict`` (entries are lists with at least two
    items); in both cases the score is the item at index 1.

    Args:
        token: The (already normalised) token to look up.

    Returns:
        The stored score, or ``0`` when the token is in neither table.
    """
    # .get() avoids the defaultdict side effect of inserting a new entry for
    # every unknown token that is looked up.
    emoji_entry = emoj_dict.get(token)
    if isinstance(emoji_entry, list) and len(emoji_entry) >= 3:
        return emoji_entry[1]

    emoticon_entry = emoticon_dict.get(token)
    if isinstance(emoticon_entry, list) and len(emoticon_entry) >= 2:
        return emoticon_entry[1]

    return 0

In [None]:
def count_tokens_with_polarity(tweet, tokenizer):
    """Count the positive and negative tokens of ``tweet``.

    Args:
        tweet: Text to analyse.
        tokenizer: Object exposing ``tokenize(text)``.

    Returns:
        A dict mapping ``'positive'`` and/or ``'negative'`` to the number of
        lower-cased tokens that ``polarity`` assigns that label. A label with
        no matching token is absent from the dict.
    """
    tally = Counter()
    for raw_token in tokenizer.tokenize(tweet):
        label = polarity(raw_token.lower())
        if label in ('positive', 'negative'):
            tally[label] += 1
    return dict(tally)

In [None]:
def polarity_sum(tweet, tokenizer):
    """Sum the sentiment scores of the polar tokens in ``tweet``.

    Args:
        tweet: Text to analyse.
        tokenizer: Object exposing ``tokenize(text)``.

    Returns:
        A dict with ``'pos_sum'`` (sum of ``check_score`` over tokens that
        ``polarity`` labels positive) and ``'total_sum'`` (that sum plus the
        sum over tokens labelled negative). Both are ``0`` when the tweet has
        no polar token.
    """
    positive_scores = []
    negative_scores = []
    for raw_token in tokenizer.tokenize(tweet):
        token = raw_token.lower()
        # Classify once per token; calling polarity() in both branches
        # repeated the lookups (and any diagnostic output) for every
        # non-positive token.
        label = polarity(token)
        if label == 'positive':
            positive_scores.append(check_score(token))
        elif label == 'negative':
            negative_scores.append(check_score(token))

    pos_sum = sum(positive_scores)
    return {'pos_sum': pos_sum, 'total_sum': pos_sum + sum(negative_scores)}

In [None]:
def include_emo(tweet, tokenizer):
    """Flag whether ``tweet`` contains at least one polar token.

    Args:
        tweet: Text to analyse.
        tokenizer: Object exposing ``tokenize(text)``.

    Returns:
        ``{'include_emo:': 1}`` when ``polarity`` labels any lower-cased token
        positive or negative, ``{'include_emo:': 0}`` otherwise.
    """
    # Every token is classified (no short-circuit), matching a full scan.
    labels = [polarity(raw_token.lower()) for raw_token in tokenizer.tokenize(tweet)]
    has_polar_token = any(label in ('positive', 'negative') for label in labels)
    return {'include_emo:': 1 if has_polar_token else 0}

In [None]:
def max_token(tweet, tokenizer):
    """Find the strongest positive and strongest negative token score.

    Args:
        tweet: Text to analyse.
        tokenizer: Object exposing ``tokenize(text)``.

    Returns:
        A dict with ``'pos_max'`` (largest ``check_score`` among tokens that
        ``polarity`` labels positive) and ``'neg_max'`` (smallest
        ``check_score`` among tokens labelled negative). Each value is ``0``
        when the tweet has no token of that polarity.
    """
    positive_scores = []
    negative_scores = []
    for raw_token in tokenizer.tokenize(tweet):
        token = raw_token.lower()
        # Classify once per token instead of once per branch.
        label = polarity(token)
        if label == 'positive':
            positive_scores.append(check_score(token))
        elif label == 'negative':
            negative_scores.append(check_score(token))

    # default=0 covers the "no such token" case without try/except.
    return {
        'pos_max': max(positive_scores, default=0),
        'neg_max': min(negative_scores, default=0),
    }

In [None]:
def last_token(tweet, tokenizer):
    """Report the polarity of the final token of ``tweet``.

    Only the very last token is inspected. The previous implementation was
    written as a loop over the reversed tokens but always returned during its
    first iteration, so this is the same result stated directly.

    NOTE(review): if the intent was "the last token that carries a polarity",
    the scan would have to continue past neutral tokens - confirm before
    changing, since that alters the feature.

    Args:
        tweet: Text to analyse.
        tokenizer: Object exposing ``tokenize(text)`` and returning a
            sequence.

    Returns:
        ``{'last_polarity': 1}`` when the lower-cased last token is positive,
        ``{'last_polarity': -1}`` when it is negative, and
        ``{'last_polarity': 0}`` otherwise (including an empty tweet).
    """
    tokens = tokenizer.tokenize(tweet)
    if not tokens:
        return {'last_polarity': 0}

    # Classify once; the label decides the sign.
    label = polarity(tokens[-1].lower())
    if label == 'positive':
        return {'last_polarity': 1}
    if label == 'negative':
        return {'last_polarity': -1}
    return {'last_polarity': 0}

In [None]:
def all_feats(tweet, tokenizer):
    """Collect every emoji/emoticon feature of ``tweet`` into one dict.

    The extractors run in a fixed order - polarity counts, score sums,
    extreme scores, last-token polarity, presence flag - and their result
    dicts are merged in that order, so a later extractor overwrites an
    earlier one on a key clash.

    Args:
        tweet: Text to analyse.
        tokenizer: Object exposing ``tokenize(text)``.

    Returns:
        A single dict with the union of all extractor outputs.
    """
    extractors = (
        count_tokens_with_polarity,
        polarity_sum,
        max_token,
        last_token,
        include_emo,
    )
    merged = {}
    for extract in extractors:
        merged.update(extract(tweet, tokenizer))
    return merged

In [None]:
def get_feature(only_tweet_data, tokenizer):
    """Build the emoji/emoticon feature matrix for a series of tweets.

    Each tweet is converted with ``all_feats``; the resulting dicts become
    the rows of a DataFrame whose missing cells are filled with ``0``.

    The column layout is fixed rather than inferred from the data. The
    ``positive`` / ``negative`` count keys only exist for tweets that contain
    such tokens, so letting pandas infer the columns made both the number
    and the order of columns depend on the data set - feature matrices for
    different splits could then be misaligned or differ in width.

    Args:
        only_tweet_data: pandas Series of tweet texts.
        tokenizer: Object exposing ``tokenize(text)``.

    Returns:
        A 2-D NumPy array with one row per tweet. Columns are, in order:
        ``positive``, ``negative``, ``pos_sum``, ``total_sum``, ``pos_max``,
        ``neg_max``, ``last_polarity``, ``include_emo:``, followed by any
        other key produced by ``all_feats`` in sorted order.
    """
    expected_columns = [
        'positive', 'negative',
        'pos_sum', 'total_sum',
        'pos_max', 'neg_max',
        'last_polarity', 'include_emo:',
    ]
    emo_counts = [all_feats(tweet, tokenizer) for tweet in only_tweet_data]
    emo_df = pd.DataFrame(emo_counts, index=only_tweet_data.index)

    # Keep unexpected keys (deterministically ordered) instead of dropping them.
    extra_columns = sorted(col for col in emo_df.columns if col not in expected_columns)
    emo_df = emo_df.reindex(columns=expected_columns + extra_columns)

    emo_df = emo_df.fillna(0)
    return emo_df.to_numpy()

In [None]:
# Emoji lookup table: emoji character -> [unicode codepoint, sentiment score, unicode name].
# A defaultdict(float) is used so that looking up an unknown emoji yields 0.0
# instead of raising KeyError.
emoj_dict = defaultdict(float)
emoji_data = pd.read_csv('./data/dataset/emoji_data_1.csv')

emoji_rows = zip(
    emoji_data["Emoji"].to_numpy(),
    emoji_data["Unicode codepoint"].to_numpy(),
    emoji_data["Sentiment score [-1...+1]"].to_numpy(),
    emoji_data["Unicode name"].to_numpy(),
)
for glyph, codepoint, glyph_score, glyph_name in emoji_rows:
    emoj_dict[glyph] = [codepoint, glyph_score, glyph_name]

In [None]:
# Emoticon lookup table: emoticon text -> [mapped emoji, sentiment score].
# A defaultdict(float) is used so that looking up an unknown emoticon yields
# 0.0 instead of raising KeyError.
emoticon_dict = defaultdict(float)
emoticon_data = pd.read_csv('./data/dataset/emoticon-emoji-mapping.csv')

# Discard the unnamed filler columns when the CSV has them; errors="ignore"
# keeps the cell runnable if the file is ever exported without them.
emoticon_data = emoticon_data.drop(columns=['Unnamed: 3', 'Unnamed: 4'], errors='ignore')

# Strip the whole column in one assignment. The previous per-row chained
# assignment (emoticon_data["emoticon"][i] = ...) triggers
# SettingWithCopyWarning and does not update the frame at all under pandas
# Copy-on-Write, which would leave whitespace-padded keys in the table.
emoticon_data["emoticon"] = emoticon_data["emoticon"].str.strip()

emoticon_rows = zip(
    emoticon_data["emoticon"].to_numpy(),
    emoticon_data["emoji"].to_numpy(),
    emoticon_data["sentiment score"].to_numpy(),
)
for emoticon, emoji, sentiment_score in emoticon_rows:
    emoticon_dict[emoticon] = [emoji, sentiment_score]

In [None]:
# --- Training split: load, extract features, encode labels, standardise ---
tweet_tokenizer = TweetTokenizer()
train_data = clean_df("./data/dataset/2013_train_sarcasm_data.csv")
only_tweet_train_data = train_data['tweet']
train_emo_feature = get_feature(only_tweet_train_data, tweet_tokenizer)

# Encode the polarity strings as integers; rows carrying any other label
# contribute no entry to the label array.
label_codes = {"positive": 2, "negative": 1, "neutral": 0}
train_labels = train_data.polarity
result = [label_codes[x] for x in train_labels if x in label_codes]
train_labels = np.array(result)

# Fit the scaler on the training features and standardise them.
scaler = StandardScaler()
train_emo_feature = scaler.fit_transform(train_emo_feature)
train_features = np.array(train_emo_feature)

for caption, value in (
    ("train labels: ", train_labels),
    ("train features:", train_features),
    ("train labels shape: ", train_labels.shape),
    ("train features shape:", train_features.shape),
):
    print(caption, value)

In [None]:
# Dump the standardised training vectors to a text file. The context manager
# guarantees the handle is closed even if a write fails part-way through.
# NOTE(review): rows are written back-to-back with no separator between
# them, as before - confirm whether a newline per row was intended.
with open("emoji_emoticon_8_features_train_vector.txt", "w+") as file:
    for i in train_emo_feature:
        content = str(i)
        file.write(content)

# Test

In [None]:
# --- Test split: load, extract features, encode labels, standardise ---
tweet_tokenizer = TweetTokenizer()
dev_data = clean_df("./data/dataset/2013_test_sarcasm_data.csv")
only_tweet_dev_data = dev_data['tweet']
dev_Sentiment140_uni_feature = get_feature(only_tweet_dev_data, tweet_tokenizer)

# Encode the polarity strings as integers; rows carrying any other label
# contribute no entry to the label array.
dev_labels = dev_data.polarity
result = []
for x in dev_labels:
    if x == "positive":
        result.append(2)
    elif x == "negative":
        result.append(1)
    elif x == "neutral":
        result.append(0)
dev_labels = np.array(result)

# Standardise with the scaler that was fitted on the training features in the
# training cell. Fitting a fresh StandardScaler on the evaluation data (as
# this cell used to do) puts train and test features on different scales and
# leaks evaluation-set statistics into the preprocessing.
dev_Sentiment140_uni_feature = scaler.transform(dev_Sentiment140_uni_feature)
dev_features = np.array(dev_Sentiment140_uni_feature)

print("dev labels: ", dev_labels) 
print("dev features:", dev_features) 
print("dev_labels shape: ", dev_labels.shape) 
print("dev_features shape:", dev_features.shape) 

In [None]:
# Dump the standardised test vectors to a text file. The context manager
# guarantees the handle is closed even if a write fails part-way through.
# NOTE(review): rows are written back-to-back with no separator between
# them, as before - confirm whether a newline per row was intended.
with open("emoji_emoticon_8_features_test_vector.txt", "w+") as file:
    for i in dev_Sentiment140_uni_feature:
        content = str(i)
        file.write(content)

In [None]:

from sklearn.model_selection import GridSearchCV

# Linear-SVC experiments on the current evaluation split. Each entry is
# (C, per-label sample weights or None for an unweighted fit, report caption).
# Labels: 0 = neutral, 1 = negative, 2 = positive.
report_banner = "A Classification Report showing the per-class Precision, Recall and F1-score \n\n"
class_names = ['neutral', 'negative', 'positive']
negative_only = {1: 3.14}
negative_and_positive = {1: 3.14, 2: 1.25}

svc_experiments = [
    (0.005, negative_only, "C=0.005 \nNegative weight = 3.14 \n\n"),
    (1, negative_only, "C=1 \nNegative weight = 3.14 \n\n"),
    (0.005, negative_and_positive, "C=0.005 \nNegative weight = 3.14 \nPositive weight = 1.25 \n\n"),
    (1, negative_and_positive, "C=1 \nNegative weight = 3.14 \nPositive weight = 1.25 \n\n"),
    (1, None, "C=1 \nNo weight"),
    (0.005, None, "C=0.005 \nNo weight"),
]

for penalty, label_weights, caption in svc_experiments:
    clf = SVC(kernel='linear', C=penalty, probability=True)
    if label_weights is None:
        clf.fit(train_features, train_labels)
    else:
        # Labels without an explicit weight default to 1.
        sample_weight = np.array([label_weights.get(int(label), 1) for label in train_labels])
        clf.fit(train_features, train_labels, sample_weight=sample_weight)
    predictions = clf.predict(dev_features)
    print(report_banner + caption,
          metrics.classification_report(dev_labels, predictions, target_names=class_names))

# Cross-validated grid search over C and kernel on the training split; the
# refitted best estimator is then scored on the evaluation split.
param_grid = {
    'C': [0.005, 0.1, 0.5, 1],
    'kernel': ['linear', 'rbf'],
}
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=3)
grid.fit(train_features, train_labels)

print(grid.best_params_)
print(grid.best_estimator_)

grid_predictions = grid.predict(dev_features)
print(classification_report(dev_labels, grid_predictions))

In [None]:
# --- Dev split: load, extract features, encode labels, standardise ---
tweet_tokenizer = TweetTokenizer()
dev_data = clean_df("./data/dataset/2013_dev_sarcasm_data.csv")
only_tweet_dev_data = dev_data['tweet']
dev_Sentiment140_uni_feature = get_feature(only_tweet_dev_data, tweet_tokenizer)

# Encode the polarity strings as integers; rows carrying any other label
# contribute no entry to the label array.
dev_labels = dev_data.polarity
result = []
for x in dev_labels:
    if x == "positive":
        result.append(2)
    elif x == "negative":
        result.append(1)
    elif x == "neutral":
        result.append(0)
dev_labels = np.array(result)

# Standardise with statistics learned from the TRAINING features only.
# Fitting a fresh StandardScaler on the evaluation data (as this cell used to
# do) puts train and dev features on different scales and leaks
# evaluation-set statistics into the preprocessing. The module-level `scaler`
# is not reused here because an earlier evaluation cell may have re-fitted it
# on other data; the training statistics are re-derived from the raw training
# tweets instead.
train_scaler = StandardScaler().fit(get_feature(only_tweet_train_data, tweet_tokenizer))
dev_Sentiment140_uni_feature = train_scaler.transform(dev_Sentiment140_uni_feature)
dev_features = np.array(dev_Sentiment140_uni_feature)

print("dev labels: ", dev_labels) 
print("dev features:", dev_features) 
print("dev_labels shape: ", dev_labels.shape) 
print("dev_features shape:", dev_features.shape) 

In [None]:

from sklearn.model_selection import GridSearchCV

# Linear-SVC experiments on the current evaluation split. Each entry is
# (C, per-label sample weights or None for an unweighted fit, report caption).
# Labels: 0 = neutral, 1 = negative, 2 = positive.
report_banner = "A Classification Report showing the per-class Precision, Recall and F1-score \n\n"
class_names = ['neutral', 'negative', 'positive']
negative_only = {1: 3.14}
negative_and_positive = {1: 3.14, 2: 1.25}

svc_experiments = [
    (0.005, negative_only, "C=0.005 \nNegative weight = 3.14 \n\n"),
    (1, negative_only, "C=1 \nNegative weight = 3.14 \n\n"),
    (0.005, negative_and_positive, "C=0.005 \nNegative weight = 3.14 \nPositive weight = 1.25 \n\n"),
    (1, negative_and_positive, "C=1 \nNegative weight = 3.14 \nPositive weight = 1.25 \n\n"),
    (1, None, "C=1 \nNo weight"),
    (0.005, None, "C=0.005 \nNo weight"),
]

for penalty, label_weights, caption in svc_experiments:
    clf = SVC(kernel='linear', C=penalty, probability=True)
    if label_weights is None:
        clf.fit(train_features, train_labels)
    else:
        # Labels without an explicit weight default to 1.
        sample_weight = np.array([label_weights.get(int(label), 1) for label in train_labels])
        clf.fit(train_features, train_labels, sample_weight=sample_weight)
    predictions = clf.predict(dev_features)
    print(report_banner + caption,
          metrics.classification_report(dev_labels, predictions, target_names=class_names))

# Cross-validated grid search over C and kernel on the training split; the
# refitted best estimator is then scored on the evaluation split.
param_grid = {
    'C': [0.005, 0.1, 0.5, 1],
    'kernel': ['linear', 'rbf'],
}
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=3)
grid.fit(train_features, train_labels)

print(grid.best_params_)
print(grid.best_estimator_)

grid_predictions = grid.predict(dev_features)
print(classification_report(dev_labels, grid_predictions))