In [None]:
import pandas as pd
import CMUTweetTagger
from collections import OrderedDict, defaultdict, Counter
import pandas as pd
import csv
from nltk.tokenize import TweetTokenizer
from sklearn.feature_extraction.text import CountVectorizer
import re
import nltk
import numpy as np
from nltk.corpus import stopwords
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
import emot

In [None]:
#removing the stopwords
def remove_stopwords(text, is_lower_case=False):
    """Drop English stopwords from ``text`` and re-join what is left.

    The text is split with NLTK's ``TweetTokenizer``; every token is
    whitespace-stripped and then checked against NLTK's English stopword list.

    Args:
        text: The string to filter.
        is_lower_case: When true the tokens are compared to the stopword list
            exactly as they are. Otherwise each token is lower-cased for the
            comparison only; tokens that are kept retain their original casing.

    Returns:
        The surviving tokens joined by single spaces.
    """
    splitter = TweetTokenizer()
    english_stopwords = nltk.corpus.stopwords.words('english')
    pieces = [piece.strip() for piece in splitter.tokenize(text)]

    kept = []
    for piece in pieces:
        # Only the comparison key is case-folded, never the emitted token.
        probe = piece if is_lower_case else piece.lower()
        if probe not in english_stopwords:
            kept.append(piece)
    return ' '.join(kept)

In [None]:
def remove_url(sample):
    """Return ``sample`` with every ``http``-prefixed run of non-whitespace removed.

    Surrounding whitespace is left untouched, so a removed link in the middle
    of a sentence leaves two adjacent spaces behind.
    """
    link_pattern = re.compile(r"http\S+")
    return link_pattern.sub("", sample)

In [None]:
def clean_df(file):
    """Load a tab-separated tweet file and normalise its ``tweet`` column.

    The file is read with the column names ``id``, ``polarity`` and ``tweet``
    supplied explicitly. Exact duplicate rows are dropped, then each tweet has
    its stopwords removed, its URLs removed, and is lower-cased.

    Args:
        file: Path (or file-like object) accepted by ``pandas.read_csv``.

    Returns:
        A DataFrame with columns ``id``, ``polarity`` and ``tweet`` and a fresh
        0..n-1 index.
    """
    data = pd.read_csv(file, sep='\t', names=["id", "polarity", "tweet"])
    data = data.drop_duplicates()
    data['tweet'] = data['tweet'].apply(remove_stopwords)
    data['tweet'] = data['tweet'].apply(remove_url)
    # Write the lower-cased text back to "tweet". The previous code assigned it
    # to a misspelled "tweaet" column, leaving "tweet" in its original casing.
    data["tweet"] = data["tweet"].str.lower()
    data = data.reset_index(drop=True)
    return data

In [None]:
def get_emoji_dict(only_tweet_data):
    """Count how often each emoji reported by ``emot.emoji`` occurs.

    Args:
        only_tweet_data: Iterable of tweet strings.

    Returns:
        A ``defaultdict(float)`` mapping each entry of the ``'value'`` list
        returned by ``emot.emoji`` to the number of times it was seen.
    """
    tally = defaultdict(float)
    for text in only_tweet_data:
        found = emot.emoji(text)
        if len(found) == 0:
            # Nothing reported for this tweet; 'flag' is not consulted.
            continue
        if found['flag'] != False:
            for symbol in found['value']:
                tally[symbol] += 1
    return tally

In [None]:
def detect_emojis(tweet):
    """Tell whether ``tweet`` contains a character listed in ``emoji_data``.

    Each character's code point is formatted as lowercase hex with a ``0x``
    prefix and compared against the ``"Unicode codepoint"`` column of the
    module-level ``emoji_data`` table.

    Args:
        tweet: The text to scan.

    Returns:
        ``True`` if at least one character matches, otherwise ``False``.
    """
    if not tweet:
        # Nothing to scan; also avoids touching the table for empty input.
        return False
    # Build the lookup once per call. The earlier version rebuilt a list of
    # the whole column, and scanned it linearly, for every single character.
    known_codepoints = set(emoji_data["Unicode codepoint"])
    return any('0x{:x}'.format(ord(ch)) in known_codepoints for ch in tweet)

In [None]:
def include_emoji(tweet):
    """Build the single-entry feature dict flagging emoji presence in ``tweet``.

    Returns:
        ``{'include_emoji:': 1}`` when ``detect_emojis`` reports a match,
        otherwise ``{'include_emoji:': 0}``.
    """
    flag = 1 if detect_emojis(tweet) else 0
    return {'include_emoji:': flag}

In [None]:
def polarity(token):
    """Classify ``token`` by its sentiment score in the module-level ``emoj_dict``.

    ``emoj_dict`` maps an emoji to a list whose element at index 1 is the
    sentiment score.

    Args:
        token: The token to look up.

    Returns:
        ``'positive'`` for a score above zero, ``'negative'`` for a score below
        zero, and ``None`` when the token is unknown or its score is exactly 0.
    """
    # Use .get() rather than indexing: ``emoj_dict`` is a defaultdict, so
    # ``emoj_dict[token]`` would insert a new entry for every unknown token
    # looked up and grow the table without bound.
    entry = emoj_dict.get(token)
    score = entry[1] if isinstance(entry, list) else 0
    if score > 0:
        return 'positive'
    if score < 0:
        return 'negative'
    return None

In [None]:
def last_token(tweet, tokenizer):
    """Return the polarity feature of the final token of ``tweet``.

    Only the last token produced by ``tokenizer.tokenize`` is examined (after
    lower-casing); earlier tokens never influence the result.

    Args:
        tweet: The text to inspect.
        tokenizer: Object exposing ``tokenize(text)`` that returns a list of
            tokens.

    Returns:
        ``{'last_polarity': 1}`` if the last token is positive,
        ``{'last_polarity': -1}`` if it is negative, and
        ``{'last_polarity': 0}`` otherwise, including when the tweet yields no
        tokens at all.
    """
    tokens = tokenizer.tokenize(tweet)
    if not tokens:
        # Previously this fell off the end and returned None, which made
        # all_feats() fail when merging the feature dicts.
        return {'last_polarity': 0}

    # Look the token up once instead of once per comparison.
    label = polarity(tokens[-1].lower())
    if label == 'positive':
        return {'last_polarity': 1}
    if label == 'negative':
        return {'last_polarity': -1}
    return {'last_polarity': 0}

In [None]:
def all_feats(tweet, tokenizer):
    """Merge the per-tweet feature dicts into one dict.

    The emoji-presence feature comes first and the last-token polarity feature
    second, so on a key clash the polarity value would win.

    Args:
        tweet: The text to featurise.
        tokenizer: Tokenizer forwarded to ``last_token``.

    Returns:
        A new dict holding the entries of ``include_emoji(tweet)`` followed by
        those of ``last_token(tweet, tokenizer)``.
    """
    polarity_part = last_token(tweet, tokenizer)
    emoji_part = include_emoji(tweet)
    merged = dict(emoji_part)
    merged.update(polarity_part)
    return merged

In [None]:
def get_feature(only_tweet_data, tokenizer):
    """Turn a Series of tweets into a 2-D feature array.

    Args:
        only_tweet_data: pandas Series of tweet strings; its index is reused
            for the intermediate DataFrame.
        tokenizer: Tokenizer forwarded to ``all_feats``.

    Returns:
        A NumPy array with one row per tweet and one column per feature key
        returned by ``all_feats``; missing values are replaced by 0.
    """
    rows = []
    for text in only_tweet_data:
        rows.append(all_feats(text, tokenizer))
    frame = pd.DataFrame(rows, index=only_tweet_data.index)
    return frame.fillna(0).to_numpy()

In [None]:
# Build the emoji lookup table used by polarity() and detect_emojis():
# emoji character -> [unicode codepoint, sentiment score, unicode name].
# Unknown keys fall back to the defaultdict's float default (0.0).
emoj_dict = defaultdict(float)
emoji_data = pd.read_csv('./data/dataset/emoji_data_1.csv')

emoji_column = emoji_data["Emoji"]
codepoint_column = emoji_data["Unicode codepoint"]
score_column = emoji_data["Sentiment score [-1...+1]"]
name_column = emoji_data["Unicode name"]

for row in range(len(emoji_data)):
    emoj_dict[emoji_column[row]] = [
        codepoint_column[row],
        score_column[row],
        name_column[row],
    ]

In [None]:
# Build the training feature matrix and integer label vector.
tweet_tokenizer = TweetTokenizer()
train_data = clean_df("./data/dataset/final_train_data.csv")
only_tweet_train_data = train_data['tweet']
train_emo_feature = get_feature(only_tweet_train_data, tweet_tokenizer)

# Encode the sentiment strings as class ids. Fail loudly on any unexpected
# label: silently skipping it (as the earlier if/elif chain did) leaves the
# label vector shorter than, and misaligned with, the feature matrix.
label_to_id = {"positive": 2, "negative": 1, "neutral": 0}
unknown_train_labels = set(train_data.polarity) - set(label_to_id)
if unknown_train_labels:
    raise ValueError(
        "Unexpected polarity labels in training data: "
        + ", ".join(sorted(map(str, unknown_train_labels)))
    )
train_labels = np.array([label_to_id[label] for label in train_data.polarity])

# Fit the scaler on the training features only; the fitted object stays in
# `scaler` so that other data can be transformed with the same statistics.
scaler = StandardScaler()
train_emo_feature = scaler.fit_transform(train_emo_feature)
train_features = np.array(train_emo_feature)

print("train labels: ", train_labels)
print("train features:", train_features)
print("train labels shape: ", train_labels.shape)
print("train features shape:", train_features.shape)

In [None]:
# Build the development/test feature matrix and integer label vector.
tweet_tokenizer = TweetTokenizer()
data_path = input("Please type in the file path for developing/testing data:")
dev_data = clean_df(data_path)
only_tweet_dev_data = dev_data['tweet']
dev_Sentiment140_uni_feature = get_feature(only_tweet_dev_data, tweet_tokenizer)

# Encode the sentiment strings as class ids. Fail loudly on any unexpected
# label: silently skipping it (as the earlier if/elif chain did) leaves the
# label vector shorter than, and misaligned with, the feature matrix.
label_to_id = {"positive": 2, "negative": 1, "neutral": 0}
unknown_dev_labels = set(dev_data.polarity) - set(label_to_id)
if unknown_dev_labels:
    raise ValueError(
        "Unexpected polarity labels in dev/test data: "
        + ", ".join(sorted(map(str, unknown_dev_labels)))
    )
dev_labels = np.array([label_to_id[label] for label in dev_data.polarity])

# Scale with the statistics learned from the training features (`scaler` is
# fitted in the training cell above). Fitting a fresh StandardScaler on the
# dev/test data would put these features on a different scale from the one
# the model was trained on.
dev_Sentiment140_uni_feature = scaler.transform(dev_Sentiment140_uni_feature)
dev_features = np.array(dev_Sentiment140_uni_feature)

print("dev labels: ", dev_labels)
print("dev features:", dev_features)
print("dev_labels shape: ", dev_labels.shape)
print("dev_features shape:", dev_features.shape)