In [1]:
import CMUTweetTagger
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import TweetTokenizer
from sklearn.svm import SVC
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score, recall_score, precision_score, f1_score

In [2]:
#removing the stopwords
def remove_stopwords(text, is_lower_case=False):
    tweet_tokenizer = TweetTokenizer()
    stopword_list=nltk.corpus.stopwords.words('english')
    tokens = tweet_tokenizer.tokenize(text)
    tokens = [token.strip() for token in tokens]
    if is_lower_case:
        filtered_tokens = [token for token in tokens if token not in stopword_list]
    else:
        filtered_tokens = [token for token in tokens if token.lower() not in stopword_list]
    filtered_text = ' '.join(filtered_tokens)    
    return filtered_text

In [3]:
def clean_df(file):
    data = pd.read_csv(file, sep='\t', names=["id", "polarity", "tweet"])
    data = data.drop_duplicates()
    data['tweet']=data['tweet'].apply(remove_stopwords)
    data["tweet"] = data["tweet"].str.lower() # lowercase
    data = data.reset_index(drop=True)
    return data

In [4]:
def get_cmu_tagger(only_tweet_data):
    return CMUTweetTagger.runtagger_parse(only_tweet_data.values, run_tagger_cmd="java -XX:ParallelGCThreads=2 -Xmx500m -jar ./ark-tweet-nlp-0.3.2/ark-tweet-nlp-0.3.2.jar")

In [6]:
def get_adj_word_dict(only_tweet_data):
    pos_count = get_cmu_tagger(only_tweet_data)
    result = []
    adj_words = dict()
    for sentence in pos_count:
        for word_tuple in sentence:
            if word_tuple[1] == 'A': # 'R' is adverb
                if word_tuple[0] in adj_words.keys():
                    adj_words[word_tuple[0]] += 1
                else:
                    adj_words[word_tuple[0]] = 1
    return adj_words

In [7]:
def get_adj_feature_array(only_tweet_data):
    adj_word_dict = get_adj_word_dict(only_tweet_data)
    prune_features = []
    for key, value in adj_word_dict.items():
        if (value >= 0): 
            prune_features.append(key)
    return prune_features

In [8]:
def get_adv_word_dict(only_tweet_data):
    pos_count = get_cmu_tagger(only_tweet_data)
    result = []
    adv_words = dict()
    for sentence in pos_count:
        for word_tuple in sentence:
            if word_tuple[1] == 'R': # 'R' is adverb
                if word_tuple[0] in adv_words.keys():
                    adv_words[word_tuple[0]] += 1
                else:
                    adv_words[word_tuple[0]] = 1
    return adv_words

In [9]:
def get_adv_feature_array(only_tweet_data):
    adv_word_dict = get_adv_word_dict(only_tweet_data)
    prune_features = []
    for key, value in adv_word_dict.items():
        if (value >= 0): 
            prune_features.append(key)
    return prune_features

In [10]:
def get_feature_array(only_tweet_data):
    return get_adj_feature_array(only_tweet_data) + get_adv_feature_array(only_tweet_data)

In [11]:
def get_feature(only_tweet_data, features_array):
    tweet_tokenizer = TweetTokenizer()
    feature_result = []
    for tweet in only_tweet_data:
        tokens = tweet_tokenizer.tokenize(tweet)
        
        token_dict = dict()
        tweet_arr = []
        for token in tokens:
            if token in token_dict.keys():
                token_dict[token] += 1
            else:
                token_dict[token] = 1

        for feature in features_array:
            if feature in token_dict.keys():
                tweet_arr.append(token_dict[feature])
            else:
                tweet_arr.append(0)
        feature_result.append(tweet_arr)
    return feature_result    

In [12]:
train_data = clean_df("./data/dataset/train_without_sarcasm")
only_tweet_train_data = train_data['tweet']
features_array = get_feature_array(only_tweet_train_data)
train_adj_feature = get_feature(only_tweet_train_data, features_array)

train_labels = train_data.polarity
result = []
for x in train_labels:
    if x == "positive":
        result.append(2)
    elif x == "negative":
        result.append(1)
    elif x == "neutral":
        result.append(0)
train_labels = np.array(result)

scaler = StandardScaler()
train_adj_feature = scaler.fit_transform(train_adj_feature)

train_features = np.array(train_adj_feature)


print("train labels: ", train_labels) 
print("train features:", train_features) 
print("train labels shape: ", train_labels.shape) 
print("train features shape:", train_features.shape)

train labels:  [0 1 2 ... 0 2 1]
train features: [[-8.03556375e-03 -8.03556375e-03 -1.60726843e-02 ... -8.03556375e-03
  -8.03556375e-03 -8.03556375e-03]
 [ 1.24446776e+02  1.24446776e+02  6.22173609e+01 ... -8.03556375e-03
  -8.03556375e-03 -8.03556375e-03]
 [-8.03556375e-03 -8.03556375e-03 -1.60726843e-02 ... -8.03556375e-03
  -8.03556375e-03 -8.03556375e-03]
 ...
 [-8.03556375e-03 -8.03556375e-03 -1.60726843e-02 ... -8.03556375e-03
  -8.03556375e-03 -8.03556375e-03]
 [-8.03556375e-03 -8.03556375e-03 -1.60726843e-02 ... -8.03556375e-03
  -8.03556375e-03 -8.03556375e-03]
 [-8.03556375e-03 -8.03556375e-03 -1.60726843e-02 ... -8.03556375e-03
  -8.03556375e-03 -8.03556375e-03]]
train labels shape:  (15488,)
train features shape: (15488, 3249)


In [14]:
dev_data = clean_df("./data/dataset/twitter-2013test-A.txt")
only_tweet_dev_data = dev_data['tweet']
features_array = get_feature_array(only_tweet_train_data)
dev_adj_feature = get_feature(only_tweet_dev_data, features_array)

dev_labels = dev_data.polarity
result = []
for x in dev_labels:
    if x == "positive":
        result.append(2)
    elif x == "negative":
        result.append(1)
    elif x == "neutral":
        result.append(0)
dev_labels = np.array(result)

scaler = StandardScaler()
dev_adj_feature = scaler.fit_transform(dev_adj_feature)

dev_features = np.array(dev_adj_feature)

print("dev labels: ", dev_labels) 
print("dev features:", dev_features) 
print("dev_labels shape: ", dev_labels.shape) 
print("dev_features shape:", dev_features.shape) 

dev labels:  [2 2 0 ... 1 2 0]
dev features: [[-0.06733396 -0.07902327 -0.05045046 ...  0.          0.
  -0.08366348]
 [-0.06733396 -0.07902327 -0.05045046 ...  0.          0.
  -0.08366348]
 [-0.06733396 -0.07902327 -0.05045046 ...  0.          0.
  -0.08366348]
 ...
 [-0.06733396 -0.07902327 -0.05045046 ...  0.          0.
  -0.08366348]
 [-0.06733396 -0.07902327 -0.05045046 ...  0.          0.
  -0.08366348]
 [-0.06733396 -0.07902327 -0.05045046 ...  0.          0.
  -0.08366348]]
dev_labels shape:  (3545,)
dev_features shape: (3545, 2113)


In [15]:
file = open("no_feature_reduc_adg_adv_vector.txt", "w+")
for i in dev_adj_feature:
    content = str(i)
    file.write(content)
file.close()

In [16]:
dev_data = clean_df("./data/dataset/twitter-2013dev-A.txt")
only_tweet_dev_data = dev_data['tweet']
features_array = get_feature_array(only_tweet_train_data)
dev_adj_feature = get_feature(only_tweet_dev_data, features_array)

dev_labels = dev_data.polarity
result = []
for x in dev_labels:
    if x == "positive":
        result.append(2)
    elif x == "negative":
        result.append(1)
    elif x == "neutral":
        result.append(0)
dev_labels = np.array(result)

scaler = StandardScaler()
dev_adj_feature = scaler.fit_transform(dev_adj_feature)

dev_features = np.array(dev_adj_feature)

print("dev labels: ", dev_labels) 
print("dev features:", dev_features) 
print("dev_labels shape: ", dev_labels.shape) 
print("dev_features shape:", dev_features.shape) 

dev labels:  [0 0 1 ... 0 0 0]
dev features: [[-0.07405714 -0.0652725  -0.02462576 ...  0.          0.
  -0.07124705]
 [-0.07405714 -0.0652725  -0.02462576 ...  0.          0.
  -0.07124705]
 [-0.07405714 -0.0652725  -0.02462576 ...  0.          0.
  -0.07124705]
 ...
 [-0.07405714 -0.0652725  -0.02462576 ...  0.          0.
  -0.07124705]
 [-0.07405714 -0.0652725  -0.02462576 ...  0.          0.
  -0.07124705]
 [-0.07405714 -0.0652725  -0.02462576 ...  0.          0.
  -0.07124705]]
dev_labels shape:  (1650,)
dev_features shape: (1650, 2113)


In [18]:

clf = SVC(kernel='linear', C=0.005, probability=True)


clf.fit(train_features, train_labels)

predictions = clf.predict(dev_features)

print("A Classification Report showing the per-class Precision, Recall and F1-score \n\nC=0.005 \nNegative weight = 3.14 \n\n", metrics.classification_report(dev_labels, predictions,target_names=['neutral','negative','positive']))



clf = SVC(kernel='linear', C=1, probability=True)

clf.fit(train_features, train_labels)

predictions = clf.predict(dev_features)

print("A Classification Report showing the per-class Precision, Recall and F1-score \n\nC=1 \nNegative weight = 3.14 \n\n", metrics.classification_report(dev_labels, predictions,target_names=['neutral','negative','positive']))






clf = SVC(kernel='linear', C=1, probability=True)

sample_weight = np.array([3.14 if i == 1 else 1 for i in train_labels])

clf.fit(train_features, train_labels, sample_weight = sample_weight)
# clf.fit(train_features, train_labels)

predictions = clf.predict(dev_features)

print("A Classification Report showing the per-class Precision, Recall and F1-score \n\nC=1 \nNegative weight = 3.14 \n\n", metrics.classification_report(dev_labels, predictions,target_names=['neutral','negative','positive']))






A Classification Report showing the per-class Precision, Recall and F1-score 

C=0.005 
Negative weight = 3.14 

               precision    recall  f1-score   support

     neutral       0.58      0.84      0.68       737
    negative       0.61      0.23      0.33       340
    positive       0.63      0.49      0.55       573

    accuracy                           0.59      1650
   macro avg       0.60      0.52      0.52      1650
weighted avg       0.60      0.59      0.56      1650

A Classification Report showing the per-class Precision, Recall and F1-score 

C=1 
Negative weight = 3.14 

               precision    recall  f1-score   support

     neutral       0.58      0.84      0.69       737
    negative       0.60      0.26      0.36       340
    positive       0.63      0.48      0.55       573

    accuracy                           0.60      1650
   macro avg       0.60      0.53      0.53      1650
weighted avg       0.60      0.60      0.57      1650

A Classificati

# Test

In [32]:
dev_data = clean_df("./data/dataset/twitter-2013test-A.txt")
only_tweet_dev_data = dev_data['tweet']
features_array = get_feature_array(only_tweet_train_data)
dev_adj_feature = get_feature(only_tweet_dev_data, features_array)

dev_labels = dev_data.polarity
result = []
for x in dev_labels:
    if x == "positive":
        result.append(2)
    elif x == "negative":
        result.append(1)
    elif x == "neutral":
        result.append(0)
dev_labels = np.array(result)

scaler = StandardScaler()
dev_adj_feature = scaler.fit_transform(dev_adj_feature)

dev_features = np.array(dev_adj_feature)

print("dev labels: ", dev_labels) 
print("dev features:", dev_features) 
print("dev_labels shape: ", dev_labels.shape) 
print("dev_features shape:", dev_features.shape) 

dev labels:  [2 2 0 ... 1 2 0]
dev features: [[-0.06733396 -0.07902327 -0.05045046 ... -0.02375907  0.
  -0.01679783]
 [-0.06733396 -0.07902327 -0.05045046 ... -0.02375907  0.
  -0.01679783]
 [-0.06733396 -0.07902327 -0.05045046 ... -0.02375907  0.
  -0.01679783]
 ...
 [-0.06733396 -0.07902327 -0.05045046 ... -0.02375907  0.
  -0.01679783]
 [-0.06733396 -0.07902327 -0.05045046 ... -0.02375907  0.
  -0.01679783]
 [-0.06733396 -0.07902327 -0.05045046 ... -0.02375907  0.
  -0.01679783]]
dev_labels shape:  (3545,)
dev_features shape: (3545, 696)


In [19]:

clf = SVC(kernel='linear', C=0.005, probability=True)


clf.fit(train_features, train_labels)

predictions = clf.predict(dev_features)

print("A Classification Report showing the per-class Precision, Recall and F1-score \n\nC=0.005 \nNegative weight = 3.14 \n\n", metrics.classification_report(dev_labels, predictions,target_names=['neutral','negative','positive']))



clf = SVC(kernel='linear', C=1, probability=True)

clf.fit(train_features, train_labels)

predictions = clf.predict(dev_features)

print("A Classification Report showing the per-class Precision, Recall and F1-score \n\nC=1 \nNegative weight = 3.14 \n\n", metrics.classification_report(dev_labels, predictions,target_names=['neutral','negative','positive']))






clf = SVC(kernel='linear', C=1, probability=True)

sample_weight = np.array([3.14 if i == 1 else 1 for i in train_labels])

clf.fit(train_features, train_labels, sample_weight = sample_weight)
# clf.fit(train_features, train_labels)

predictions = clf.predict(dev_features)

print("A Classification Report showing the per-class Precision, Recall and F1-score \n\nC=1 \nNegative weight = 3.14 \n\n", metrics.classification_report(dev_labels, predictions,target_names=['neutral','negative','positive']))






A Classification Report showing the per-class Precision, Recall and F1-score 

C=0.005 
Negative weight = 3.14 

               precision    recall  f1-score   support

     neutral       0.58      0.84      0.68       737
    negative       0.61      0.23      0.33       340
    positive       0.63      0.49      0.55       573

    accuracy                           0.59      1650
   macro avg       0.60      0.52      0.52      1650
weighted avg       0.60      0.59      0.56      1650

A Classification Report showing the per-class Precision, Recall and F1-score 

C=1 
Negative weight = 3.14 

               precision    recall  f1-score   support

     neutral       0.58      0.84      0.69       737
    negative       0.60      0.26      0.36       340
    positive       0.63      0.48      0.55       573

    accuracy                           0.60      1650
   macro avg       0.60      0.53      0.53      1650
weighted avg       0.60      0.60      0.57      1650

A Classificati