In [None]:
import csv
import re
from collections import OrderedDict, defaultdict, Counter
from functools import lru_cache

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from nltk.util import ngrams
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

In [None]:
#removing the stopwords
def remove_stopwords(text, is_lower_case=False):
    tweet_tokenizer = TweetTokenizer()
    stopword_list=nltk.corpus.stopwords.words('english')
    tokens = tweet_tokenizer.tokenize(text)
    tokens = [token.strip() for token in tokens]
    if is_lower_case:
        filtered_tokens = [token for token in tokens if token not in stopword_list]
    else:
        filtered_tokens = [token for token in tokens if token.lower() not in stopword_list]
    filtered_text = ' '.join(filtered_tokens)    
    return filtered_text

In [None]:
def clean_df(file):
    data = pd.read_csv(file, sep='\t', names=["id", "polarity", "tweet"])
    data = data.drop_duplicates()
    data['tweet']=data['tweet'].apply(remove_stopwords)
    data["tweet"] = data["tweet"].str.lower() # lowercase
    data = data.reset_index(drop=True)
    return data

In [None]:
def polarity(n_gram):
    score = word_dict[n_gram]
    if score > 0:
        return 'positive'
    if score < 0:
        return 'negative'
    else:
        return 'none'

In [None]:
def count_polarity_tokens(tweet, tokenizer):
    
    score_list = []
    tokenized = tokenizer.tokenize(tweet)
    ngrams_list = [' '.join(i) for i in ngrams(tokenized, 2)]
    for ngram in ngrams_list:
        ngram = ngram.lower()
        score = polarity(ngram)
        scorelist.append(score)
        
    return dict(Counter(score_list))

In [None]:
def sum_polarity(tweet, tokenizer):
    neg_list = []
    pos_list = []
    tokenized = tokenizer.tokenize(tweet)
    ngrams_list = [' '.join(i) for i in ngrams(tokenized, 2)]
    for ngram in ngrams_list:
        ngram = ngram.lower()
        if polarity(ngram) == 'positive':
            pos_list.append(word_dict[ngram])
        elif polarity(ngram) == 'negative':
            neg_list.append(abs(word_dict[ngram]))
        
    return {'pos_sum' : sum(pos_list), 'neg_sum' : sum(neg_list)}

In [None]:
def max_token(tweet, tokenizer):
    
    neg_list = []
    pos_list = []
    
    tokenized = tokenizer.tokenize(tweet)
    ngrams_list = [' '.join(i) for i in ngrams(tokenized, 2)]
    for ngram in ngrams_list:
        ngram = ngram.lower()
        if polarity(ngram) == 'positive':
            pos_list.append(word_dict[ngram])
        elif polarity(ngram) == 'negative':
            neg_list.append(word_dict[ngram])

    try:
        pos_max = max(pos_list)
    except ValueError:
        pos_max = 0
    try:
        neg_max = min(neg_list)
    except ValueError:
        neg_max = 0
        
    return {'pos_max' : pos_max, 'neg_max' : neg_max}

In [None]:
def last_token(tweet, tokenizer):
    tokenized = tokenizer.tokenize(tweet)
    ngrams_list = [' '.join(i) for i in ngrams(tokenized, 2)]
    
    for token in reversed(ngrams_list):
        token = token.lower()
        if polarity(token) == 'positive' or polarity(token) == 'negative':
            return {'last_polarity' : word_dict[token]}
        else:
            continue
    
    return {'last_polarity' : 0}

In [None]:
def all_feats(tweet, tokenizer):
    count_tokens = count_polarity_tokens(tweet, tokenizer)
    polarity = sum_polarity(tweet, tokenizer)
    max_tok = max_token(tweet, tokenizer)
    last_tok = last_token(tweet, tokenizer)
    
    result = dict()
    for dictionary in [count_tokens, polarity, max_tok, last_tok]:
        result.update(dictionary)
    return result

In [None]:
def get_feature(only_tweet_data, tokenizer):
    sentiment140_counts = [all_feats(tweet, tokenizer) for tweet in only_tweet_data]
    sentiment140_df = pd.DataFrame(sentiment140_counts, index=only_tweet_data.index)
    sentiment140_df = sentiment140_df.fillna(0)
    sentiment140_np = sentiment140_df.to_numpy()
    return sentiment140_np

In [None]:
word_dict = defaultdict(float)

with open('./data/Sentiment140-Lexicon-v0.1/bigrams-pmilexicon.txt', 'r') as f:
    for row in f.readlines():
        row = row.split()
        word_dict[row[0] +" " + row[1]] = float(row[2])

In [None]:
tweet_tokenizer = TweetTokenizer()
train_data = clean_df("./data/dataset/twitter-2013train-A.txt")
only_tweet_train_data = train_data['tweet']
train_Sentiment140_bi_feature = get_feature(only_tweet_train_data, tweet_tokenizer)

train_labels = train_data.polarity
result = []
for x in train_labels:
    if x == "positive":
        result.append(2)
    elif x == "negative":
        result.append(1)
    elif x == "neutral":
        result.append(0)
train_labels = np.array(result)
scaler = StandardScaler()
train_Sentiment140_bi_feature = scaler.fit_transform(train_Sentiment140_bi_feature)
train_features = np.array(train_Sentiment140_bi_feature)

print("train labels: ", train_labels) 
print("train features:", train_features) 
print("train labels shape: ", train_labels.shape) 
print("train features shape:", train_features.shape)

In [None]:
tweet_tokenizer = TweetTokenizer()
dev_data = clean_df("./data/dataset/twitter-2013test-A.txt")

only_tweet_dev_data = dev_data['tweet']
dev_Sentiment140_bi_feature = get_feature(only_tweet_dev_data, tweet_tokenizer)

dev_labels = dev_data.polarity
result = []
for x in dev_labels:
    if x == "positive":
        result.append(2)
    elif x == "negative":
        result.append(1)
    elif x == "neutral":
        result.append(0)
dev_labels = np.array(result)
scaler = StandardScaler()
dev_Sentiment140_bi_feature = scaler.fit_transform(dev_Sentiment140_bi_feature)
dev_features = np.array(dev_Sentiment140_bi_feature)

print("dev labels: ", dev_labels) 
print("dev features:", dev_features) 
print("dev_labels shape: ", dev_labels.shape) 
print("dev_features shape:", dev_features.shape) 

In [None]:

file = open("sentiment140_bigram_vector.txt", "w+")
for i in dev_Sentiment140_bi_feature:
    content = str(i)
    file.write(content)
file.close()

In [None]:

clf = SVC(kernel='linear', C=0.005, probability=True)

sample_weight = np.array([3.14 if i == 1 else 1 for i in train_labels])

clf.fit(train_features, train_labels, sample_weight = sample_weight)

predictions = clf.predict(dev_features)

print("A Classification Report showing the per-class Precision, Recall and F1-score \n\nC=0.005 \nNegative weight = 3.14 \n\n", metrics.classification_report(dev_labels, predictions,target_names=['neutral','negative','positive']))




clf = SVC(kernel='linear', C=1, probability=True)

sample_weight = np.array([3.14 if i == 1 else 1 for i in train_labels])

clf.fit(train_features, train_labels, sample_weight = sample_weight)

predictions = clf.predict(dev_features)

print("A Classification Report showing the per-class Precision, Recall and F1-score \n\nC=1 \nNegative weight = 3.14 \n\n", metrics.classification_report(dev_labels, predictions,target_names=['neutral','negative','positive']))


clf = SVC(kernel='linear', C=0.005, probability=True)

arr = []

for i in train_labels:
    if i == 1:
        arr.append(3.14)
    elif i == 2:
        arr.append(1.25)
    else:
        arr.append(1)


sample_weight = np.array(arr)
clf.fit(train_features, train_labels, sample_weight = sample_weight)

predictions = clf.predict(dev_features)

print("A Classification Report showing the per-class Precision, Recall and F1-score \n\nC=0.005 \nNegative weight = 3.14 \nPositive weight = 1.25 \n\n", metrics.classification_report(dev_labels, predictions,target_names=['neutral','negative','positive']))


clf = SVC(kernel='linear', C=1, probability=True)

arr = []

for i in train_labels:
    if i == 1:
        arr.append(3.14)
    elif i == 2:
        arr.append(1.25)
    else:
        arr.append(1)


sample_weight = np.array(arr)
clf.fit(train_features, train_labels, sample_weight = sample_weight)

predictions = clf.predict(dev_features)

print("A Classification Report showing the per-class Precision, Recall and F1-score \n\nC=1 \nNegative weight = 3.14 \nPositive weight = 1.25 \n\n", metrics.classification_report(dev_labels, predictions,target_names=['neutral','negative','positive']))



clf = SVC(kernel='linear', C=1, probability=True)

clf.fit(train_features, train_labels)

predictions = clf.predict(dev_features)

print("A Classification Report showing the per-class Precision, Recall and F1-score \n\nC=1 \nNo weight", metrics.classification_report(dev_labels, predictions,target_names=['neutral','negative','positive']))


clf = SVC(kernel='linear', C=0.005, probability=True)

clf.fit(train_features, train_labels)

predictions = clf.predict(dev_features)

print("A Classification Report showing the per-class Precision, Recall and F1-score \n\nC=0.005 \nNo weight", metrics.classification_report(dev_labels, predictions,target_names=['neutral','negative','positive']))


from sklearn.model_selection import GridSearchCV
  
param_grid = {'C': [0.005, 0.1, 0.5, 1], 
              'kernel': ['linear','rbf']} 
  
grid = GridSearchCV(SVC(), param_grid, refit = True, verbose = 3)
  
grid.fit(train_features, train_labels)

print(grid.best_params_)
  
print(grid.best_estimator_)

grid_predictions = grid.predict(dev_features)
  
print(classification_report(dev_labels, grid_predictions))

In [None]:
tweet_tokenizer = TweetTokenizer()
dev_data = clean_df("./data/dataset/twitter-2013dev-A.txt")

only_tweet_dev_data = dev_data['tweet']
dev_Sentiment140_bi_feature = get_feature(only_tweet_dev_data, tweet_tokenizer)

dev_labels = dev_data.polarity
result = []
for x in dev_labels:
    if x == "positive":
        result.append(2)
    elif x == "negative":
        result.append(1)
    elif x == "neutral":
        result.append(0)
dev_labels = np.array(result)
scaler = StandardScaler()
dev_Sentiment140_bi_feature = scaler.fit_transform(dev_Sentiment140_bi_feature)
dev_features = np.array(dev_Sentiment140_bi_feature)

print("dev labels: ", dev_labels) 
print("dev features:", dev_features) 
print("dev_labels shape: ", dev_labels.shape) 
print("dev_features shape:", dev_features.shape) 

In [None]:

clf = SVC(kernel='linear', C=0.005, probability=True)

sample_weight = np.array([3.14 if i == 1 else 1 for i in train_labels])

clf.fit(train_features, train_labels, sample_weight = sample_weight)

predictions = clf.predict(dev_features)

print("A Classification Report showing the per-class Precision, Recall and F1-score \n\nC=0.005 \nNegative weight = 3.14 \n\n", metrics.classification_report(dev_labels, predictions,target_names=['neutral','negative','positive']))




clf = SVC(kernel='linear', C=1, probability=True)

sample_weight = np.array([3.14 if i == 1 else 1 for i in train_labels])

clf.fit(train_features, train_labels, sample_weight = sample_weight)

predictions = clf.predict(dev_features)

print("A Classification Report showing the per-class Precision, Recall and F1-score \n\nC=1 \nNegative weight = 3.14 \n\n", metrics.classification_report(dev_labels, predictions,target_names=['neutral','negative','positive']))


clf = SVC(kernel='linear', C=0.005, probability=True)

arr = []

for i in train_labels:
    if i == 1:
        arr.append(3.14)
    elif i == 2:
        arr.append(1.25)
    else:
        arr.append(1)


sample_weight = np.array(arr)
clf.fit(train_features, train_labels, sample_weight = sample_weight)

predictions = clf.predict(dev_features)

print("A Classification Report showing the per-class Precision, Recall and F1-score \n\nC=0.005 \nNegative weight = 3.14 \nPositive weight = 1.25 \n\n", metrics.classification_report(dev_labels, predictions,target_names=['neutral','negative','positive']))


clf = SVC(kernel='linear', C=1, probability=True)

arr = []

for i in train_labels:
    if i == 1:
        arr.append(3.14)
    elif i == 2:
        arr.append(1.25)
    else:
        arr.append(1)


sample_weight = np.array(arr)
clf.fit(train_features, train_labels, sample_weight = sample_weight)

predictions = clf.predict(dev_features)

print("A Classification Report showing the per-class Precision, Recall and F1-score \n\nC=1 \nNegative weight = 3.14 \nPositive weight = 1.25 \n\n", metrics.classification_report(dev_labels, predictions,target_names=['neutral','negative','positive']))



clf = SVC(kernel='linear', C=1, probability=True)

clf.fit(train_features, train_labels)

predictions = clf.predict(dev_features)

print("A Classification Report showing the per-class Precision, Recall and F1-score \n\nC=1 \nNo weight", metrics.classification_report(dev_labels, predictions,target_names=['neutral','negative','positive']))


clf = SVC(kernel='linear', C=0.005, probability=True)

clf.fit(train_features, train_labels)

predictions = clf.predict(dev_features)

print("A Classification Report showing the per-class Precision, Recall and F1-score \n\nC=0.005 \nNo weight", metrics.classification_report(dev_labels, predictions,target_names=['neutral','negative','positive']))


from sklearn.model_selection import GridSearchCV
  
param_grid = {'C': [0.005, 0.1, 0.5, 1], 
              'kernel': ['linear','rbf']} 
  
grid = GridSearchCV(SVC(), param_grid, refit = True, verbose = 3)
  
grid.fit(train_features, train_labels)

print(grid.best_params_)
  
print(grid.best_estimator_)

grid_predictions = grid.predict(dev_features)
  
print(classification_report(dev_labels, grid_predictions))