In [None]:
import csv
import re
from collections import OrderedDict, defaultdict, Counter

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

In [None]:
#removing the stopwords
def _english_stopwords():
    """Return NLTK's English stopwords as a frozenset, building it only once."""
    cached = getattr(_english_stopwords, "_cache", None)
    if cached is None:
        cached = frozenset(nltk.corpus.stopwords.words('english'))
        _english_stopwords._cache = cached
    return cached


def remove_stopwords(text, is_lower_case=False):
    """Remove English stopwords from ``text``.

    The text is tokenized with NLTK's ``TweetTokenizer``, every token is
    stripped of surrounding whitespace, stopwords are dropped, and the
    remaining tokens are joined back with single spaces.

    Args:
        text: The string to filter.
        is_lower_case: If True, tokens are compared against the stopword
            list as-is. If False (default), each token is lowercased for
            the comparison only; the kept tokens retain their casing.

    Returns:
        The filtered text as a single space-joined string.
    """
    tokens = [token.strip() for token in TweetTokenizer().tokenize(text)]
    # A cached frozenset avoids rebuilding the stopword list on every call
    # and turns each membership test into a hash lookup.
    stopword_set = _english_stopwords()
    if is_lower_case:
        kept_tokens = [token for token in tokens if token not in stopword_set]
    else:
        kept_tokens = [token for token in tokens if token.lower() not in stopword_set]
    return ' '.join(kept_tokens)

In [None]:
def clean_df(file):
    """Load a tab-separated tweet file and normalise its ``tweet`` column.

    The file is read without a header row into the columns ``id``,
    ``polarity`` and ``tweet``. Fully duplicated rows are dropped, stopwords
    are removed from every tweet via ``remove_stopwords``, the tweets are
    lowercased, and the index is renumbered from zero.

    Args:
        file: Path (or file-like object) accepted by ``pd.read_csv``.

    Returns:
        The cleaned DataFrame.
    """
    # NOTE(review): read_csv's default quote handling is used here; if tweets
    # can start with a double quote, rows may be merged -- confirm on the data.
    raw = pd.read_csv(file, sep='\t', names=["id", "polarity", "tweet"])
    deduped = raw.drop_duplicates()
    normalised_tweets = deduped['tweet'].apply(remove_stopwords).str.lower()
    return deduped.assign(tweet=normalised_tweets).reset_index(drop=True)

In [None]:
def generate_bin_count(tweet, tokenizer, lexicon=None):
    """Count the lexicon labels carried by the tokens of ``tweet``.

    Args:
        tweet: The text to score.
        tokenizer: Any object exposing ``tokenize(text)`` that returns an
            iterable of string tokens.
        lexicon: Mapping from a lowercase token to an iterable of labels.
            Defaults to the notebook-level ``word_dict``.

    Returns:
        A ``Counter`` mapping each label to the number of times it was
        contributed by a token of the tweet. Tokens missing from the lexicon
        contribute nothing.
    """
    if lexicon is None:
        lexicon = word_dict
    bin_count = Counter()
    for token in tokenizer.tokenize(tweet):
        # Use .get() rather than indexing: the lexicon is a defaultdict, and
        # indexing would insert an empty entry for every unseen token,
        # growing the shared lexicon on every call.
        bin_count.update(lexicon.get(token.lower(), ()))
    return bin_count

In [None]:
def get_feature(only_tweet_data, tokenizer, columns=("positive", "negative")):
    """Build the lexicon count feature matrix for a Series of tweets.

    Args:
        only_tweet_data: pandas Series of tweet strings.
        tokenizer: Tokenizer passed through to ``generate_bin_count``.
        columns: Labels to emit, in output column order. Labels that are not
            listed are dropped; listed labels that never occur become an
            all-zero column.

    Returns:
        A float NumPy array of shape ``(len(only_tweet_data), len(columns))``
        holding, per tweet, the count of each label.
    """
    bin_counts = [generate_bin_count(tweet, tokenizer) for tweet in only_tweet_data]
    bin_df = pd.DataFrame(bin_counts, index=only_tweet_data.index)
    # Without a fixed layout the columns follow the order in which labels are
    # first encountered, so two datasets could produce swapped or missing
    # columns. Pin the layout so every matrix is comparable.
    bin_df = bin_df.reindex(columns=list(columns)).fillna(0)
    return bin_df.to_numpy(dtype=float)

In [None]:
def _read_lexicon_words(path, header_rows):
    """Read one word per row from a lexicon file, skipping its header.

    Args:
        path: Location of the lexicon text file.
        header_rows: Number of leading rows (as seen by ``csv.reader``) to skip.

    Returns:
        List of the first field of every non-empty row after the header.
    """
    words = []
    with open(path, 'r', encoding='unicode_escape') as f:
        for row_number, row in enumerate(csv.reader(f)):
            if row_number < header_rows:
                continue
            # csv.reader yields [] for blank lines; indexing those would raise.
            if not row or not row[0]:
                continue
            words.append(row[0])
    return words


# Whole words are stored in the lists (list.extend on a string would add its
# individual characters instead).
pos_list = _read_lexicon_words('./data/Bing Liu -lexicon-English/positive-words.txt', 30)
neg_list = _read_lexicon_words('./data/Bing Liu -lexicon-English/negative-words.txt', 31)

# word -> list of polarity labels; a word present in both files gets both labels.
word_dict = defaultdict(list)
for word in pos_list:
    word_dict[word].append('positive')
for word in neg_list:
    word_dict[word].append('negative')

In [None]:
# Load the training split and turn each tweet into lexicon count features.
tweet_tokenizer = TweetTokenizer()
train_data = clean_df("./data/dataset/twitter-2013train-A.txt")
only_tweet_train_data = train_data['tweet']
train_bing_liu_feature = get_feature(only_tweet_train_data, tweet_tokenizer)

# Encode polarity strings as integer classes: neutral=0, negative=1, positive=2.
train_label_ids = train_data.polarity.map({"neutral": 0, "negative": 1, "positive": 2})
# Fail loudly on any other label: skipping such rows would leave the label
# array shorter than (and misaligned with) the feature matrix.
if train_label_ids.isna().any():
    unknown_labels = sorted(train_data.polarity[train_label_ids.isna()].astype(str).unique())
    raise ValueError("Unexpected polarity label(s) in training data: {}".format(unknown_labels))
train_labels = train_label_ids.to_numpy(dtype=int)

# Standardise the features; the scaler is fitted on the training data.
scaler = StandardScaler()
train_bing_liu_feature = scaler.fit_transform(train_bing_liu_feature)
train_features = np.array(train_bing_liu_feature)

print("train labels: ", train_labels)
print("train features:", train_features)
print("train labels shape: ", train_labels.shape)
print("train features shape:", train_features.shape)

In [None]:
# Load the development split and turn each tweet into lexicon count features.
tweet_tokenizer = TweetTokenizer()
data_path = "./data/dataset/twitter-2013dev-A.txt"
dev_data = clean_df(data_path)
only_tweet_dev_data = dev_data['tweet']
dev_bing_liu_feature = get_feature(only_tweet_dev_data, tweet_tokenizer)

# Encode polarity strings as integer classes: neutral=0, negative=1, positive=2.
dev_label_ids = dev_data.polarity.map({"neutral": 0, "negative": 1, "positive": 2})
# Fail loudly on any other label: skipping such rows would leave the label
# array shorter than (and misaligned with) the feature matrix.
if dev_label_ids.isna().any():
    unknown_labels = sorted(dev_data.polarity[dev_label_ids.isna()].astype(str).unique())
    raise ValueError("Unexpected polarity label(s) in dev data: {}".format(unknown_labels))
dev_labels = dev_label_ids.to_numpy(dtype=int)

# Standardise with statistics learned from the *training* tweets only, so the
# evaluation features live in the same space the classifier is trained on.
# The scaler is re-fitted on the raw training features here so the result does
# not depend on whatever `scaler` was last bound to by another cell.
scaler = StandardScaler().fit(get_feature(only_tweet_train_data, tweet_tokenizer))
dev_bing_liu_feature = scaler.transform(dev_bing_liu_feature)
dev_features = np.array(dev_bing_liu_feature)

print("dev labels: ", dev_labels)
print("dev features:", dev_features)
print("dev_labels shape: ", dev_labels.shape)
print("dev_features shape:", dev_features.shape)

In [None]:
# Linear SVM, C=0.005, with the negative class (label 1) weighted 3.14x.
# NOTE(review): probability=True is kept; no predict_proba call is visible in
# this notebook -- confirm whether it is needed.
clf = SVC(kernel='linear', C=0.005, probability=True)

# Per-sample weights: 3.14 for label 1, 1 for every other label.
sample_weight = np.where(train_labels == 1, 3.14, 1)
clf.fit(train_features, train_labels, sample_weight=sample_weight)

predictions = clf.predict(dev_features)

report = metrics.classification_report(
    dev_labels, predictions, target_names=['neutral', 'negative', 'positive']
)
print("A Classification Report showing the per-class Precision, Recall and F1-score \n\n", report)

In [None]:
# Linear SVM, C=1, with the negative class (label 1) weighted 3.14x.
clf = SVC(kernel='linear', C=1, probability=True)

# Per-sample weights: 3.14 for label 1, 1 for every other label.
sample_weight = np.where(train_labels == 1, 3.14, 1)
clf.fit(train_features, train_labels, sample_weight=sample_weight)

predictions = clf.predict(dev_features)

report = metrics.classification_report(
    dev_labels, predictions, target_names=['neutral', 'negative', 'positive']
)
print("A Classification Report showing the per-class Precision, Recall and F1-score \n\n", report)

In [None]:
# Linear SVM, C=0.005, weighting label 1 (negative) 3.14x and label 2
# (positive) 1.25x.
clf = SVC(kernel='linear', C=0.005, probability=True)

# Any label without an entry here gets weight 1.
weight_by_label = {1: 3.14, 2: 1.25}
arr = [weight_by_label.get(label, 1) for label in train_labels]

sample_weight = np.array(arr)
clf.fit(train_features, train_labels, sample_weight=sample_weight)

predictions = clf.predict(dev_features)

report = metrics.classification_report(
    dev_labels, predictions, target_names=['neutral', 'negative', 'positive']
)
print("A Classification Report showing the per-class Precision, Recall and F1-score \n\n", report)

In [None]:
# Linear SVM, C=1, weighting label 1 (negative) 3.14x and label 2
# (positive) 1.25x.
clf = SVC(kernel='linear', C=1, probability=True)

# Any label without an entry here gets weight 1.
weight_by_label = {1: 3.14, 2: 1.25}
arr = [weight_by_label.get(label, 1) for label in train_labels]

sample_weight = np.array(arr)
clf.fit(train_features, train_labels, sample_weight=sample_weight)

predictions = clf.predict(dev_features)

report = metrics.classification_report(
    dev_labels, predictions, target_names=['neutral', 'negative', 'positive']
)
print("A Classification Report showing the per-class Precision, Recall and F1-score \n\n", report)

# Test set evaluation (the cells below reuse the `dev_*` variable names for the test split)

In [None]:
# Load the test split and turn each tweet into lexicon count features.
# The dev_* names are reused for the test data because the evaluation cells
# that follow read `dev_features` and `dev_labels`.
tweet_tokenizer = TweetTokenizer()
data_path = "./data/dataset/twitter-2013test-A.txt"
dev_data = clean_df(data_path)
only_tweet_dev_data = dev_data['tweet']
dev_bing_liu_feature = get_feature(only_tweet_dev_data, tweet_tokenizer)

# Encode polarity strings as integer classes: neutral=0, negative=1, positive=2.
dev_label_ids = dev_data.polarity.map({"neutral": 0, "negative": 1, "positive": 2})
# Fail loudly on any other label: skipping such rows would leave the label
# array shorter than (and misaligned with) the feature matrix.
if dev_label_ids.isna().any():
    unknown_labels = sorted(dev_data.polarity[dev_label_ids.isna()].astype(str).unique())
    raise ValueError("Unexpected polarity label(s) in test data: {}".format(unknown_labels))
dev_labels = dev_label_ids.to_numpy(dtype=int)

# Standardise with statistics learned from the *training* tweets only, so the
# evaluation features live in the same space the classifier is trained on.
# The scaler is re-fitted on the raw training features here so the result does
# not depend on whatever `scaler` was last bound to by another cell.
scaler = StandardScaler().fit(get_feature(only_tweet_train_data, tweet_tokenizer))
dev_bing_liu_feature = scaler.transform(dev_bing_liu_feature)
dev_features = np.array(dev_bing_liu_feature)

print("dev labels: ", dev_labels)
print("dev features:", dev_features)
print("dev_labels shape: ", dev_labels.shape)
print("dev_features shape:", dev_features.shape)

In [None]:
# Dump the scaled feature rows currently held in `dev_bing_liu_feature` to a
# text file. Each row is written as its str() form; no separator is added
# between rows.
# The context manager guarantees the handle is closed even if a write fails.
with open("bing_liu_2_features_vector.txt", "w+") as file:
    for feature_row in dev_bing_liu_feature:
        file.write(str(feature_row))

In [None]:
# Linear SVM, C=0.005, with the negative class (label 1) weighted 3.14x.
# `dev_features` / `dev_labels` are whatever the most recent loading cell
# bound them to.
clf = SVC(kernel='linear', C=0.005, probability=True)

# Per-sample weights: 3.14 for label 1, 1 for every other label.
sample_weight = np.where(train_labels == 1, 3.14, 1)
clf.fit(train_features, train_labels, sample_weight=sample_weight)

predictions = clf.predict(dev_features)

report = metrics.classification_report(
    dev_labels, predictions, target_names=['neutral', 'negative', 'positive']
)
print("A Classification Report showing the per-class Precision, Recall and F1-score \n\n", report)

In [None]:
# Linear SVM, C=1, with the negative class (label 1) weighted 3.14x.
# `dev_features` / `dev_labels` are whatever the most recent loading cell
# bound them to.
clf = SVC(kernel='linear', C=1, probability=True)

# Per-sample weights: 3.14 for label 1, 1 for every other label.
sample_weight = np.where(train_labels == 1, 3.14, 1)
clf.fit(train_features, train_labels, sample_weight=sample_weight)

predictions = clf.predict(dev_features)

report = metrics.classification_report(
    dev_labels, predictions, target_names=['neutral', 'negative', 'positive']
)
print("A Classification Report showing the per-class Precision, Recall and F1-score \n\n", report)

In [None]:
# Linear SVM, C=0.005, weighting label 1 (negative) 3.14x and label 2
# (positive) 1.25x. `dev_features` / `dev_labels` are whatever the most recent
# loading cell bound them to.
clf = SVC(kernel='linear', C=0.005, probability=True)

# Any label without an entry here gets weight 1.
weight_by_label = {1: 3.14, 2: 1.25}
arr = [weight_by_label.get(label, 1) for label in train_labels]

sample_weight = np.array(arr)
clf.fit(train_features, train_labels, sample_weight=sample_weight)

predictions = clf.predict(dev_features)

report = metrics.classification_report(
    dev_labels, predictions, target_names=['neutral', 'negative', 'positive']
)
print("A Classification Report showing the per-class Precision, Recall and F1-score \n\n", report)

In [None]:
# Linear SVM, C=1, weighting label 1 (negative) 3.14x and label 2
# (positive) 1.25x. `dev_features` / `dev_labels` are whatever the most recent
# loading cell bound them to.
clf = SVC(kernel='linear', C=1, probability=True)

# Any label without an entry here gets weight 1.
weight_by_label = {1: 3.14, 2: 1.25}
arr = [weight_by_label.get(label, 1) for label in train_labels]

sample_weight = np.array(arr)
clf.fit(train_features, train_labels, sample_weight=sample_weight)

predictions = clf.predict(dev_features)

report = metrics.classification_report(
    dev_labels, predictions, target_names=['neutral', 'negative', 'positive']
)
print("A Classification Report showing the per-class Precision, Recall and F1-score \n\n", report)

In [None]:
# Cross-validated grid search over the SVM regularisation strength and kernel.
# GridSearchCV is imported with the other dependencies at the top of the notebook.
param_grid = {'C': [0.005, 0.1, 0.5, 1],
              'kernel': ['linear', 'rbf']}

# refit=True retrains the best configuration on all of the training data so
# `grid.predict` can be used directly afterwards.
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=3)
grid.fit(train_features, train_labels)

print(grid.best_params_)
print(grid.best_estimator_)

grid_predictions = grid.predict(dev_features)

# Report with the same class names as the other evaluation cells
# (0=neutral, 1=negative, 2=positive).
print(classification_report(dev_labels, grid_predictions,
                            target_names=['neutral', 'negative', 'positive']))