# Assignment 3: Sentiment Classification of Tweets

### Data Engineering

In [1]:
# read train data
train_full = open("./data/train_full.csv",'r').readlines()[1:]
train_count = open("./data/train_count.csv",'r').readlines()[1:]
train_tfidf = open("./data/train_tfidf.csv",'r').readlines()[1:]
train_glove = open("./data/train_glove.csv",'r').readlines()[1:]

In [2]:
# read development data
dev_full = open("./data/dev_full.csv",'r').readlines()[1:]
dev_count = open("./data/dev_count.csv",'r').readlines()[1:]
dev_tfidf = open("./data/dev_tfidf.csv",'r').readlines()[1:]
dev_glove = open("./data/dev_glove.csv",'r').readlines()[1:]

In [3]:
# read test data
test_full = open("./data/test_full.csv",'r').readlines()[1:]
test_count = open("./data/test_count.csv",'r').readlines()[1:]
test_tfidf = open("./data/test_tfidf.csv",'r').readlines()[1:]
test_glove = open("./data/test_glove.csv",'r').readlines()[1:]

In [4]:
# extract features from data
def get_feature(data):
    feature = []
    for line in data:
         feature.append(eval(line.strip().split('\"')[1:-1][0]))
    return feature

In [5]:
train_count_fea = get_feature(train_count)
train_tfidf_fea = get_feature(train_tfidf)
train_glove_fea = get_feature(train_glove)

dev_count_fea = get_feature(dev_count)
dev_tfidf_fea = get_feature(dev_tfidf)
dev_glove_fea = get_feature(dev_glove)

test_count_fea = get_feature(test_count)
test_tfidf_fea = get_feature(test_tfidf)
test_glove_fea = get_feature(test_glove)

In [6]:
# make features in tfidf and count have same dimension
vocab = open("./data/vocab.txt",'r').readlines()

def extend_dimension(feature):
    extended_feature = []
    for i in range(len(feature)):
        counts = [0 for j in range(len(vocab))]
        for instance in feature[i]:
            counts[instance[0]] = instance[1]
        extended_feature.append(counts)
    return extended_feature

In [7]:
extended_train_count_fea = extend_dimension(train_count_fea)
extended_dev_count_fea = extend_dimension(dev_count_fea)
extended_test_count_fea = extend_dimension(test_count_fea)

extended_train_tfidf_fea = extend_dimension(train_tfidf_fea)
extended_dev_tfidf_fea = extend_dimension(dev_tfidf_fea)
extended_test_tfidf_fea = extend_dimension(test_tfidf_fea)

In [8]:
# extract labels from data
def get_label(data):
    label = []
    for line in data:
        label.append(line.strip().split(',')[0])
    return label

In [9]:
train_label = get_label(train_count)
dev_label = get_label(dev_count)

### Baseline

In [10]:
from collections import Counter
from random import random
prior_neg = Counter(train_label)["neg"] / len(train_label)
dev_neg = Counter(dev_label)["neg"] / len(dev_label)
prior_neu = Counter(train_label)["neu"] / len(train_label)
dev_neu = Counter(dev_label)["neu"] / len(dev_label)
prior_pos = Counter(train_label)["pos"] / len(train_label)
dev_pos = Counter(dev_label)["pos"] / len(dev_label)

def weight_random(dev_label, prior_neg, prior_neu, prior_pos):
    error = 0 
    error_count = 0
    random_prediction = []
    for i in range(len(dev_label)):
        if random() < prior_neg:
            random_prediction.append("neg")
        elif prior_neg < random() < prior_neg + prior_neu:
            random_prediction.append("neu")
        else:
            random_prediction.append("pos")
    return random_prediction 

In [10]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
def evaluate(predict, label):
    acc = accuracy_score(predict, label)
    precision_recall_fscore = precision_recall_fscore_support(label, predict, average='macro')
    return acc, precision_recall_fscore[0], precision_recall_fscore[1], precision_recall_fscore[2]

In [12]:
baseline_score = evaluate(weight_random(dev_label, prior_neg, prior_neu, prior_pos), dev_label)
print("Weighted Random Baseline Score")
print("Accuracy: ", round(baseline_score[0],2))
print("Precision: ", round(baseline_score[1],2))
print("Recall: ", round(baseline_score[2],2))
print("F1 Socre: ", round(baseline_score[3],2))

Weighted Random Baseline Score
Accuracy:  0.38
Precision:  0.34
Recall:  0.34
F1 Socre:  0.33


### Training and Developing

#### Logistic Regression

In [14]:
from sklearn.linear_model import LogisticRegression

In [21]:
lr_count = LogisticRegression().fit(extended_train_count_fea, train_label)
lr_count_predict = lr_count.predict(extended_dev_count_fea)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [22]:
lr_count_score = evaluate(lr_count_predict, dev_label)
print("Logistic Regression with count feature Score")
print("Accuracy: ", round(lr_count_score[0],2))
print("Precision: ", round(lr_count_score[1],2))
print("Recall: ", round(lr_count_score[2],2))
print("F1 Socre: ", round(lr_count_score[3],2))

Logistic Regression with count feature Score
Accuracy:  0.74
Precision:  0.75
Recall:  0.76
F1 Socre:  0.76


In [16]:
lr_tfidf = LogisticRegression().fit(extended_train_tfidf_fea, train_label)
lr_tfidf_predict = lr_tfidf.predict(extended_dev_tfidf_fea)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [17]:
lr_tfidf_score = evaluate(lr_tfidf_predict, dev_label)
print("Logistic Regression with tfidf feature Score")
print("Accuracy: ", round(lr_tfidf_score[0],2))
print("Precision: ", round(lr_tfidf_score[1],2))
print("Recall: ", round(lr_tfidf_score[2],2))
print("F1 Socre: ", round(lr_tfidf_score[3],2))

Logistic Regression with tfidf feature Score
Accuracy:  0.74
Precision:  0.76
Recall:  0.76
F1 Socre:  0.76


In [19]:
lr_glove = LogisticRegression().fit(train_glove_fea, train_label)
lr_glove_predict = lr_glove.predict(dev_glove_fea)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [20]:
lr_glove_score = evaluate(lr_glove_predict, dev_label)
print("Logistic Regression with glove feature Score")
print("Accuracy: ", round(lr_glove_score[0],2))
print("Precision: ", round(lr_glove_score[1],2))
print("Recall: ", round(lr_glove_score[2],2))
print("F1 Socre: ", round(lr_glove_score[3],2))

Logistic Regression with glove feature Score
Accuracy:  0.68
Precision:  0.69
Recall:  0.7
F1 Socre:  0.7


#### KNN

In [20]:
from sklearn.neighbors import KNeighborsClassifier

In [33]:
knn_count = KNeighborsClassifier(n_neighbors=23)
knn_count.fit(extended_train_count_fea, train_label)
knn_count_predict = knn_count.predict(extended_dev_count_fea)

In [34]:
knn_count_score = evaluate(knn_count_predict, dev_label)
print("KNN with count feature Score")
print("Accuracy: ", round(knn_count_score[0],2))
print("Precision: ", round(knn_count_score[1],2))
print("Recall: ", round(knn_count_score[2],2))
print("F1 Socre: ", round(knn_count_score[3],2))

KNN with count feature Score
Accuracy:  0.64
Precision:  0.66
Recall:  0.64
F1 Socre:  0.65


In [35]:
knn_tfidf = KNeighborsClassifier(n_neighbors=23)
knn_tfidf.fit(extended_train_tfidf_fea, train_label)
knn_tfidf_predict = knn_tfidf.predict(extended_dev_count_fea)

In [36]:
knn_tfidf_score = evaluate(knn_tfidf_predict, dev_label)
print("KNN with tfidf feature Score")
print("Accuracy: ", round(knn_tfidf_score[0],2))
print("Precision: ", round(knn_tfidf_score[1],2))
print("Recall: ", round(knn_tfidf_score[2],2))
print("F1 Socre: ", round(knn_tfidf_score[3],2))

KNN with tfidf feature Score
Accuracy:  0.66
Precision:  0.69
Recall:  0.66
F1 Socre:  0.67


In [37]:
knn_glove = KNeighborsClassifier(n_neighbors=23)
knn_glove.fit(train_glove_fea, train_label)
knn_glove_predict = knn_glove.predict(dev_glove_fea)

In [38]:
knn_glove_score = evaluate(knn_glove_predict, dev_label)
print("KNN with glove feature Score")
print("Accuracy: ", round(knn_glove_score[0],2))
print("Precision: ", round(knn_glove_score[1],2))
print("Recall: ", round(knn_glove_score[2],2))
print("F1 Socre: ", round(knn_glove_score[3],2))

KNN with glove feature Score
Accuracy:  0.63
Precision:  0.7
Recall:  0.63
F1 Socre:  0.65


In [30]:
## test the influce of parameter k to knn algorithm
def knn_predict(test_fea, train_fea, train_label, k):
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(train_fea, train_label)
    predict = knn.predict(test_fea)
    return predict

In [32]:
best_score = 0.0
best_k = 0
for k in range(1, 25):
    y_predict = knn_predict(dev_glove_fea, train_glove_fea, train_label, k)
    score = sum(y_predict == dev_label) / len(dev_label)
    if score > best_score:
        best_score = score
        best_k = k
print("best score is: ", best_score)
print("best k is: ", best_k)

best score is:  0.6315683713453231
best k is:  23


#### Decision Tree 

In [35]:
from sklearn.tree import DecisionTreeClassifier

In [48]:
dt_count = DecisionTreeClassifier(max_depth=9)
dt_count.fit(extended_train_count_fea, train_label)
dt_count_predict = dt_count.predict(extended_dev_count_fea)

In [49]:
dt_count_score = evaluate(dt_count_predict, dev_label)
print("Decision Tree with count feature Score")
print("Accuracy: ", round(dt_count_score[0],2))
print("Precision: ", round(dt_count_score[1],2))
print("Recall: ", round(dt_count_score[2],2))
print("F1 Socre: ", round(dt_count_score[3],2))

Decision Tree with count feature Score
Accuracy:  0.46
Precision:  0.57
Recall:  0.4
F1 Socre:  0.35


In [50]:
dt_tfidf = DecisionTreeClassifier(max_depth=9)
dt_tfidf.fit(extended_train_tfidf_fea, train_label)
dt_tfidf_predict = dt_tfidf.predict(extended_dev_count_fea)

In [51]:
dt_tfidf_score = evaluate(dt_tfidf_predict, dev_label)
print("Decision Tree with tfidf feature Score")
print("Accuracy: ", round(dt_tfidf_score[0],2))
print("Precision: ", round(dt_tfidf_score[1],2))
print("Recall: ", round(dt_tfidf_score[2],2))
print("F1 Socre: ", round(dt_tfidf_score[3],2))

Decision Tree with tfidf feature Score
Accuracy:  0.46
Precision:  0.57
Recall:  0.4
F1 Socre:  0.35


In [52]:
dt_glove = DecisionTreeClassifier(max_depth=9)
dt_glove.fit(train_glove_fea, train_label)
dt_glove_predict = dt_glove.predict(dev_glove_fea)

In [53]:
dt_glove_score = evaluate(dt_glove_predict, dev_label)
print("Decision Tree with glove feature Score")
print("Accuracy: ", round(dt_glove_score[0],2))
print("Precision: ", round(dt_glove_score[1],2))
print("Recall: ", round(dt_glove_score[2],2))
print("F1 Socre: ", round(dt_glove_score[3],2))

Decision Tree with glove feature Score
Accuracy:  0.59
Precision:  0.62
Recall:  0.6
F1 Socre:  0.61


In [33]:
## test the influce of parameter max_depth to dt algorithm
def dt(test_fea, train_fea, train_label, k):
    dt = DecisionTreeClassifier(max_depth=k)
    dt.fit(extended_train_count_fea, train_label)
    predict = dt.predict(extended_dev_count_fea)
    return predict

In [36]:
best_score = 0.0
best_k = 0
for k in range(1, 10):
    y_predict = dt(dev_glove_fea, train_glove_fea, train_label, k)
    score = sum(y_predict == dev_label) / len(dev_label)
    if score > best_score:
        best_score = score
        best_k = k
print("best score is: ", best_score)
print("best k is: ", best_k)

best score is:  0.4629759871395559
best k is:  9


#### Multi-layer Perceptron

In [11]:
from sklearn.neural_network import MLPClassifier

In [26]:
mlp_count = MLPClassifier(solver='adam', activation='tanh', hidden_layer_sizes=(2,2))
mlp_count.fit(extended_train_count_fea, train_label)
mlp_count_predict = mlp_count.predict(extended_dev_count_fea)

In [27]:
mlp_count_score = evaluate(mlp_count_predict, dev_label)
print("Multi-layer Perceptron with count feature Score")
print("Accuracy: ", round(mlp_count_score[0],2))
print("Precision: ", round(mlp_count_score[1],2))
print("Recall: ", round(mlp_count_score[2],2))
print("F1 Socre: ", round(mlp_count_score[3],2))

Multi-layer Perceptron with count feature Score
Accuracy:  0.74
Precision:  0.75
Recall:  0.75
F1 Socre:  0.75


In [28]:
mlp_tfidf = MLPClassifier(solver='adam', activation='tanh', hidden_layer_sizes=(2,2))
mlp_tfidf.fit(extended_train_tfidf_fea, train_label)
mlp_tfidf_predict = mlp_tfidf.predict(extended_dev_count_fea)

In [29]:
mlp_tfidf_score = evaluate(mlp_tfidf_predict, dev_label)
print("Multi-layer Perceptron with tfidf feature Score")
print("Accuracy: ", round(mlp_tfidf_score[0],2))
print("Precision: ", round(mlp_tfidf_score[1],2))
print("Recall: ", round(mlp_tfidf_score[2],2))
print("F1 Socre: ", round(mlp_tfidf_score[3],2))

Multi-layer Perceptron with tfidf feature Score
Accuracy:  0.72
Precision:  0.75
Recall:  0.73
F1 Socre:  0.74


In [30]:
mlp_glove = MLPClassifier(solver='adam', activation='tanh', hidden_layer_sizes=(2,2))
mlp_glove.fit(train_glove_fea, train_label)
mlp_glove_predict = mlp_glove.predict(dev_glove_fea)

In [31]:
mlp_glove_score = evaluate(mlp_glove_predict, dev_label)
print("Multi-layer Perceptron with glove feature Score")
print("Accuracy: ", round(mlp_glove_score[0],2))
print("Precision: ", round(mlp_glove_score[1],2))
print("Recall: ", round(mlp_glove_score[2],2))
print("F1 Socre: ", round(mlp_glove_score[3],2))

Multi-layer Perceptron with glove feature Score
Accuracy:  0.68
Precision:  0.7
Recall:  0.7
F1 Socre:  0.7


#### Predict the text value

In [63]:
lr_test = LogisticRegression().fit(extended_train_tfidf_fea, train_label)
lr_test_predict = lr_test.predict(extended_test_tfidf_fea)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [87]:
import pandas as pd
df = pd.DataFrame(columns=['tweet_id', 'sentiment'])

In [81]:
tweet = open("./data/test_full.csv",'r').readlines()[1:]

In [82]:
tweet_id = []
for line in tweet:
    tweet_id.append(eval(line.strip().split(",")[1]))

In [88]:
df["tweet_id"] = tweet_id
df["sentiment"] = lr_test_predict

In [90]:
df.to_csv('result.csv', index=False)