In [1]:
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.svm import SVR

from datasets import Dataset
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding
from torch.utils.data import DataLoader
import torch
import tqdm
from transformers import TrainingArguments, Trainer

In [2]:
def compute_metrics_for_regression_sklearn(y_pred, y_true):
    mse = mean_squared_error(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    single_squared_errors = ((y_pred - y_true)**2).tolist()

    # Compute accuracy
    # Based on the fact that the rounded score = true score only if |single_squared_errors| < 0.5
    accuracy = sum([1 for e in single_squared_errors if e < 0.25]) / len(single_squared_errors)

    return {"mse": mse, "mae": mae, "r2": r2, "accuracy": accuracy}


def compute_metrics_for_regression(eval_pred):
    logits, labels = eval_pred
    labels = labels.reshape(-1, 1)

    mse = mean_squared_error(labels, logits)
    mae = mean_absolute_error(labels, logits)
    r2 = r2_score(labels, logits)
    single_squared_errors = ((logits - labels).flatten()**2).tolist()

    # Compute accuracy
    # Based on the fact that the rounded score = true score only if |single_squared_errors| < 0.5
    accuracy = sum([1 for e in single_squared_errors if e < 0.25]) / len(single_squared_errors)

    return {"mse": mse, "mae": mae, "r2": r2, "accuracy": accuracy}

## Load Data and Manipulate it

In [3]:
lens = 0

base_path = "semeval-2017-tweets_Subtask-A/downloaded/"
base_df = pd.DataFrame()
colnames=['id', 'sentiment', 'tweet']
for df_path in os.listdir(base_path):
    path = os.path.join(base_path, df_path)
    print(df_path)
    df = pd.read_csv(path, sep="\t", header=None)
    if df.shape[1] > 3:
        df = df.iloc[:, 0:3]
    df.columns = colnames
    print(df["sentiment"].value_counts())
    print("================")
    base_df = pd.concat([base_df, df], ignore_index=True)
    lens += len(df)

twitter-2016dev-A.tsv
sentiment
positive    829
neutral     746
negative    391
Name: count, dtype: int64
twitter-2016test-A.tsv
sentiment
neutral     10342
positive     7059
negative     3231
Name: count, dtype: int64
twitter-2013train-A.tsv
sentiment
neutral     4586
positive    3640
negative    1458
Name: count, dtype: int64
twitter-2016train-A.tsv
sentiment
positive    3017
neutral     2001
negative     850
Name: count, dtype: int64
twitter-2015test-A.tsv
sentiment
positive    1038
neutral      987
negative     365
Name: count, dtype: int64
twitter-2015train-A.tsv
sentiment
neutral     253
positive    170
negative     66
Name: count, dtype: int64
twitter-2013dev-A.tsv
sentiment
neutral     739
positive    575
negative    340
Name: count, dtype: int64
twitter-2016devtest-A.tsv
sentiment
positive    994
neutral     681
negative    325
Name: count, dtype: int64
twitter-2013test-A.tsv
sentiment
neutral     1513
positive    1475
negative     559
Name: count, dtype: int64
twitter-2014sar

In [4]:
base_df

Unnamed: 0,id,sentiment,tweet
0,638060586258038784,neutral,05 Beat it - Michael Jackson - Thriller (25th ...
1,638061181823922176,positive,Jay Z joins Instagram with nostalgic tribute t...
2,638083821364244480,neutral,Michael Jackson: Bad 25th Anniversary Edition ...
3,638091450132078593,positive,I liked a @YouTube video http://t.co/AaR3pjp2P...
4,638125563790557184,positive,18th anniv of Princess Diana's death. I still ...
...,...,...,...
50127,210378118865756160,neutral,It's a Wednesday girls night out as '90's band...
50128,245177521304399872,positive,"night college course sorted, just have to enro..."
50129,259280987089932288,positive,For the 1st time in 30 years. For your splendi...
50130,201113950211940352,positive,NURSES DAY - 12 MAY 2012. Nursing: The heart b...


In [5]:
base_df.drop_duplicates(inplace=True)

In [6]:
base_df

Unnamed: 0,id,sentiment,tweet
0,638060586258038784,neutral,05 Beat it - Michael Jackson - Thriller (25th ...
1,638061181823922176,positive,Jay Z joins Instagram with nostalgic tribute t...
2,638083821364244480,neutral,Michael Jackson: Bad 25th Anniversary Edition ...
3,638091450132078593,positive,I liked a @YouTube video http://t.co/AaR3pjp2P...
4,638125563790557184,positive,18th anniv of Princess Diana's death. I still ...
...,...,...,...
50127,210378118865756160,neutral,It's a Wednesday girls night out as '90's band...
50128,245177521304399872,positive,"night college course sorted, just have to enro..."
50129,259280987089932288,positive,For the 1st time in 30 years. For your splendi...
50130,201113950211940352,positive,NURSES DAY - 12 MAY 2012. Nursing: The heart b...


In [7]:
X = base_df["tweet"]
y = base_df["sentiment"]

In [8]:
y.value_counts()

sentiment
neutral     22182
positive    19572
negative     7713
Name: count, dtype: int64

In [9]:
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42, shuffle=True)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.2, stratify=y_train_val, random_state=42, shuffle=True)

In [10]:
len(X_train), len(X_val), len(X_test)

(31658, 7915, 9894)

In [11]:
y_train = y_train.map({"negative": 0, "neutral": 1, "positive":2})
y_val = y_val.map({"negative": 0, "neutral": 1, "positive":2})
y_test = y_test.map({"negative": 0, "neutral": 1, "positive":2})

## Baseline Model

In [12]:
count_vect = CountVectorizer(stop_words="english", lowercase=True, ngram_range=(1, 3), analyzer="word")
X_train_counts = count_vect.fit_transform(X_train)
X_val_counts = count_vect.transform(X_val)
X_test_counts = count_vect.transform(X_test)

In [13]:
tfidf_transformer = TfidfTransformer().fit(X_train_counts)
X_train_tfidf = tfidf_transformer.transform(X_train_counts)
X_val_tfidf = tfidf_transformer.transform(X_val_counts)
X_test_tfidf = tfidf_transformer.transform(X_test_counts)

In [14]:
svr = SVR()
svr.fit(X_train_tfidf, y_train)

In [15]:
y_val_pred_svr = svr.predict(X_val_tfidf)
r2_score(y_val, y_val_pred_svr)

0.29214176623090515

In [16]:
compute_metrics_for_regression_sklearn(y_val_pred_svr, y_val)

{'mse': 0.34975831391079454,
 'mae': 0.4757609094362657,
 'r2': 0.29214176623090515,
 'accuracy': 0.6068224889450411}

In [17]:
y_test_pred_svr = svr.predict(X_test_tfidf)
r2_score(y_test, y_test_pred_svr)

0.27822487999671797

In [18]:
compute_metrics_for_regression_sklearn(y_test_pred_svr, y_test)

{'mse': 0.35664278104756786,
 'mae': 0.4814279799145648,
 'r2': 0.27822487999671797,
 'accuracy': 0.6059227814837275}