In [1]:
import os
import re

import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn import svm

In [2]:
df = pd.read_csv("../input/twitter_sentiment_data.csv")
df.drop(columns=["tweetid"])
df = df.drop(df[df.sentiment == 2].index)

train, validate, test = np.split(df.sample(frac=1, random_state=0), [int(.6*len(df)), int(.8*len(df))])
train_polar = train.drop(train[train.sentiment == 0].index)
validate_polar = validate.drop(validate[validate.sentiment == 0].index)
test_polar = test.drop(test[test.sentiment == 0].index)

In [3]:
if not os.path.isfile(f"model/config.json"):
    new_model = SentenceTransformer("all-MiniLM-L12-v2")
    new_model.save("model/")
transformer = SentenceTransformer.load("model/")

In [4]:
def preprocess_dataframe(dataframe):
    messages = dataframe["message"].values
    processed = []
    for message in messages:
        remove_retweets = re.sub(r"RT @\w+:", "", message)
        remove_ats = re.sub(r"@\w+:", "", remove_retweets)
        remove_hashtags = re.sub(r"#", "", remove_ats)
        processed.append(remove_hashtags)
    encoded = transformer.encode(messages)
    dataframe["encoded"] = encoded.tolist()

In [5]:
preprocess_dataframe(train)
preprocess_dataframe(validate)
preprocess_dataframe(train_polar)
preprocess_dataframe(validate_polar)

In [None]:
print("polynomial kernel SVC:")
model = svm.SVC(kernel="poly")
model.fit(train["encoded"].tolist(), train["sentiment"])

model_polar = svm.SVC(kernel="poly")
model_polar.fit(train_polar["encoded"].tolist(), train_polar["sentiment"])

polynomial kernel SVC:


In [7]:
print("score when including neutral in train and validate")
print(model.score(validate["encoded"].tolist(), validate["sentiment"]))
print("score when including neutral in train only")
print(model.score(validate_polar["encoded"].tolist(), validate_polar["sentiment"]))
print("score when excluding neutral")
print(model_polar.score(validate_polar["encoded"].tolist(), validate_polar["sentiment"]))

score when including neutral in train and validate
0.7921534689167748
score when including neutral in train only
0.8718278778962854
score when excluding neutral
0.9157778595071717
