In [18]:
import numpy as np
import spacy
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, f1_score
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import pymorphy2
import re
import functools
import ssl

In [19]:
# If this interpreter exposes ssl._create_unverified_context, install it as the
# module's default HTTPS context factory. NOTE(review): this turns off
# certificate verification for default HTTPS contexts in the whole kernel --
# presumably a workaround so the nltk.download calls below succeed; confirm it
# is still needed.
if hasattr(ssl, '_create_unverified_context'):
    _create_unverified_https_context = ssl._create_unverified_context
    ssl._create_default_https_context = _create_unverified_https_context

# spaCy pipeline, loaded once for the notebook.
en_nlp = spacy.load('en_core_web_sm')

# Fetch the NLTK resources in the same order as before.
for _nltk_resource in ('punkt', 'wordnet', 'stopwords'):
    nltk.download(_nltk_resource)
# Kept as a bare final expression so the cell still echoes its return value.
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to /Users/beast-sl/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/beast-
[nltk_data]     sl/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/beast-
[nltk_data]     sl/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/beast-
[nltk_data]     sl/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [21]:
# Read the three CSV splits (train / val / test) in one pass; the paths are
# relative to the notebook's working directory.
train, val, test = map(
    pd.read_csv,
    (
        '../datasets/nsdc_without_neutrality/train.csv',
        '../datasets/nsdc_without_neutrality/val.csv',
        '../datasets/nsdc_without_neutrality/test.csv',
    ),
)

In [22]:
print("Constant baseline")

# Trivial reference model: predict label 1 for every row of each split and
# report macro-F1 and accuracy against the 'score' column.
val_preds = [1 for _ in range(len(val))]
val_f1 = f1_score(val['score'], val_preds, average='macro')
val_acc = accuracy_score(val['score'], val_preds)
print(f"Val: F1 {val_f1}, Accuracy {val_acc}")

test_preds = [1 for _ in range(len(test))]
test_f1 = f1_score(test['score'], test_preds, average='macro')
test_acc = accuracy_score(test['score'], test_preds)
print(f"Test: F1 {test_f1}, Accuracy {test_acc}")

Constant baseline
Val: F1 0.28365960846983623, Accuracy 0.7406119610570236
Test: F1 0.28358367955683395, Accuracy 0.7402669632925473


In [23]:
def preprocess(texts):
    """Turn raw documents into lists of lower-cased, lemmatised, stop-word-free tokens.

    Each document is lower-cased, split into runs of ASCII letters
    (``[A-Za-z]+``), every token is mapped to the ``normal_form`` of its first
    pymorphy2 parse, and tokens found in NLTK's English stop-word list are
    dropped (the check is made on the lemma, after lemmatisation).

    Parameters
    ----------
    texts : iterable
        Raw documents. Items that are not ``str`` (for example the float NaN
        pandas uses for a missing cell) produce an empty token list instead of
        raising.

    Returns
    -------
    numpy.ndarray
        1-D array of dtype ``object`` with one Python ``list`` of tokens per
        input document, in input order.
    """
    # NOTE(review): pymorphy2 is a Russian/Ukrainian morphological analyser, so
    # for the Latin-only tokens produced below it probably returns each token
    # unchanged. WordNetLemmatizer is imported at the top of the notebook but
    # never used -- confirm which lemmatiser was intended before relying on
    # this step.
    morph = pymorphy2.MorphAnalyzer()
    # A set gives O(1) membership tests; the stop-word list is fixed per call.
    stop_set = frozenset(stopwords.words('english'))
    # Compiled once per call instead of once per document.
    word_re = re.compile("[A-Za-z]+")

    def tokenize(text):
        # Best-effort: anything that is not text yields no tokens.
        if not isinstance(text, str):
            return []
        return word_re.findall(text.lower())

    # The cache lives only for this call, so it is bounded by the vocabulary
    # of `texts`; a small fixed size would thrash on any realistic corpus.
    @functools.lru_cache(maxsize=None)
    def lemmatize_word(token):
        return morph.parse(token)[0].normal_form

    def clean(text):
        lemmas = (lemmatize_word(token) for token in tokenize(text))
        return [lemma for lemma in lemmas if lemma not in stop_set]

    docs = [clean(sample) for sample in texts]

    # Fill an object array element by element. np.array(docs) on ragged lists
    # warns on older NumPy and raises ValueError on NumPy >= 1.24, and it would
    # silently build a 2-D string array whenever all documents happen to have
    # the same number of tokens.
    preprocessed_texts = np.empty(len(docs), dtype=object)
    for index, doc in enumerate(docs):
        preprocessed_texts[index] = doc
    return preprocessed_texts

In [24]:
# Build token lists and label vectors for each split. The labels of a split
# must come from that same split: the earlier version assigned train['score']
# to y_val and y_test, which would mis-align any metric computed from them.
X_train = preprocess(list(train['text']))
y_train = train['score']
X_val = preprocess(list(val['text']))
y_val = val['score']
X_test = preprocess(list(test['text']))
y_test = test['score']

# Documents are already lists of tokens, so both the preprocessor and the
# tokenizer are identity functions. The vocabulary and IDF weights are fitted
# on the training split only and then reused for val / test.
vectorizer = TfidfVectorizer(preprocessor=lambda x: x, tokenizer=lambda x: x)
train_tf_idf = vectorizer.fit_transform(X_train)
val_tf_idf = vectorizer.transform(X_val)
test_tf_idf = vectorizer.transform(X_test)

model = LogisticRegression(max_iter=1000)
model.fit(train_tf_idf, y_train)
val_preds = model.predict(val_tf_idf)
test_preds = model.predict(test_tf_idf)

print("TF-IDF + LogReg baseline")
print(f"Val: F1 {f1_score(y_val, val_preds, average='macro')}, Accuracy {accuracy_score(y_val, val_preds)}")
print(f"Test: F1 {f1_score(y_test, test_preds, average='macro')}, Accuracy {accuracy_score(y_test, test_preds)}")

  return np.array(preprocessed_texts)


TF-IDF + LogReg baseline
Val: F1 0.4838583553503856, Accuracy 0.7739916550764951
Test: F1 0.42707021296056796, Accuracy 0.7627919911012235
