In [1]:
import re
import sklearn
import pandas as pd
import numpy as np

import nltk 

from pathlib import Path

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import warnings
warnings.filterwarnings("ignore")

In [None]:
nltk.download('punkt')
nltk.download('stopwords')
#clean the data to apply baseline algorithms
stemmer = SnowballStemmer("german")
stop_words = set(stopwords.words("german"))


def clean_text(text):
    white_space = re.compile(r"\s+", re.IGNORECASE)
    tags = re.compile(r"<[^>]+>")
    ascii = re.compile(r"[^A-Za-zÀ-ž ]", re.IGNORECASE)
    single_char = re.compile(r"\b[A-Za-zÀ-ž]\b", re.IGNORECASE)

    text = re.sub(tags, " ", text)
    text = re.sub(ascii, " ", text)
    text = re.sub(single_char, " ", text)
    text = re.sub(white_space, " ", text)

    word_tokens = word_tokenize(text)
    words_tokens_lower = [word.lower() for word in word_tokens]


    words_filtered = [
            stemmer.stem(word) for word in words_tokens_lower if word not in stop_words
    ]

    text_clean = " ".join(words_filtered)
    return text_clean

In [3]:
train = pd.read_csv('../data/gold/train800.csv')
test = pd.read_csv('../data/gold/test200.csv')

In [4]:
train["comment_clean"] = train.loc[train["Comment"].str.len() > 10, "Comment"]
train["comment_clean"] = train["comment_clean"].map(
    lambda x: clean_text(x) if isinstance(x, str) else x
)

test["comment_clean"] = test.loc[test["Comment"].str.len() > 10, "Comment"]
test["comment_clean"] = test["comment_clean"].map(
    lambda x: clean_text(x) if isinstance(x, str) else x
)

In [None]:
#create td-if matrix
temp = pd.concat([train,test])
vectorizer = TfidfVectorizer(
    analyzer="word", max_df=0.3, min_df=10, ngram_range=(1, 2), norm="l2"
)
vectorizer.fit(temp["comment_clean"])

In [6]:
X_train = train["comment_clean"]
Y_train = train["Label"]
X_test = test["comment_clean"]
Y_test = test["Label"]
print(X_train.shape)
print(X_test.shape)
X_train_vec = vectorizer.transform(X_train)
X_test_vec = vectorizer.transform(X_test)
X_train_vec.get_shape()

(800,)
(200,)


(800, 454)

In [7]:
#initiate baseline classifiers
classifiers = [
    LogisticRegression(solver="sag", random_state=1),
    LinearSVC(random_state=1),
    RandomForestClassifier(random_state=1)
]
names = [re.match(r"[^\(]+", name.__str__())[0] for name in classifiers]
print(f"Classifiers to test: {names}")

Classifiers to test: ['LogisticRegression', 'LinearSVC', 'RandomForestClassifier']


In [8]:
#train and evaluate baseline models
results = {}
for name, clf in zip(names, classifiers):
    print(f"Training classifier: {name}")
    clf.fit(X_train_vec, Y_train)
    prediction = clf.predict(X_test_vec)
    report = sklearn.metrics.classification_report(Y_test, prediction)
    results[name] = report

Training classifier: LogisticRegression
Training classifier: LinearSVC
Training classifier: RandomForestClassifier


In [9]:
for k, v in results.items():
    print(f"Results for {k}:")
    print(f"{v}\n")

Results for LogisticRegression:
              precision    recall  f1-score   support

           0       0.80      0.99      0.88       151
           1       0.92      0.22      0.36        49

    accuracy                           0.81       200
   macro avg       0.86      0.61      0.62       200
weighted avg       0.83      0.81      0.76       200


Results for LinearSVC:
              precision    recall  f1-score   support

           0       0.80      0.87      0.84       151
           1       0.46      0.33      0.38        49

    accuracy                           0.74       200
   macro avg       0.63      0.60      0.61       200
weighted avg       0.72      0.74      0.72       200


Results for RandomForestClassifier:
              precision    recall  f1-score   support

           0       0.79      0.95      0.86       151
           1       0.58      0.22      0.32        49

    accuracy                           0.77       200
   macro avg       0.68      0.59  