In [1]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn import preprocessing
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,precision_score,recall_score


Removing unwanted columns and labeling artifacts

In [2]:
labeled_dataset=pd.read_csv('Actual set/actual_set_labeled.csv')
labeled_dataset_clean=labeled_dataset.drop(columns=['Unnamed: 0', 'annotation_id','annotator', 'created_at', 'id', 'updated_at', 'lead_time'], axis=1)
cols=['ling_element', 'target', 'sentiment']
labeled_dataset_clean[cols]=labeled_dataset_clean[cols].apply(
    lambda col: col.str.replace(
        r'\{"choices":\[(.*?)\]\}',
        lambda m: m.group(1).replace('"', ''), 
        regex=True
    )
)


Assigning to classes based on labels

In [3]:
labeled_dataset_clean["is_linguicism"]="No"
labeled_dataset_clean.loc[
    (labeled_dataset_clean["language"] == "yes") &
    (labeled_dataset_clean["target"].str.contains("DIRECT|MENTION", na=False)) & 
    (labeled_dataset_clean["ling_element"].str.contains("PHEN|WORD", na=False)) & 
    (~labeled_dataset_clean["ling_element"].str.contains("INVIS", na=False)) & 
    (labeled_dataset_clean["sentiment"] == "NEG") &
    (labeled_dataset_clean[["questions", "facts", "word_jokes", "other"]].isna().all(axis=1)),
    "is_linguicism"
] = "yes"

text=labeled_dataset_clean['full_text'].to_list()



combining categorical features with text

In [4]:

as_string = labeled_dataset_clean[['comm_aesthetic', 'correctness', 'country_patriotism', 'danger_decline',
       'dict_lit', 'emo_som', 'error_mention', 'facts',
       'language', 'ling_element', 'offensive', 'other', 'personal_quality',
       'questions', 'sarcasm', 'sentiment', 'target', 'threats',
       'visual_mocking', 'word_jokes']].apply(lambda row: ' '.join(row.values.astype(str)), axis=1).to_list()



cat_features=[]
for line in as_string:
       n=line.replace('nan',"")
       n=' '.join(n.split())
       cat_features.append(n)

cat_features

cat_text=list(zip(text, cat_features))


       

Encoding text with model

In [11]:
model = SentenceTransformer('sdadas/st-polish-paraphrase-from-distilroberta')
embeddings = model.encode(cat_text)

Preprocessing and training classifier

In [5]:
label_encoder=preprocessing.LabelEncoder()
labeled_dataset_clean['is_linguicism']= label_encoder.fit_transform(labeled_dataset_clean['is_linguicism'])


In [22]:
x_train,x_test,y_train,y_test=train_test_split(embeddings,np.asarray(labeled_dataset_clean['is_linguicism']),random_state=42)

In [23]:
print(set(y_test))

{0, 1}


In [27]:
model=SVC()
model.fit(x_train,y_train)
y_predicted = model.predict(x_test)
print(accuracy_score(y_predicted,y_test)*100)
print(precision_score(y_predicted,y_test))
print(recall_score(y_predicted,y_test,pos_label=1))

92.3076923076923
0.7115384615384616
0.925
