In [1]:
import pandas as pd

train_df = pd.read_csv('./train.csv')
test_df = pd.read_csv('./test_features.csv')

In [2]:
train_df.head()

Unnamed: 0,ID,Text,Category
0,969,@JuliaBradbury @SimonCalder @walsop @HodderPRI...,0
1,241,or here https://t.co/R2tO79Easn … .An in house...,1
2,820,@britshmuseum @thehistoryguy Gosh periscope is...,2
3,693,@Ophiolatrist britishmuseum The stupid #French...,1
4,421,@SassyClde We won't stop til @britishmuseum du...,1


In [3]:
train_df = train_df.drop(columns=['ID'])

In [4]:
train_df.isna().sum(), train_df.shape

(Text        0
 Category    0
 dtype: int64,
 (1600, 2))

### Statistics and analysis

In [5]:
sentiment_distribution = train_df["Category"].value_counts()
sentiment_distribution

Category
1    409
3    404
0    394
2    393
Name: count, dtype: int64

### Embeddings generation and data processing

In [6]:
from nltk.corpus import stopwords
import re
import string

stop_words = set(stopwords.words("english"))

def preprocess_sentence(sentence: str):
    lowered_sentence = sentence.lower()
    modified_sentence = re.sub(r'<.*?>|[^\x00-\x7f]', '', lowered_sentence)
    modified_sentence = re.sub(r'http\S+|www\S+', '', modified_sentence)
    modified_sentence = re.sub(r'\d+', '', modified_sentence)
    modified_sentence = re.sub(f"[{re.escape(string.punctuation)}]", " ", modified_sentence)
    stop_words = set(stopwords.words("english"))
    words = [word for word in modified_sentence.split() if word not in stop_words]
    preprocessed_text = ' '.join(words)

    return preprocessed_text

In [7]:
from sentence_transformers import SentenceTransformer
import numpy as np
import os

def generate_embeddings(x, reset: bool, save: bool, path: str):
    os.makedirs("./models_training/", exist_ok=True)

    # load precomputed embeddings if reset is False
    if not reset:
        try:
            loaded_embeddings = np.load(f"./models_training/{path}.npy")
            return loaded_embeddings
        except FileNotFoundError:
            raise ValueError(f"No file found at './models_training/{path}.npy'. Set `reset=True` to generate embeddings.")
    
    model = SentenceTransformer(
        "jinaai/jina-embeddings-v3",
        trust_remote_code=True
    )

    # generate embeddings
    embeddings = model.encode(x.tolist(), task="classification")

    # save embeddings if specified
    if save:
        np.save(f"./models_training/{path}.npy", embeddings)

    return embeddings

x = train_df["Text"]

# generate embeddings
x_training = generate_embeddings(
    x,
    reset=False,
    save=False,
    path="embeddings_train_set"
)

  from .autonotebook import tqdm as notebook_tqdm
2024-12-30 10:02:31.145565: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1735549351.193024    6216 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1735549351.207311    6216 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-12-30 10:02:31.336221: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [8]:
x_training_df = pd.DataFrame(x_training)
x_training_df.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1014,1015,1016,1017,1018,1019,1020,1021,1022,1023
0,-0.007179,-0.060897,0.02228,0.043634,0.01895,-0.041177,-0.065404,0.085855,0.036878,-0.025925,...,-0.009631,0.025361,0.013979,-0.023528,-0.018853,-0.023757,-0.01388,0.01339,-0.021662,0.028247
1,0.075715,-0.086631,0.059901,0.042232,0.025115,-0.037768,-0.07426,0.09904,-0.01042,-0.023982,...,-0.005677,0.013244,0.007074,-0.021455,0.011347,-0.027382,-0.003265,0.005431,-0.037899,0.005365
2,0.080219,-0.125775,0.026173,0.056069,0.118006,-0.054869,-0.103726,0.068309,-0.004521,0.030138,...,-0.030087,0.047487,-0.01305,-0.017473,-0.019342,-0.02786,-0.010077,0.02938,-0.016886,0.024584
3,0.040978,-0.069892,0.062848,0.022745,0.076142,-0.048645,0.012866,0.095084,-0.030477,0.037633,...,-0.053788,0.048569,0.011122,0.002129,-0.008482,-0.021281,-0.030619,0.010108,-0.017359,-0.015522
4,0.073722,-0.094373,0.033481,0.008013,0.079975,-0.019963,-0.064617,0.083038,0.020022,0.037984,...,-0.020758,0.039884,-0.012107,-0.029328,0.001965,-0.019781,-0.017598,0.020782,-0.039647,0.022332


In [9]:
y_training_df = train_df["Category"]
y_training_df.head()

0    0
1    1
2    2
3    1
4    1
Name: Category, dtype: int64

In [10]:
from sklearn.model_selection import train_test_split

# splitting our training dataset into 80% training and 20% validation.
x_train, x_valid, y_train, y_valid = train_test_split(
    x_training_df,
    y_training_df,
    random_state=42,
    test_size=0.2    
)

In [11]:
from xgboost import XGBClassifier

xgb = XGBClassifier(
    n_estimators=150, # number of trees
    eta=0.01, # learning rate
    max_depth=5, # max depth of a tree
    min_child_weight=5, # minimum sum of weights of observations required in a child
    gamma=3, # minimum loss reduction required to split
    scoring="accuracy"
)

# model fitting
xgb.fit(x_train, y_train)

Parameters: { "scoring" } are not used.



In [12]:
from sklearn.svm import SVC

svm = SVC(
    C=10.0,
    kernel="rbf",
    gamma="scale",
    class_weight="balanced"
)

svm.fit(x_train, y_train)

In [13]:
from tensorflow import keras
from tensorflow.keras import Sequential
from tensorflow.keras import layers

number_of_classes = len(set(y_train))

nn = Sequential([
    layers.BatchNormalization(input_shape=[x_train.shape[1]]),
    layers.Dense(512, activation="relu"),
    layers.Dropout(0.3),
    layers.BatchNormalization(),
    layers.Dense(128, activation="relu"),
    layers.Dropout(0.3),
    layers.Dense(number_of_classes, activation="softmax")  # softmax for multiclass
])

nn.compile(
    optimizer="sgd",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"]  
)

EPOCHS=100

history = nn.fit(
    x_train, y_train,
    validation_data=(x_valid, y_valid),
    batch_size=64,
    epochs=EPOCHS,
    verbose=0
)

  super().__init__(**kwargs)
2024-12-30 10:02:48.106653: E external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:152] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)


In [14]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification


def predict_sentiment(texts):
    model_name = "tabularisai/multilingual-sentiment-analysis"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name)
    inputs = tokenizer(texts, return_tensors="pt", truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)
    return probabilities

In [15]:
y_pred_training_msa_raw = predict_sentiment(test_df["Text"].to_list())
y_pred_training_msa_raw

tensor([[0.0718, 0.0520, 0.1034, 0.1303, 0.6425],
        [0.4577, 0.1865, 0.1059, 0.0817, 0.1683],
        [0.1145, 0.1283, 0.2568, 0.1637, 0.3368],
        ...,
        [0.0631, 0.0727, 0.1545, 0.1706, 0.5392],
        [0.1094, 0.3935, 0.4031, 0.0698, 0.0242],
        [0.1784, 0.2388, 0.2348, 0.1361, 0.2118]])

In [16]:
y_pred_training_xgb = xgb.predict(x_valid)
y_pred_training_svm = svm.predict(x_valid)
nn_results = nn.evaluate(x_valid, y_valid, verbose=0)

In [17]:
from sklearn.metrics import accuracy_score

xgb_accuracy = accuracy_score(y_valid, y_pred_training_xgb)
print("XGB ACCURACY", xgb_accuracy)

svm_accuracy = accuracy_score(y_valid, y_pred_training_svm)
print("SVM ACCURACY", svm_accuracy)

print(f"NN ACCURACY: {nn_results[1]}")

XGB ACCURACY 0.95625
SVM ACCURACY 0.99375
NN ACCURACY: 0.9906250238418579


### Testing on test set

In [18]:
x_test = test_df["Text"]
testing_embeddings = generate_embeddings(
    x_test,
    reset=False,
    save=False,
    path="embeddings_test_set"
)

In [19]:
x_testing_df = pd.DataFrame(testing_embeddings)
x_testing_df.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1014,1015,1016,1017,1018,1019,1020,1021,1022,1023
0,0.021862,-0.130099,-0.019668,0.039548,-0.009799,-0.08616,-0.070812,0.121424,0.041482,-0.026094,...,-0.02106,0.036752,0.016703,-0.002372,0.000245,0.000431,-0.033217,0.017796,-0.00962,0.016315
1,-0.000512,-0.129575,0.033669,0.006095,0.043594,-0.067595,-0.070295,0.026243,-0.023533,0.020531,...,-0.037877,0.024426,0.012127,-0.01725,-0.027054,-0.005332,-0.02612,-0.005944,-0.037988,0.007308
2,0.040873,-0.104724,0.013884,0.006526,-0.002321,-0.080198,-0.07082,0.087655,-0.010544,0.016962,...,-0.01601,0.02959,-0.006418,-0.029679,0.009974,-0.016881,-0.010459,0.018167,-0.039587,0.014713
3,0.009109,-0.129891,0.03785,0.010956,0.037078,-0.072843,-0.06738,0.025415,-0.019111,0.021206,...,-0.037702,0.023003,0.011671,-0.017107,-0.028684,-0.006375,-0.025086,-0.003011,-0.036313,0.007456
4,0.052182,-0.143613,0.069626,0.085411,0.122757,-0.030233,-0.08177,0.052891,0.036095,0.012811,...,-0.04649,0.04625,0.01211,-0.011752,0.011882,-0.044451,-0.026153,0.013751,-0.029231,0.015506


In [20]:
y_testing_svm = svm.predict(x_testing_df)

**Saving into submission file**

In [21]:
submission_df = pd.DataFrame({
    "ID": test_df["ID"],
    "Prediction": y_testing_svm
})

submission_df.to_csv("./submission.csv", index=False)
submission_df.head()

Unnamed: 0,ID,Prediction
0,1861,3
1,354,3
2,1334,1
3,906,3
4,1290,2
