# Laboratory work #4 (text classification)

In [59]:
import os
import time

import pandas as pd
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix

In [60]:
test_vectors_read = {}

with open('../assets/annotated-corpus/test-embeddings.tsv', 'r') as file:
    for line in file:
        parts = line.strip().split('\t')
        doc_id = parts[0]
        vector = list(map(float, parts[1:]))
        test_vectors_read[doc_id] = vector
        
test_embeddings = pd.DataFrame(test_vectors_read).T

In [61]:
test_embeddings

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,20,21,22,23,24,25,26,27,28,29
653,0.495289,-0.266323,-0.091453,0.152674,-0.808189,0.114368,1.049563,-0.023718,-0.837850,0.254162,...,0.174727,0.102926,0.880598,0.943732,-1.020145,0.770644,1.002130,-1.112512,0.108592,-0.315284
3125,0.370359,-1.005096,-0.026356,0.695353,-0.409997,-0.006719,0.692772,-0.280020,0.062000,-0.083504,...,-0.197315,0.599050,0.503911,1.238942,-1.267221,0.835303,1.312776,-0.847802,-0.173998,0.046429
12147,0.270967,-0.256214,0.257975,-0.559627,-0.339233,-0.213869,0.840644,-0.105986,-0.872054,0.732302,...,-0.171937,0.817125,0.853389,0.684906,-0.050850,0.702079,0.699235,-0.741730,0.443037,0.276468
24424,0.044118,-0.789112,-0.042332,-0.323560,-0.340616,0.088793,0.916729,0.324803,-0.526202,0.305620,...,-0.500705,0.893816,1.081470,0.702996,0.077735,0.170062,-0.108873,-0.849431,0.148822,0.082740
32978,0.118076,-0.625626,0.043046,0.386073,-0.200705,-0.092968,0.816092,-0.544482,-0.744841,0.326005,...,0.415041,-0.306861,-0.231987,1.493821,-1.085838,0.705552,0.600762,-1.260846,0.624956,-0.053267
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31602,0.083994,-0.276681,0.996645,0.191620,-0.071115,1.860284,2.320704,0.449779,1.025702,-1.907649,...,0.258293,1.323957,1.182001,0.955091,-0.388498,0.492177,0.595280,-0.372122,1.595431,1.205287
34268,0.608600,-1.246262,0.320978,0.613984,-0.113287,0.285672,1.587831,-0.518594,-0.747800,0.053591,...,0.246904,0.005980,-0.147150,1.032964,-1.301572,0.793051,0.160851,-1.285125,0.381330,-0.261942
15928,0.085141,-0.670452,-0.270584,-0.090172,-0.740661,-0.138307,1.053489,-0.173269,-0.921016,-1.000845,...,0.403759,0.381887,0.179555,0.704575,-1.847285,0.028619,0.574772,-2.065621,0.189183,-1.588602
6949,-0.440735,-0.260213,0.039870,0.119475,0.101958,0.319261,0.156243,-0.201064,-1.185301,-0.254385,...,0.260244,-0.074957,-0.351352,0.468061,-0.745169,0.609217,0.687820,-1.175745,0.232904,0.089557


In [62]:
data = []

for root, dirs, files in os.walk('../assets/annotated-corpus'):
    for file in files:
        if file.endswith('.tsv'):
            parts = root.split(os.sep)
            if len(parts) >= 2:
                train_test_val = parts[-2]  # train/test/val part
                fake_true = parts[-1]       # fake/true class
                document_index = file.split('.')[0]  # document index

                if train_test_val == 'assets':
                    continue
                data.append([document_index, train_test_val, fake_true])

In [63]:
df = pd.DataFrame(data, columns=['document_index', 'part', 'class'])
df.set_index('document_index', inplace=True)

In [64]:
df.head()

Unnamed: 0_level_0,part,class
document_index,Unnamed: 1_level_1,Unnamed: 2_level_1
19831,val,fake
34091,val,fake
24973,val,fake
18880,val,fake
32320,val,fake


In [65]:
test_embeddings = test_embeddings.merge(df, left_index=True, right_index=True, how='left')

In [66]:
test_embeddings.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,22,23,24,25,26,27,28,29,part,class
653,0.495289,-0.266323,-0.091453,0.152674,-0.808189,0.114368,1.049563,-0.023718,-0.83785,0.254162,...,0.880598,0.943732,-1.020145,0.770644,1.00213,-1.112512,0.108592,-0.315284,test,fake
3125,0.370359,-1.005096,-0.026356,0.695353,-0.409997,-0.006719,0.692772,-0.28002,0.062,-0.083504,...,0.503911,1.238942,-1.267221,0.835303,1.312776,-0.847802,-0.173998,0.046429,test,fake
12147,0.270967,-0.256214,0.257975,-0.559627,-0.339233,-0.213869,0.840644,-0.105986,-0.872054,0.732302,...,0.853389,0.684906,-0.05085,0.702079,0.699235,-0.74173,0.443037,0.276468,test,fake
24424,0.044118,-0.789112,-0.042332,-0.32356,-0.340616,0.088793,0.916729,0.324803,-0.526202,0.30562,...,1.08147,0.702996,0.077735,0.170062,-0.108873,-0.849431,0.148822,0.08274,test,fake
32978,0.118076,-0.625626,0.043046,0.386073,-0.200705,-0.092968,0.816092,-0.544482,-0.744841,0.326005,...,-0.231987,1.493821,-1.085838,0.705552,0.600762,-1.260846,0.624956,-0.053267,test,fake


In [67]:
X = test_embeddings.drop(['part', 'class'], axis=1)
y = test_embeddings['class']

In [68]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


def calculate_metrics(y_true, y_pred):
    cm = confusion_matrix(y_true, y_pred)
    TP = np.diag(cm)
    FP = cm.sum(axis=0) - TP
    FN = cm.sum(axis=1) - TP
    TN = cm.sum() - (FP + FN + TP)

    precision = TP / (TP + FP)
    recall = TP / (TP + FN)
    f1_score = 2 * (precision * recall) / (precision + recall)
    accuracy = (TP + TN) / (TP + FP + FN + TN)

    return np.nanmean(precision), np.nanmean(recall), np.nanmean(f1_score), np.nanmean(accuracy)

In [75]:
kernels = [
    'linear', 
    'poly', 
    'rbf', 
    'sigmoid'
]
results = []

for kernel in kernels:
    model = SVC(kernel=kernel)
    start_time = time.time()
    model.fit(X_train, y_train)
    training_time = time.time() - start_time

    y_pred = model.predict(X_test)
    precision, recall, f1_score, accuracy = calculate_metrics(y_test, y_pred)

    results.append({
        'kernel': kernel,
        'precision': precision,
        'recall': recall,
        'f1_score': f1_score,
        'accuracy': accuracy,
        'training_time': training_time
    })

In [76]:
results_df = pd.DataFrame(results)
results_df

Unnamed: 0,kernel,precision,recall,f1_score,accuracy,training_time
0,linear,0.932228,0.930605,0.931226,0.931525,0.073113
1,poly,0.940543,0.93811,0.938968,0.939276,0.069652
2,rbf,0.933012,0.930203,0.931151,0.931525,0.08422
3,sigmoid,0.848154,0.846066,0.846743,0.847545,0.119372


Best kernel: poly, because it is accurate and comparable fast.

In [101]:
def no_transform(x):
    return x

def add_transformed_features(X, func):
    if func is not no_transform:
        transformed_X = np.apply_along_axis(func, 1, X)
        transformed_X = np.nan_to_num(transformed_X, nan=0.0, posinf=0.0, neginf=0.0)
        return np.concatenate((X, transformed_X), axis=1)
    else:
        return X

def safe_sqrt(x):
    return np.sqrt(np.abs(x))

transformations = [no_transform, safe_sqrt, np.abs, np.log1p, np.cos, np.sin]
results = []

X_train_transformed = X_train.copy()
X_test_transformed = X_test.copy()

for transform in transformations:
    X_train_transformed = add_transformed_features(X_train_transformed, transform)
    X_test_transformed = add_transformed_features(X_test_transformed, transform)

    model = SVC(kernel='poly')
    model.fit(X_train_transformed, y_train)

    y_pred = model.predict(X_test_transformed)
    precision, recall, f1_score, accuracy = calculate_metrics(y_test, y_pred)

    results.append({
        'transformation': transform.__name__,
        'precision': precision,
        'recall': recall,
        'f1_score': f1_score,
        'accuracy': accuracy
    })

  res = asanyarray(func1d(inarr_view[ind0], *args, **kwargs))
  buff[ind] = asanyarray(func1d(inarr_view[ind], *args, **kwargs))


In [102]:
X_test_transformed.shape

(774, 960)

In [103]:
results_df = pd.DataFrame(results)
print(results_df)

  transformation  precision    recall  f1_score  accuracy
0   no_transform   0.940543  0.938110  0.938968  0.939276
1      safe_sqrt   0.941708  0.939472  0.940278  0.940568
2       absolute   0.938228  0.935385  0.936347  0.936693
3          log1p   0.939383  0.936747  0.937658  0.937984
4            cos   0.933654  0.929935  0.931097  0.931525
5            sin   0.940014  0.936479  0.937610  0.937984


Adding safe_sqrt is helpful, but other features are useless.