In [3]:
import pandas as pd
import numpy as np
from openai import OpenAI

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score

from sklearn.svm import SVC

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv1D, MaxPooling1D, Flatten

import lightgbm as lgb

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
client = OpenAI(
  api_key='API Key',  # this is also the default, it can be omitted
)

data = pd.read_excel('/content/drive/MyDrive/grad/ai-system/data/에타 1차 라벨링.xlsx')
texts = data['merged'].tolist()
labels = data['label'].tolist()

In [7]:
def get_embedding(text, model="text-embedding-ada-002"):
    if not isinstance(text, str):
        text = str(text)
    text = text.replace('\n', ' ')
    return client.embeddings.create(input=[text], model=model).data[0].embedding


embeddings = [get_embedding(text) for text in texts]

In [8]:
embeddings_array = np.array(embeddings)
np.savetxt('/content/drive/MyDrive/grad/ai-system/data/test.csv', embeddings_array, delimiter=',')
X_train, X_test, y_train, y_test = train_test_split(embeddings, labels, test_size=0.2, random_state=42)

## LightGBM

In [14]:
X_train_lgb = np.array(X_train)
X_test_lgb = np.array(X_test)

d_train = lgb.Dataset(X_train_lgb, label=y_train)
d_test = lgb.Dataset(X_test_lgb, label=y_test, reference=d_train)

params = {
    'objective': 'binary',
    'metric': 'binary_logloss',
    'boosting_type': 'gbdt',
    'learning_rate': 0.0001,
    'num_leaves': 31,
    'min_child_samples': 20,
    'colsample_bytree': 0.7,
    'min_child_weight': 0.001,
    'subsample_for_bin': 200000,
    'reg_alpha': 0.1,
    'reg_lambda': 0.1
}


In [15]:

lgb_model = lgb.train(
    params,
    d_train,
    valid_sets=[d_train, d_test],
    num_boost_round=500,
    )


y_pred_lgb = lgb_model.predict(X_test, num_iteration=lgb_model.best_iteration)
y_pred_lgb = (y_pred_lgb >= 0.5).astype(int)
accuracy_lgb = accuracy_score(y_test, y_pred_lgb)
f1_macro_lgb = f1_score(y_test, y_pred_lgb, average='macro')
f1_micro_lgb = f1_score(y_test, y_pred_lgb, average='micro')

print(f'LightGBM Accuracy: {accuracy_lgb}')
print(f'LightGBM F1 score (macro): {f1_macro_lgb}')
print(f'LightGBM F1 score (micro): {f1_micro_lgb}')

[LightGBM] [Info] Number of positive: 4118, number of negative: 5810
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.894661 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 391680
[LightGBM] [Info] Number of data points in the train set: 9928, number of used features: 1536
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.414786 -> initscore=-0.344213
[LightGBM] [Info] Start training from score -0.344213
LightGBM Accuracy: 0.5932339911397503
LightGBM F1 score (macro): 0.3723458038422649
LightGBM F1 score (micro): 0.5932339911397503


## SVM

In [None]:
clf = SVC()
clf.fit(X_train, y_train)
y_pred_svc = clf.predict(X_test)
accuracy_svc = accuracy_score(y_test, y_pred_svc)
f1_macro_svc = f1_score(y_test, y_pred_svc, average='macro')
f1_micro_svc = f1_score(y_test, y_pred_svc, average='micro')

print(f'SVC Accuracy: {accuracy_svc}')
print(f'SVC F1 score (macro): {f1_macro_svc}')
print(f'SVC F1 score (micro): {f1_micro_svc}')

Accuracy: 0.7885622231171969
F1 score (macro): 0.7672098125195879
F1 score (micro): 0.7885622231171969
