In [1]:
!pip install simpletransformers -qqq
!pip install scikit-learn -qqq

In [2]:
import pandas as pd

train_file = pd.read_csv('/kaggle/input/comments-from-tukrish-verified-customers/Trendyol_Sentiment/train.csv')
trendyol_neg = pd.read_csv('/kaggle/input/comments-from-tukrish-verified-customers/Trendyol_Sentiment/negative_comments.csv')
trendyol_pos = pd.read_csv('/kaggle/input/comments-from-tukrish-verified-customers/Trendyol_Sentiment/negative_comments.csv')

In [3]:
train_file.shape, trendyol_neg.shape, trendyol_pos.shape

((440679, 3), (33195, 2), (33195, 2))

In [4]:
train_file = train_file[train_file['dataset'] == 'urun_yorumlari']
train_file = train_file[['text','label']]
train_file['label'] = train_file['label'].apply(lambda x: str(x).lower())
train_file.shape

(210693, 2)

In [5]:
trendyol_neg.columns = ['text', 'labels']
trendyol_pos.columns = ['text', 'labels']
train_file.columns = ['text', 'labels']

In [6]:
data = pd.concat([train_file, trendyol_neg, trendyol_pos])
data.shape, data.columns

((277083, 2), Index(['text', 'labels'], dtype='object'))

In [7]:
print(f"Amount of Positive Comments in Dataset : {data[data['labels'] == 'positive'].shape[0]}")
print(f"Amount of Negative Comments in Dataset : {data[data['labels'] == 'negative'].shape[0]}")

Amount of Positive Comments in Dataset : 197319
Amount of Negative Comments in Dataset : 79764


In [8]:
def change_names(name):
    
    if name == 'positive':
        return 'pozitif'
    if name == 'negative':
        return 'negatif'

    
data['labels'] = data['labels'].apply(lambda x: change_names(x))

In [9]:
from sklearn.model_selection import train_test_split

train_data, test_data = train_test_split(data, test_size = 0.10)
train_data.shape, test_data.shape



((249374, 2), (27709, 2))

In [10]:
from simpletransformers.classification import ClassificationModel, ClassificationArgs
import os

train_args = ClassificationArgs()
train_args.num_train_epochs = 4
train_args.save_model_every_epoch = False
train_args.save_steps = 35000
train_args.save_best_model = True
train_args.fp16 = False
train_args.reprocess_input_data = True
train_args.use_multiprocessing = False
train_args.use_multiprocessing_for_evaluation = False
train_args.train_batch_size = 16
train_args.labels_list = ["pozitif","negatif"]

os.environ["TOKENIZERS_PARALLELISM"] = "false"

model = ClassificationModel("bert", "dbmdz/bert-base-turkish-cased", args = train_args, use_cuda = True)

Downloading config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/445M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dbmdz/bert-base-turkish-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading tokenizer_config.json:   0%|          | 0.00/60.0 [00:00<?, ?B/s]

Downloading vocab.txt:   0%|          | 0.00/251k [00:00<?, ?B/s]

In [11]:
model.train_model(train_data)

Epoch:   0%|          | 0/4 [00:00<?, ?it/s]

Running Epoch 0 of 4:   0%|          | 0/15586 [00:00<?, ?it/s]

Running Epoch 1 of 4:   0%|          | 0/15586 [00:00<?, ?it/s]

Running Epoch 2 of 4:   0%|          | 0/15586 [00:00<?, ?it/s]

Running Epoch 3 of 4:   0%|          | 0/15586 [00:00<?, ?it/s]

(62344, 0.08643729161067708)

In [12]:
from sklearn.metrics import f1_score, accuracy_score

def f1_multiclass(labels, preds):
    return f1_score(labels, preds, average='micro')

result, model_outputs, wrong_predictions = model.eval_model(test_data, f1 = f1_multiclass, acc = accuracy_score)
print(result)

Running Evaluation:   0%|          | 0/3464 [00:00<?, ?it/s]

{'mcc': 0.9297410013824274, 'tp': 7397, 'tn': 19523, 'fp': 286, 'fn': 503, 'auroc': 0.9891157899714424, 'auprc': 0.9837850088203036, 'f1': 0.9715254971308961, 'acc': 0.9715254971308961, 'eval_loss': 0.15391181983951685}


%97 Doğru Tahmin oranı harika.

In [27]:
def make_prediction(model):
    
    comment = input("Put your comment : ")
    prediction = model.predict([str(comment)])[0][0]
    return prediction

In [35]:
tuned_model = ClassificationModel("bert", "/kaggle/working/outputs", args = train_args, use_cuda = True)
# make_prediction(tuned_model)

In [36]:
import os
import tarfile

def pack_model(model_path='',file_name=''):
    files = [files for root, dirs, files in os.walk(model_path)][0]
    with tarfile.open(file_name+ '.tar.gz', 'w:gz') as f:
        for file in files:
            f.add(f'{model_path}/{file}')

pack_model('/kaggle/working/outputs','turkish_tuned_bert')