# BERT: Turkish store-review sentiment classification with BERT embeddings and an MLP classifier

In [None]:
! pip install transformers



In [None]:
import torch
from transformers import AutoModel, AutoTokenizer
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import pandas as pd

In [None]:
# The tokenizer and the encoder are loaded from the same pretrained checkpoint,
# so the checkpoint id is spelled out only once.
MODEL_NAME = "dbmdz/bert-base-turkish-cased"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
bert = AutoModel.from_pretrained(MODEL_NAME)

In [None]:
# Read the UTF-16 encoded CSV and discard every row that contains a missing value.
df = pd.read_csv(
    '/content/drive/MyDrive/Miuul-NLP/magaza_yorumlari_duygu_analizi.csv',
    encoding="utf-16",
).dropna()

In [None]:
# Preview the first rows to check that the file was parsed as expected.
df.head()

Unnamed: 0,Görüş,Durum
0,"ses kalitesi ve ergonomisi rezalet, sony olduğ...",Olumsuz
1,hizli teslimat tesekkürler,Tarafsız
2,ses olayı süper....gece çalıştır sıkıntı yok.....,Olumlu
3,geldi bigün kullandık hemen bozoldu hiçtavsiye...,Olumsuz
4,Kulaklığın sesi kaliteli falan değil. Aleti öv...,Olumsuz


In [None]:
def get_embeddings(text):
  """Embed a single text with BERT and return the first token's vector.

  Uses the notebook-level ``tokenizer`` and ``bert`` objects.

  Args:
    text: The text to embed. It is truncated to at most 512 tokens. Only the
      first sequence of the batch is returned, so pass one string per call.

  Returns:
    A list with the last-layer hidden state of the first token of the first
    sequence (presumably the [CLS] token for a BERT tokenizer), one value per
    hidden dimension.
  """
  # Tokenize the text into PyTorch tensors
  tokens = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=512)

  # Inputs must live on the same device as the model; without this the forward
  # pass fails as soon as `bert` has been moved to a GPU. No-op on CPU.
  tokens = tokens.to(bert.device)

  # Inference only: no gradients are needed
  with torch.no_grad():
      outputs = bert(**tokens)

  # Per-token hidden states of the last layer, indexed below as [sequence][token]
  embeddings = outputs.last_hidden_state

  # Bring the vector back to the CPU before converting it to a plain list
  return list(embeddings[0][0].cpu().numpy())

In [None]:
# Smoke-test the embedding function on one short sentence.
get_embeddings("Harika bir gün!")

[0.94014996,
 -0.15150273,
 -0.11292617,
 -0.500578,
 1.8512871,
 0.9194206,
 0.24045298,
 0.58969253,
 0.2364179,
 0.21101466,
 -0.099310435,
 -1.630967,
 -0.7358298,
 0.47735637,
 -1.5192066,
 -0.021660913,
 0.24132323,
 1.3695418,
 -1.7254006,
 0.50128084,
 -0.15161066,
 0.37898135,
 -1.0392346,
 1.3495327,
 -0.5798781,
 -1.7850611,
 -1.9805862,
 -1.0556116,
 0.68036264,
 1.716354,
 0.2072346,
 1.136086,
 0.07009561,
 -0.5709068,
 0.11128219,
 -0.71428174,
 0.53104484,
 -1.172731,
 -0.6565185,
 0.6519561,
 -0.32488015,
 -0.07250288,
 0.49978864,
 -2.1062849,
 -0.36744374,
 0.21123058,
 0.5081913,
 -1.5649892,
 -0.9894364,
 -1.4741789,
 1.0882179,
 -0.78842413,
 2.027399,
 -0.92301583,
 -0.5602822,
 0.0910885,
 -2.0187075,
 -1.0279602,
 -0.6576938,
 2.2609138,
 1.6792718,
 -0.060546566,
 -0.20890845,
 -0.28595126,
 -0.38888207,
 0.9384547,
 0.76370263,
 -1.647727,
 0.503553,
 1.5720172,
 0.38632405,
 -1.2101849,
 0.82727915,
 -0.35365245,
 -1.9714226,
 0.6920349,
 0.17050055,
 -1.565

In [None]:
# Column 0 is used as the model input and column 1 as the target label.
X, y = df.iloc[:, 0], df.iloc[:, 1]

# Hold out a third of the rows for evaluation; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [None]:
# Inspect the training labels produced by the split.
y_train

8977       Olumlu
2054     Tarafsız
11001     Olumsuz
10532     Olumsuz
2362      Olumsuz
           ...   
11287     Olumsuz
5192       Olumlu
5391       Olumlu
860       Olumsuz
7273       Olumlu
Name: Durum, Length: 7655, dtype: object

In [None]:
# Label -> integer class id; the position in the tuple is the id.
mapping = {label: class_id for class_id, label in enumerate(('Olumsuz', 'Tarafsız', 'Olumlu'))}

In [None]:
# Walk the training texts and labels in lockstep: embed each text and translate
# its label to the integer class id from `mapping`.
_train_pairs = [
  (get_embeddings(review), mapping[sentiment])
  for review, sentiment in zip(X_train, y_train)
]

X_train_final = [pair[0] for pair in _train_pairs]
y_train_final = [pair[1] for pair in _train_pairs]

In [None]:
# Walk the held-out texts and labels in lockstep: embed each text and translate
# its label to the integer class id from `mapping`.
_test_pairs = [
  (get_embeddings(review), mapping[sentiment])
  for review, sentiment in zip(X_test, y_test)
]

X_test_final = [pair[0] for pair in _test_pairs]
y_test_final = [pair[1] for pair in _test_pairs]

In [None]:
# Train an MLP on the precomputed BERT embeddings and evaluate it on the held-out set.
# `random_state` pins the weight initialisation, the batch shuffling and the
# early-stopping validation split, so re-running the notebook reproduces the same
# model (the train/test split is seeded with 42 as well).
model = MLPClassifier(hidden_layer_sizes=(128, 64), max_iter=1000, activation='tanh', solver='adam', alpha=1e-5, learning_rate='constant',
                      verbose=1, early_stopping=True, random_state=42)
model.fit(X_train_final, y_train_final)

# Per-class precision / recall / F1 on the test embeddings (classes are the integer ids).
y_true, y_pred = y_test_final, model.predict(X_test_final)
print(classification_report(y_true, y_pred))

Iteration 1, loss = 1.06351458
Validation score: 0.539164
Iteration 2, loss = 0.89620879
Validation score: 0.613577
Iteration 3, loss = 0.83275517
Validation score: 0.659269
Iteration 4, loss = 0.78448619
Validation score: 0.627937
Iteration 5, loss = 0.76050133
Validation score: 0.678851
Iteration 6, loss = 0.74235810
Validation score: 0.671018
Iteration 7, loss = 0.72626629
Validation score: 0.654047
Iteration 8, loss = 0.72044637
Validation score: 0.701044
Iteration 9, loss = 0.69358593
Validation score: 0.655352
Iteration 10, loss = 0.68569588
Validation score: 0.686684
Iteration 11, loss = 0.71221940
Validation score: 0.695822
Iteration 12, loss = 0.68943563
Validation score: 0.691906
Iteration 13, loss = 0.65847517
Validation score: 0.699739
Iteration 14, loss = 0.67266415
Validation score: 0.691906
Iteration 15, loss = 0.64907558
Validation score: 0.682768
Iteration 16, loss = 0.64510272
Validation score: 0.703655
Iteration 17, loss = 0.63225938
Validation score: 0.707572
Iterat

In [None]:
# Sanity check: length of one training embedding vector.
len(X_train_final[0])

768

In [None]:
# Sanity check: length of one test embedding vector.
len(X_test_final[0])

768

In [None]:
# Classify one hand-written review; the result is an array holding its integer class id.
sample_review = "Puan kırmamın sebebi trendyol express kuryelerinin sorumsuzluğu yoksa ürün sorunsuz"
sample_embedding = get_embeddings(sample_review)
model.predict([sample_embedding])

array([1])