In [51]:
from sentence_transformers import SentenceTransformer

import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from nltk.tokenize import word_tokenize
import re
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
import seaborn as sns
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.manifold import TSNE
import plotly.express as px
import numpy as np
import nltk
import jieba
from arabic_reshaper import reshape
import re

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout



based on : https://pypi.org/project/sentence-transformers/

In [52]:
model = SentenceTransformer("all-MiniLM-L6-v2")

In [53]:
df = pd.read_csv('data/test_labeled.csv') 

sentences = pd.DataFrame(df)

sentences = sentences.drop_duplicates()

In [54]:
def preprocess_text(text):
    text = text.lower() 
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'@\w+', '', text) 
    text = re.sub(r'[^a-zA-Z\s]', '', text)  
    return text


sentences['cleaned_premise'] = sentences['premise'].apply(preprocess_text)
sentences['cleaned_hypothesis'] = sentences['hypothesis'].apply(preprocess_text)
print(sentences['cleaned_premise'])

0                                                        
1                                                        
2       et cela est en grande partie d au fait que les...
3                                                  imaamp
4                                                        
                              ...                        
5190                                                     
5191    the  rock  has a soft texture and can be bough...
5192                                                     
5193    isnt it i can remember ive only been here eigh...
5194    in hong kong you can have a plate or even a wh...
Name: cleaned_premise, Length: 5195, dtype: object


In [55]:
embeddings_premises = model.encode(sentences['cleaned_premise'].tolist(), show_progress_bar=True)
embeddings_hypotheses = model.encode(sentences['cleaned_hypothesis'].tolist(), show_progress_bar=True)


Batches: 100%|██████████| 163/163 [00:20<00:00,  7.80it/s]
Batches: 100%|██████████| 163/163 [00:12<00:00, 12.68it/s]


In [56]:
print(embeddings_premises.shape)
print(embeddings_hypotheses.shape)

(5195, 384)
(5195, 384)


In [57]:
X = np.hstack((np.array(embeddings_premises) ,np.array(embeddings_hypotheses)))

y = sentences['label'].values

print(X.shape)
print(y.shape)

(5195, 768)
(5195,)


In [58]:
indices = np.arange(len(X))

X_train, X_test, y_train, y_test, train_indices, test_indices = train_test_split(
    X, y, indices, test_size=0.2, random_state=42
)

clf =  RandomForestClassifier(n_estimators=200, random_state=42)
clf.fit(X_train, y_train)

In [59]:
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(4156, 768)
(4156,)
(1039, 768)
(1039,)


In [60]:
y_pred = clf.predict(X_test)

premise_predicted = sentences.iloc[test_indices]['premise']
label_real = sentences.iloc[test_indices]['label']
hypothesis_predicted = sentences.iloc[test_indices]['hypothesis']

for premise,hypothesis, label_real, label_pred in zip(premise_predicted.head(30),hypothesis_predicted.head(30),label_real.head(30), y_pred[:30]):
    print(f"Premise: {premise}\nHypothesis: {hypothesis}\nReal label: {label_real}\nPredicted label: {label_pred}\nProximity: {clf.predict_proba([X_test[0]])}\n")   
    
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")

Premise: LSC set a deadline of October 1, 1998, for submission of state planning reports.
Hypothesis: LSC has a deadline of October 1,1998 to submit state planning reports.
Real label: 0
Predicted label: 2
Proximity: [[0.31 0.22 0.47]]

Premise: Turizm ofisleri L'Estrie bölgesini yeniden adlandırmaya çalıştılar ancak en militan Quebecli bile Cantons de l'Est'in doğrudan, daha yaklaşık çevirisini tercih eder.
Hypothesis: Turizmciler eskisi kulağa çirkin geldiğinden, bölgenin daha iyi bir isme ihtiyacı olduğunu düşünüyor.
Real label: 1
Predicted label: 0
Proximity: [[0.31 0.22 0.47]]

Premise: Given the limits on the WTO's jurisdiction, it was probably unreasonable of Kodak to expect a real victory.
Hypothesis: Kodak was naive and is still just a baby of a company.
Real label: 1
Predicted label: 2
Proximity: [[0.31 0.22 0.47]]

Premise: uh-huh yeah yeah they're good
Hypothesis: They are all right but not great.
Real label: 1
Predicted label: 0
Proximity: [[0.31 0.22 0.47]]

Premise: À sa

In [61]:
num_classes = 3
input_dim = X.shape[1]

In [62]:
model = Sequential([
    Dense(512, activation='relu', input_shape=(input_dim,)),
    Dropout(0.3),
    Dense(256, activation='relu'),
    Dropout(0.3),
    Dense(128, activation='relu'),
    Dropout(0.2),
    Dense(num_classes, activation='softmax')  # Pour classification multi-classes
])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [None]:
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2)


Epoch 1/10


ValueError: Arguments `target` and `output` must have the same rank (ndim). Received: target.shape=(None,), output.shape=(None, 3)

In [46]:
y_pred_classes = model.predict(X_test)
print(y_pred_classes.shape)
y_pred = np.argmax(y_pred, axis=1)


[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
(1039, 3)


In [47]:
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")

Accuracy: 37.44%
