In [3]:
import re
import pickle
import numpy as np
import pandas as pd
import urllib.request
import string

import spacy

from textattack.augmentation import WordNetAugmenter,EmbeddingAugmenter,CLAREAugmenter

from sentence_transformers import losses, util
from sentence_transformers import LoggingHandler, SentenceTransformer, evaluation
from sentence_transformers.readers import InputExample

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import xgboost

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score , classification_report

Loading and Saving the spacy NLP model

In [8]:
nlp = spacy.load("en_core_web_sm")



In [65]:
nlp.to_disk('/content/spacy_nlp_model/')

Reading train data from drive

In [9]:
file_lines = []

In [10]:
for file_line in urllib.request.urlopen(r'content/drive/MyDrive/msr_paraphrase_train.txt'):
    file_lines.append(file_line.decode('utf-8').split('\t'))

In [11]:
len(file_lines)

4077

In [12]:
df = pd.DataFrame(file_lines[1:],columns=['is_paraphrase','sentence_1_id','sentence_2_id','sentence_1','sentence_2'])

In [13]:
df.shape

(4076, 5)

In [14]:
df.isnull().sum()

is_paraphrase    0
sentence_1_id    0
sentence_2_id    0
sentence_1       0
sentence_2       0
dtype: int64

In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4076 entries, 0 to 4075
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   is_paraphrase  4076 non-null   object
 1   sentence_1_id  4076 non-null   object
 2   sentence_2_id  4076 non-null   object
 3   sentence_1     4076 non-null   object
 4   sentence_2     4076 non-null   object
dtypes: object(5)
memory usage: 159.3+ KB


In [16]:
df.duplicated(subset=['sentence_1_id','sentence_2_id']).sum()

0

Data Cleansing

In [17]:
df['sentence_1'] = df['sentence_1'].str.lower()

df['sentence_2'] = df['sentence_2'].str.lower()

In [18]:
df.head()

Unnamed: 0,is_paraphrase,sentence_1_id,sentence_2_id,sentence_1,sentence_2
0,1,702876,702977,"amrozi accused his brother , whom he called "" ...","referring to him as only "" the witness "" , amr..."
1,0,2108705,2108831,yucaipa owned dominick 's before selling the c...,yucaipa bought dominick 's in 1995 for $ 693 m...
2,1,1330381,1330521,they had published an advertisement on the int...,"on june 10 , the ship 's owners had published ..."
3,0,3344667,3344648,"around 0335 gmt , tab shares were up 19 cents ...","tab shares jumped 20 cents , or 4.6 % , to set..."
4,1,1236820,1236712,"the stock rose $ 2.11 , or about 11 percent , ...",pg & e corp. shares jumped $ 1.63 or 8 percent...


In [19]:
df['sentence_1'] = df['sentence_1'].apply(lambda x:" ".join(token.lemma_ for token in nlp(x) if not token.is_stop and str(token)
                                           not in string.punctuation))

df['sentence_2'] = df['sentence_2'].apply(lambda x:" ".join(token.lemma_ for token in nlp(x) if not token.is_stop and str(token)
                                           not in string.punctuation))

In [20]:
df['sentence_1'] = df['sentence_1'].apply(lambda x: ' '.join(text for text in x.split() if len(text)>=2))

df['sentence_2'] = df['sentence_2'].apply(lambda x: ' '.join(text for text in x.split() if len(text)>=2))

In [21]:
df.head()

Unnamed: 0,is_paraphrase,sentence_1_id,sentence_2_id,sentence_1,sentence_2
0,1,702876,702977,amrozi accuse brother call witness deliberatel...,refer witness amrozi accuse brother deliberate...
1,0,2108705,2108831,yucaipa own dominick sell chain safeway 1998 2...,yucaipa buy dominick 1995 693 million sell saf...
2,1,1330381,1330521,publish advertisement internet june 10 offer c...,june 10 ship owner publish advertisement inter...
3,0,3344667,3344648,0335 gmt tab share 19 cent 4.4 4.56 having ear...,tab share jump 20 cent 4.6 set record closing ...
4,1,1236820,1236712,stock rise 2.11 11 percent close friday 21.51 ...,pg corp share jump 1.63 percent 21.03 new york...


In [22]:
df.is_paraphrase.value_counts(normalize=True)*100

1    67.541708
0    32.458292
Name: is_paraphrase, dtype: float64

In [23]:
df.dtypes

is_paraphrase    object
sentence_1_id    object
sentence_2_id    object
sentence_1       object
sentence_2       object
dtype: object

In [24]:
df['is_paraphrase'] = df['is_paraphrase'].astype(int)

Text augmentation of class 0 as it is less in percentage

In [25]:
wordnet_aug = WordNetAugmenter()

[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


In [26]:
df.drop(['sentence_1_id','sentence_2_id'],axis=1,inplace=True)

In [29]:
augmented_sentences = []

In [30]:
for sent_1,sent_2 in zip(df[df.is_paraphrase==0]['sentence_1'],df[df.is_paraphrase==0]['sentence_2']):
  augmented_sentences.append(([wordnet_aug.augment(sent_1)[0],wordnet_aug.augment(sent_2)[0],0]))

In [31]:
len(augmented_sentences)

1323

In [32]:
df = pd.concat([df.sort_index(axis=1),
           pd.DataFrame(augmented_sentences,columns=['sentence_1',	'sentence_2','is_paraphrase']).sort_index(axis = 1)])

In [33]:
df.shape

(5399, 3)

In [34]:
df.is_paraphrase.value_counts(normalize=True)*100

1    50.990924
0    49.009076
Name: is_paraphrase, dtype: float64

Loading the Sentence Transformer

In [35]:
model = SentenceTransformer('stsb-distilbert-base')

Downloading (…)0e06e/.gitattributes:   0%|          | 0.00/523 [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)230880e06e/README.md:   0%|          | 0.00/3.93k [00:00<?, ?B/s]

Downloading (…)0880e06e/config.json:   0%|          | 0.00/539 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/265M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)0e06e/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/489 [00:00<?, ?B/s]

Downloading (…)230880e06e/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)880e06e/modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

In [36]:
model

SentenceTransformer(
  (0): Transformer({'max_seq_length': 128, 'do_lower_case': False}) with Transformer model: DistilBertModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
)

Saving the Sentence Transformer model

In [47]:
model.save('/content/saved_transformer_model/','sentence_transformer')

In [37]:
df.head()

Unnamed: 0,is_paraphrase,sentence_1,sentence_2
0,1,amrozi accuse brother call witness deliberatel...,refer witness amrozi accuse brother deliberate...
1,0,yucaipa own dominick sell chain safeway 1998 2...,yucaipa buy dominick 1995 693 million sell saf...
2,1,publish advertisement internet june 10 offer c...,june 10 ship owner publish advertisement inter...
3,0,0335 gmt tab share 19 cent 4.4 4.56 having ear...,tab share jump 20 cent 4.6 set record closing ...
4,1,stock rise 2.11 11 percent close friday 21.51 ...,pg corp share jump 1.63 percent 21.03 new york...


In [38]:
df.reset_index(drop=True,inplace=True)

Encoding the Sentence 1 and Sentence using Sentence Transformer model

In [48]:
sentence_1_embeddings = pd.DataFrame(model.encode(df.sentence_1))

In [49]:
sentence_1_embeddings.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,758,759,760,761,762,763,764,765,766,767
0,-0.260895,0.412031,-0.027431,0.125726,0.42701,0.299631,-0.248495,-0.448269,0.749812,-0.251422,...,0.13347,0.26686,-0.670629,0.518169,-0.009075,0.574645,0.201454,0.10539,1.006216,-0.567166
1,-0.884175,0.551168,0.441661,-0.074343,0.188482,-0.119824,0.720323,0.403503,-0.280382,0.082825,...,-1.115554,-0.028066,-0.308501,0.781138,-0.139435,0.790209,0.612716,-0.295901,0.458324,0.777391
2,-0.049955,0.525107,-0.177829,-0.065544,0.191447,-0.250593,0.1321,-0.07132,0.119501,-1.119786,...,-1.076448,-0.924019,0.405157,0.793631,-0.409317,0.941842,0.07094,0.817481,0.541185,-1.192215
3,0.76637,0.171768,0.776429,0.559702,-0.495387,-0.002028,-0.050219,0.136188,0.328508,-0.816549,...,-0.517657,-0.390023,0.183276,-0.162227,-0.014474,-1.138557,0.44203,0.039512,0.450362,1.091931
4,-0.005412,0.311941,-0.105217,-1.111692,0.879313,0.218348,-0.350755,0.248333,-0.325978,-0.947769,...,-0.182315,0.942314,0.336114,0.000308,0.47782,0.680415,-0.815447,0.30301,0.336258,-1.576618


In [50]:
sentence_1_embeddings.columns = ['sent1_'+str(i+1) for i in range(sentence_1_embeddings.shape[1])]

In [51]:
sentence_1_embeddings.head()

Unnamed: 0,sent1_1,sent1_2,sent1_3,sent1_4,sent1_5,sent1_6,sent1_7,sent1_8,sent1_9,sent1_10,...,sent1_759,sent1_760,sent1_761,sent1_762,sent1_763,sent1_764,sent1_765,sent1_766,sent1_767,sent1_768
0,-0.260895,0.412031,-0.027431,0.125726,0.42701,0.299631,-0.248495,-0.448269,0.749812,-0.251422,...,0.13347,0.26686,-0.670629,0.518169,-0.009075,0.574645,0.201454,0.10539,1.006216,-0.567166
1,-0.884175,0.551168,0.441661,-0.074343,0.188482,-0.119824,0.720323,0.403503,-0.280382,0.082825,...,-1.115554,-0.028066,-0.308501,0.781138,-0.139435,0.790209,0.612716,-0.295901,0.458324,0.777391
2,-0.049955,0.525107,-0.177829,-0.065544,0.191447,-0.250593,0.1321,-0.07132,0.119501,-1.119786,...,-1.076448,-0.924019,0.405157,0.793631,-0.409317,0.941842,0.07094,0.817481,0.541185,-1.192215
3,0.76637,0.171768,0.776429,0.559702,-0.495387,-0.002028,-0.050219,0.136188,0.328508,-0.816549,...,-0.517657,-0.390023,0.183276,-0.162227,-0.014474,-1.138557,0.44203,0.039512,0.450362,1.091931
4,-0.005412,0.311941,-0.105217,-1.111692,0.879313,0.218348,-0.350755,0.248333,-0.325978,-0.947769,...,-0.182315,0.942314,0.336114,0.000308,0.47782,0.680415,-0.815447,0.30301,0.336258,-1.576618


In [52]:
sentence_2_embeddings = pd.DataFrame(model.encode(df.sentence_2))

In [53]:
sentence_2_embeddings.columns = ['sent2_'+str(i+1) for i in range(sentence_2_embeddings.shape[1])]

In [54]:
sentence_2_embeddings.head()

Unnamed: 0,sent2_1,sent2_2,sent2_3,sent2_4,sent2_5,sent2_6,sent2_7,sent2_8,sent2_9,sent2_10,...,sent2_759,sent2_760,sent2_761,sent2_762,sent2_763,sent2_764,sent2_765,sent2_766,sent2_767,sent2_768
0,-0.288009,0.32374,-0.118352,-0.016262,0.470611,0.395372,-0.192446,-0.463177,0.730938,-0.27931,...,0.108613,0.227012,-0.658241,0.632798,-0.106747,0.809635,0.276645,0.133732,0.860767,-0.687815
1,-0.705629,0.781721,0.606958,-0.362709,0.625702,0.096322,0.546905,0.762962,-0.486452,-0.132403,...,-1.36781,0.120444,-0.51672,0.55389,-0.254249,0.973125,0.545285,-0.138923,-0.056247,1.03152
2,0.219695,0.61574,-0.517384,-0.226158,0.406606,0.02183,0.271182,0.243749,0.455174,-0.562033,...,-1.212357,-0.503011,0.192228,0.484917,-0.304897,0.63101,0.482004,0.382951,0.475484,-1.123346
3,0.859948,0.186316,0.921314,0.229,-0.132479,0.062699,-0.575621,-0.083457,-0.186976,-0.409012,...,-0.701564,-0.615746,0.493017,-0.581632,-0.159766,0.247319,0.160145,0.298432,-0.549618,0.011835
4,0.292672,0.366428,0.40976,-0.094331,0.238312,-0.138927,-0.397135,0.909424,-0.317211,-0.214648,...,-0.203724,0.622032,0.156229,-0.116579,0.281965,1.112706,-0.681683,0.207747,0.129844,-0.183399


In [55]:
embedding_df = pd.concat([sentence_1_embeddings,sentence_2_embeddings,df.is_paraphrase],axis=1)

In [56]:
X ,y = embedding_df.drop('is_paraphrase',axis=1) , embedding_df['is_paraphrase']

Train Test split

In [57]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42,stratify=y)

In [62]:
def train_get_metrics(MODEL,ml_model):
  print(f'model is {MODEL}','\n')
  print('TRAIN STATS\n')
  print('Accuracy ',accuracy_score(y_train, ml_model.predict(X_train)),'\n')
  print(classification_report(y_train, ml_model.predict(X_train)))
  print('\nTEST STATS\n')
  print('Accuracy ',accuracy_score(y_test, ml_model.predict(X_test)),'\n')
  print(classification_report(y_test, ml_model.predict(X_test)))

Trying 3 models Logistic Regression , Random Forest, XgBoost....
Logistic Regression is least overfitted so saving it for inference

In [59]:
logreg = LogisticRegression(max_iter=1000,random_state=42,solver='liblinear', C=0.35,penalty='l1')
logreg.fit(X_train, y_train)

In [63]:
train_get_metrics('LOGISTIC REGRESSION' ,logreg)

model is LOGISTIC REGRESSION 

TRAIN STATS

Accuracy  0.7989298209508129 

              precision    recall  f1-score   support

           0       0.79      0.80      0.80      2381
           1       0.80      0.80      0.80      2478

    accuracy                           0.80      4859
   macro avg       0.80      0.80      0.80      4859
weighted avg       0.80      0.80      0.80      4859


TEST STATS

Accuracy  0.6259259259259259 

              precision    recall  f1-score   support

           0       0.62      0.62      0.62       265
           1       0.63      0.64      0.63       275

    accuracy                           0.63       540
   macro avg       0.63      0.63      0.63       540
weighted avg       0.63      0.63      0.63       540



In [None]:
rf = RandomForestClassifier(n_estimators=35,max_depth= 5 ,n_jobs=-1 , random_state=42)
rf.fit(X_train, y_train)

In [None]:
train_get_metrics('RANDOM FOREST' ,rf)

model is RANDOM FOREST 

TRAIN STATS

Accuracy  0.8102490224325993 

              precision    recall  f1-score   support

           0       0.86      0.73      0.79      2381
           1       0.77      0.89      0.83      2478

    accuracy                           0.81      4859
   macro avg       0.82      0.81      0.81      4859
weighted avg       0.82      0.81      0.81      4859


TEST STATS

Accuracy  0.6277777777777778 

              precision    recall  f1-score   support

           0       0.65      0.52      0.58       265
           1       0.61      0.73      0.67       275

    accuracy                           0.63       540
   macro avg       0.63      0.63      0.62       540
weighted avg       0.63      0.63      0.62       540



In [None]:
xgb = xgboost.XGBClassifier(n_estimators=35,max_depth= 5 ,n_jobs=-1, learning_rate=0.025 , random_state=42,gamma=0.6)
xgb.fit(X_train, y_train)

In [None]:
train_get_metrics('XGBOOST' ,xgb)

model is XGBOOST 

TRAIN STATS

Accuracy  0.8423543939082115 

              precision    recall  f1-score   support

           0       0.88      0.79      0.83      2381
           1       0.81      0.89      0.85      2478

    accuracy                           0.84      4859
   macro avg       0.85      0.84      0.84      4859
weighted avg       0.85      0.84      0.84      4859


TEST STATS

Accuracy  0.6055555555555555 

              precision    recall  f1-score   support

           0       0.61      0.55      0.58       265
           1       0.60      0.65      0.63       275

    accuracy                           0.61       540
   macro avg       0.61      0.60      0.60       540
weighted avg       0.61      0.61      0.60       540



Saving the Logistic Regression model

In [None]:
pickle.dump(logreg, open('logreg.pkl', 'wb'))

Testing on test dataset

Loading all trained models

In [26]:
import sentence_transformers

logreg = pickle.load(open('/content/drive/MyDrive/logreg.pkl', 'rb'))

nlp = spacy.load('/content/drive/MyDrive/spacy_nlp_model/content/spacy_nlp_model')

model = sentence_transformers.SentenceTransformer('/content/drive/MyDrive/saved_transformer_model')



In [34]:
file_lines = []

with open(r'/content/drive/MyDrive/msr_paraphrase_test.txt',encoding='utf-8') as file:
    for file_line in file:
        file_lines.append(file_line.split('\t'))

df = pd.DataFrame(file_lines[1:],columns=['is_paraphrase','sentence_1_id','sentence_2_id','sentence_1','sentence_2'])

df.drop(['sentence_1_id','sentence_2_id'],axis=1,inplace=True)

df['sentence_1'] = df['sentence_1'].apply(lambda x:" ".join(token.lemma_ for token in nlp(x) if not token.is_stop and str(token)
                                           not in string.punctuation)).str.lower()

df['sentence_2'] = df['sentence_2'].apply(lambda x:" ".join(token.lemma_ for token in nlp(x) if not token.is_stop and str(token)
                                           not in string.punctuation)).str.lower()

df['sentence_1'] = df['sentence_1'].apply(lambda x: ' '.join(text for text in x.split() if len(text)>=2))

df['sentence_2'] = df['sentence_2'].apply(lambda x: ' '.join(text for text in x.split() if len(text)>=2))

sentence_1_embeddings = pd.DataFrame(model.encode(df.sentence_1))

sentence_1_embeddings.columns = ['sent1_'+str(i+1) for i in range(sentence_1_embeddings.shape[1])]

sentence_2_embeddings = pd.DataFrame(model.encode(df.sentence_2))

sentence_2_embeddings.columns = ['sent2_'+str(i+1) for i in range(sentence_2_embeddings.shape[1])]

embedding_df = pd.concat([sentence_1_embeddings,sentence_2_embeddings],axis=1)

print('Accuracy ',accuracy_score(df.is_paraphrase.astype(int), logreg.predict(embedding_df)),'\n')
print(classification_report(df.is_paraphrase.astype(int), logreg.predict(embedding_df)))

Accuracy  0.5866666666666667 

              precision    recall  f1-score   support

           0       0.40      0.46      0.43       578
           1       0.71      0.65      0.68      1147

    accuracy                           0.59      1725
   macro avg       0.55      0.56      0.55      1725
weighted avg       0.60      0.59      0.59      1725

