In [1]:
import xml.etree.ElementTree as ET
import pandas as pd
import numpy as np
import math

import Levenshtein 
import sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.metrics import jaccard_score
from sklearn.metrics import roc_auc_score

from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV

In [2]:
parser = ET.XMLParser(encoding="utf-8")
tree = ET.parse('paraphrases_gold.xml', parser=parser)
root = tree.getroot()

In [3]:
a = [i[3].text for i in root[0]]
b = [i[4].text for i in root[0]] 
c = [i[5].text for i in root[0]]

In [4]:
q = {'text1': a, 'text2': b, 'target': c}
df = pd.DataFrame(data=q)

In [5]:
def sent_sc(sent1, sent2):
    vect = CountVectorizer().fit_transform([sent1, sent2])
    cos_sim = cosine_similarity(vect.toarray())
    eucl_dist = euclidean_distances(vect.toarray())
    jac_sc = jaccard_score(vect.toarray()[0], vect.toarray()[1], average='micro')
    lev_dist =  Levenshtein.distance(sent1, sent2)/(len(sent1)+len(sent2))
    return [
            cos_sim[1][0], 
            eucl_dist[1][0], 
            jac_sc, 
            lev_dist,
            ]

In [6]:
df2 = pd.DataFrame(list(map(sent_sc, df['text1'], df['text2'])),
                   columns=[
                            'Cosine Similarity', 
                            'Euclidean Distance', 
                            'Jaccard Similarity',
                            'Levenshtein Distance'
                           ])
df2['target'] = df['target']
df2 = df2.astype({'target': int})
df2['target'] = df2['target'].apply(lambda x: 1 if x in [0, 1] else 0)

In [7]:
df2.head()

Unnamed: 0,Cosine Similarity,Euclidean Distance,Jaccard Similarity,Levenshtein Distance,target
0,0.0,3.464102,0.0,0.554348,0
1,0.285714,3.162278,0.090909,0.367647,0
2,0.503953,2.828427,0.2,0.403846,0
3,0.333333,2.828427,0.111111,0.425532,0
4,0.0,4.123106,0.0,0.439394,0


In [8]:
from sklearn.model_selection import train_test_split
X = df2.drop(['target'], axis=1)
y = df2['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [9]:
skfold = StratifiedKFold(n_splits=3, shuffle=True)

In [10]:
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()

scores = cross_validate(logreg, X, y, cv=skfold, 
                        scoring=('accuracy', 'roc_auc', 'recall', 'precision', 'f1'))
df_sc = pd.DataFrame(list(scores.values())[2:], 
             index=['accuracy','roc_auc','recall', 'precision', 'f1'], 
             columns=['fold 1', 'fold 2', 'fold 3'])
df_sc['mean'] = list(df_sc.T.mean())
df_sc = round(df_sc, 3) 
df_sc

Unnamed: 0,fold 1,fold 2,fold 3,mean
accuracy,0.687,0.721,0.716,0.708
roc_auc,0.759,0.79,0.773,0.774
recall,0.773,0.805,0.792,0.79
precision,0.723,0.748,0.749,0.74
f1,0.747,0.775,0.77,0.764


In [11]:
logreg = LogisticRegression()
logreg.fit(X, y)
logreg.coef_

array([[ 2.58794515, -0.11846204,  2.41351461, -3.24862925]])

In [12]:
def shj(sent1, sent2):
    
    vect = CountVectorizer().fit_transform([sent1, sent2])
    cos_sim = cosine_similarity(vect.toarray())
    eucl_dist = euclidean_distances(vect.toarray())
    jac_sc = jaccard_score(vect.toarray()[0], vect.toarray()[1], average='micro')
    lev_dist =  Levenshtein.distance(sent1, sent2)/(len(sent1)+len(sent2))

    x = np.array([
        cos_sim[1][0], 
        eucl_dist[1][0], 
        jac_sc, 
        lev_dist
        ])
    w = np.array([ 2.58794515, -0.11846204,  2.41351461, -3.24862925])
    y = np.dot(x, w)
    target = round(1/(1+math.exp(-y)))
    answer = 'Фраза схожа' if target == 1 else 'Фраза не схожа'
    return answer