In [7]:
import numpy as np

def levenshtein_distance(s1, s2):
    len_s1, len_s2 = len(s1) + 1, len(s2) + 1
    dp = np.zeros((len_s1, len_s2))
    for i in range(len_s1):
        dp[i][0] = i
    for j in range(len_s2):
        dp[0][j] = j
    for i in range(1, len_s1):
        for j in range(1, len_s2):
            cost = 0 if s1[i-1] == s2[j-1] else 1
            dp[i][j] = min(dp[i-1][j] + 1, dp[i][j-1] + 1, dp[i-1][j-1] + cost)
    return dp[-1][-1]

In [8]:
def jaccard_similarity(str1, str2):
    set1 = set(str1.split())
    set2 = set(str2.split())
    intersection = set1.intersection(set2)
    union = set1.union(set2)
    return len(intersection) / len(union)

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

vectorizer = TfidfVectorizer()

0.6029748160380572


In [10]:
text1 = "S1/N77/C1:\n\
        Average power consumption: 27.376261\n\
        Average number of RRC connections: 1\n\
        Total PRB Used in Downlink: 13.0\n\
        Total PRB Used in Uplink: 273"
text2 = "S1/N77/C2:\n\
        Average power consumption: 43.413502 (in the first 8 instances) and 113.654007 (in the last 8 instances)\n\
        Average number of RRC connections: 1\n\
        Total PRB Used in Downlink: 63.0 (in the first 8 instances) and 273.0 (in the last 8 instances)\n\
        Total PRB Used in Uplink: 273 (in the first 8 instances) and 273.0 (in the last 8 instances)"
text3 = "S1/N77/C3:\n\
        Average power consumption: 90.934380 (in the first 8 instances) and 373.626129 (in the last 8 instances)\n\
        Average number of RRC connections: 6\n\
        Total PRB Used in Downlink: 273.0 (in the first 8 instances) and 273.0 (in the last 8 instances)\n\
        Total PRB Used in Uplink: 273 (in the first 8 instances) and 273.0 (in the last 8 instances)"

texts = [text1, text2, text3]

In [20]:
import numpy as np

j_sim = []
l_dist = []
cos_sim = []
for i in range(len(texts)):
    for j in range(i+1, len(texts)):
        similarity = jaccard_similarity(texts[i], texts[j])
        j_sim.append((texts[i], texts[j], similarity))
        similarity = levenshtein_distance(texts[i], texts[j])
        l_dist.append((texts[i], texts[j], similarity))
        tested_texts = [texts[i], texts[j]]
        tfidf_matrix = vectorizer.fit_transform(tested_texts)
        similarity = cosine_similarity(tfidf_matrix[0], tfidf_matrix[1])[0][0]
        cos_sim.append((texts[i], texts[j], similarity))

In [22]:
print ("Jaccard Similarity")
k = 0
for i in range(len(texts)):
    for j in range(i+1, len(texts)):
        print("text " + str(i+1) + " and text " + str(j+1) + ": ", j_sim[k][2])
        k = k + 1
        
print ("Levenshtein Distance")
k = 0
for i in range(len(texts)):
    for j in range(i+1, len(texts)):
        print("text " + str(i+1) + " and text " + str(j+1) + ": ", l_dist[i][2])
        k = k + 1
        
print ("Cosine Similarity")
k = 0
for i in range(len(texts)):
    for j in range(i+1, len(texts)):
        print("text " + str(i+1) + " and text " + str(j+1) + ": ", cos_sim[i][2])
        k = k + 1

Jaccard Similarity
text 1 and text 2:  0.5
text 1 and text 3:  0.4666666666666667
text 2 and text 3:  0.7096774193548387
Levenshtein Distance
text 1 and text 2:  201.0
text 1 and text 3:  201.0
text 2 and text 3:  200.0
Cosine Similarity
text 1 and text 2:  0.4121724079347329
text 1 and text 3:  0.4121724079347329
text 2 and text 3:  0.41791910175283836


In [25]:
text1 = "[CellID]:\n\
        Average power consumption: [avg_cons]\n\
        Average number of RRC connections: [rcc]\n\
        Total PRB Used in Downlink: [prbdown]\n\
        Total PRB Used in Uplink: [prbup]"
text2 = "[CellID]:\n\
        Average power consumption: [avg_cons] (in the first 8 instances) and [avg_cons] (in the last 8 instances)\n\
        Average number of RRC connections: [rcc]\n\
        Total PRB Used in Downlink: [prbdown] (in the first 8 instances) and [prbdown] (in the last 8 instances)\n\
        Total PRB Used in Uplink: [prbup] (in the first 8 instances) and [prbup] (in the last 8 instances)"
text3 = "[CellID]:\n\
        Average power consumption: [avg_cons] (in the first 8 instances) and [avg_cons] (in the last 8 instances)\n\
        Average number of RRC connections: [rcc]\n\
        Total PRB Used in Downlink: [prbdown] (in the first 8 instances) and [prbdown] (in the last 8 instances)\n\
        Total PRB Used in Uplink: [prbup] (in the first 8 instances) and [prbup] (in the last 8 instances)"

texts = [text1, text2, text3]

In [26]:
import numpy as np

j_sim = []
l_dist = []
cos_sim = []
for i in range(len(texts)):
    for j in range(i+1, len(texts)):
        similarity = jaccard_similarity(texts[i], texts[j])
        j_sim.append((texts[i], texts[j], similarity))
        similarity = levenshtein_distance(texts[i], texts[j])
        l_dist.append((texts[i], texts[j], similarity))
        tested_texts = [texts[i], texts[j]]
        tfidf_matrix = vectorizer.fit_transform(tested_texts)
        similarity = cosine_similarity(tfidf_matrix[0], tfidf_matrix[1])[0][0]
        cos_sim.append((texts[i], texts[j], similarity))

In [27]:
print ("Jaccard Similarity")
k = 0
for i in range(len(texts)):
    for j in range(i+1, len(texts)):
        print("text " + str(i+1) + " and text " + str(j+1) + ": ", j_sim[k][2])
        k = k + 1
        
print ("Levenshtein Distance")
k = 0
for i in range(len(texts)):
    for j in range(i+1, len(texts)):
        print("text " + str(i+1) + " and text " + str(j+1) + ": ", l_dist[i][2])
        k = k + 1
        
print ("Cosine Similarity")
k = 0
for i in range(len(texts)):
    for j in range(i+1, len(texts)):
        print("text " + str(i+1) + " and text " + str(j+1) + ": ", cos_sim[i][2])
        k = k + 1

Jaccard Similarity
text 1 and text 2:  0.72
text 1 and text 3:  0.72
text 2 and text 3:  1.0
Levenshtein Distance
text 1 and text 2:  200.0
text 1 and text 3:  200.0
text 2 and text 3:  200.0
Cosine Similarity
text 1 and text 2:  0.48439375564055015
text 1 and text 3:  0.48439375564055015
text 2 and text 3:  0.48439375564055015
