## Importing Libraries:

In [38]:
#Basic Imports:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet as wn
from nltk import word_tokenize
from nltk import pos_tag
import re
import math
from collections import Counter

# Preprocessing Imports
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import CountVectorizer
import sklearn.preprocessing as preproc
from sklearn.feature_extraction import text
import pickle
import warnings
warnings.filterwarnings("ignore")
from random import shuffle
import random
random.seed(123)
from utils.syntactic_similarity_measures import SyntacticMeasures
from utils.lesk_algorithm import Lesk
from utils.semantic_similarity_measures import SemanticMeasures
from utils.wordnet import GetWordnetPos
from pre_processor import Preprocess
from sklearn.preprocessing import MinMaxScaler

# Plotting Imports:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Model Imports:
from sklearn.model_selection import train_test_split, KFold
from sklearn.model_selection import GridSearchCV
import warnings
warnings.filterwarnings("ignore")
from random import shuffle
import random
random.seed(123)
from sklearn.linear_model import LogisticRegression
import sklearn.model_selection
from sklearn import svm

# Metrics Import
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from sklearn.metrics import classification_report 
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn import metrics
from sklearn.metrics.cluster import normalized_mutual_info_score

from collections import Counter

## Preprocessing Functions:

In [39]:
def counter_cosine_similarity(token1, token2):
    """Return the cosine similarity between two bags of tokens.

    Both inputs are converted to ``Counter`` objects (term -> frequency) and
    compared as frequency vectors.

    Args:
        token1: Iterable of hashable tokens.
        token2: Iterable of hashable tokens.

    Returns:
        ``dot(c1, c2) / (|c1| * |c2|)`` as a float. ``0.0`` is returned when
        either input has no tokens, because the cosine of a zero vector is
        undefined and the plain formula would divide by zero.
    """
    c1 = Counter(token1)
    c2 = Counter(token2)
    # Terms missing from c2 contribute 0; Counter lookups never raise KeyError.
    dotprod = sum(count * c2[term] for term, count in c1.items())
    magA = math.sqrt(sum(count ** 2 for count in c1.values()))
    magB = math.sqrt(sum(count ** 2 for count in c2.values()))
    if magA == 0 or magB == 0:
        return 0.0
    return dotprod / (magA * magB)

In [40]:
def length_similarity(token1, token2):
    """Return the ratio of the shorter token count to the longer one.

    Args:
        token1: Iterable of hashable tokens.
        token2: Iterable of hashable tokens.

    Returns:
        ``min(len1, len2) / max(len1, len2)`` as a float in ``[0, 1]``.
        Two empty inputs have identical lengths, so ``1.0`` is returned for
        them instead of dividing by zero.
    """
    # Counting through Counter keeps support for one-shot iterables.
    lenc1 = sum(Counter(token1).values())
    lenc2 = sum(Counter(token2).values())
    longer = max(lenc1, lenc2)
    if longer == 0:
        return 1.0
    return min(lenc1, lenc2) / float(longer)

In [41]:
def overlap_score(token1, token2):
    """Return the absolute difference between the two overlap fractions.

    The overlap count is the size of the multiset intersection of the two
    token bags. It is expressed as a fraction of each input's length and the
    absolute difference of those two fractions is returned.

    Args:
        token1: Iterable of hashable tokens.
        token2: Iterable of hashable tokens.

    Returns:
        ``abs(overlap / len1 - overlap / len2)`` as a float. When either input
        is empty the overlap is necessarily zero, so ``0.0`` is returned
        instead of dividing by zero.
    """
    c1 = Counter(token1)
    c2 = Counter(token2)
    lenc1 = sum(c1.values())
    lenc2 = sum(c2.values())
    if lenc1 == 0 or lenc2 == 0:
        return 0.0
    # Counter & Counter keeps the minimum count of every shared term.
    shared = sum((c1 & c2).values())
    return abs((shared / lenc1) - (shared / lenc2))

In [42]:
def overlap2_score(token1, token2):
    """Return the shared-token count divided by the combined token count.

    Args:
        token1: Iterable of hashable tokens.
        token2: Iterable of hashable tokens.

    Returns:
        ``overlap / (len1 + len2)`` as a float, where ``overlap`` is the size
        of the multiset intersection. ``0.0`` is returned when both inputs are
        empty instead of dividing by zero.
    """
    c1 = Counter(token1)
    c2 = Counter(token2)
    combined_length = sum(c1.values()) + sum(c2.values())
    if combined_length == 0:
        return 0.0
    # Counter & Counter keeps the minimum count of every shared term.
    shared = sum((c1 & c2).values())
    return shared / combined_length

In [43]:
def similarity_score(lengthSim,cosine_score):
    """Return the product of a length-similarity and a cosine score."""
    weighted = lengthSim * cosine_score
    return weighted

In [44]:
def compute_cosine_similarity(token1, token2):
    """Return ``SyntacticMeasures.getCosineSimilarity(token1, token2)``.

    Thin wrapper: both arguments are forwarded unchanged and the helper's
    result is returned as-is.
    """
    return SyntacticMeasures.getCosineSimilarity(token1, token2)

In [45]:
def compute_jaccard_similarity(token1, token2):
    """Return ``SyntacticMeasures.normal_jaccard_distance(token1, token2)``.

    Thin wrapper: both arguments are forwarded unchanged and the helper's
    result is returned as-is.

    NOTE(review): the helper is named ``*_distance`` while this wrapper is
    named ``*_similarity`` -- confirm which quantity the helper returns.
    """
    return SyntacticMeasures.normal_jaccard_distance(token1, token2)

In [46]:
def compute_lemma_jaccard_similarity(lemma1, lemma2):
    """Return ``SyntacticMeasures.lemma_jaccard_distance(lemma1, lemma2)``.

    Thin wrapper: both arguments are forwarded unchanged and the helper's
    result is returned as-is.

    NOTE(review): the helper is named ``*_distance`` while this wrapper is
    named ``*_similarity`` -- confirm which quantity the helper returns.
    """
    return SyntacticMeasures.lemma_jaccard_distance(lemma1, lemma2)

In [47]:
def overall_similarity_combined(token1, token2, lemma1, lemma2):
    """Return the unweighted mean of the three syntactic scores.

    The three scores are ``compute_cosine_similarity`` and
    ``compute_jaccard_similarity`` on the token pair, and
    ``compute_lemma_jaccard_similarity`` on the lemma pair.
    """
    cosine_part = compute_cosine_similarity(token1, token2)
    jaccard_part = compute_jaccard_similarity(token1, token2)
    lemma_part = compute_lemma_jaccard_similarity(lemma1, lemma2)
    return (cosine_part + jaccard_part + lemma_part) / 3

In [48]:
 def get_lesk(ques):
     """Run the Lesk helper on every element of ``ques``.

     A single ``Lesk`` object is built from ``ques``; its ``lesk(word, ques)``
     method is then called once per element, in order.

     Returns:
         List with one ``lesk`` result per element of ``ques`` (presumably
         the disambiguated sense of each word -- confirm against ``Lesk``).
     """
     disambiguator = Lesk(ques)
     return [disambiguator.lesk(term, ques) for term in ques]

In [49]:
def semantic_similarities(token1, token2):
    """Return three overall semantic scores for a pair of token sequences.

    Both inputs are first passed through ``get_lesk``. The WUP and path
    measures from ``SemanticMeasures`` are computed on the results, plus
    their element-wise average, and each is reduced with
    ``SemanticMeasures.overallSim``.

    Returns:
        ``[overall_wup, overall_path, overall_combined]`` -- in that order.
        Callers that unpack the list into named columns must follow it.
    """
    senses_a = get_lesk(token1)
    senses_b = get_lesk(token2)

    wup_raw = SemanticMeasures.computeWup(senses_a, senses_b)
    wup_overall = SemanticMeasures.overallSim(senses_a, senses_b, wup_raw)

    path_raw = SemanticMeasures.computePath(senses_a, senses_b)
    path_overall = SemanticMeasures.overallSim(senses_a, senses_b, path_raw)

    blended_raw = (wup_raw + path_raw)/2
    blended_overall = SemanticMeasures.overallSim(senses_a, senses_b, blended_raw)

    return [wup_overall, path_overall, blended_overall]

In [50]:
def thePreprocessorNoLemma(token1):
    """Return ``Preprocess(token1).preprocess_without_lemma()``.

    A fresh ``Preprocess`` object is created for every call.
    """
    return Preprocess(token1).preprocess_without_lemma()

In [51]:
def thePreprocessorLemma(lemma1):
    """Return ``Preprocess(lemma1).preprocess_with_lemma()``.

    A fresh ``Preprocess`` object is created for every call.
    """
    return Preprocess(lemma1).preprocess_with_lemma()

## Preprocessing of Training Dataset:

In [52]:
# Load the labelled training pairs.
# NOTE(review): the delimiter is the literal text  r'<TAB>  (an ordinary string,
# not the raw string r'\t'). Presumably it never matches, so every line is read
# whole into column 0 and then split on tabs by the next statement -- confirm
# this is intentional before "fixing" it, because the split below depends on it.
train_df3 = pd.read_csv("../Data/train_with_label.txt", delimiter = "r'\t", header = None, engine = 'python')
# Break the single text column into one column per tab-separated field.
train_df3 = train_df3[0].str.split("\t", expand=True)
train_df3 = train_df3.rename(columns={0: "id", 1: "sentence1", 2: "sentence2", 3: "classification"})
# str.split yields strings, so the label column is converted to numbers here.
train_df3["classification"] = pd.to_numeric(train_df3["classification"])
# Dropping duplicates keeps the original index labels, so the index can have gaps.
train_df3.drop_duplicates(inplace = True)
# Bare expression: only displayed by a notebook when it is the last line of a cell.
train_df3

# Text cleaning features: run both preprocessors over each sentence column.
# list(map(...)) assigns positionally, one result per row in row order.
train_df3['Text_Cleaned1'] = list(map(thePreprocessorNoLemma, train_df3.sentence1))
train_df3['Text_Cleaned2'] = list(map(thePreprocessorNoLemma, train_df3.sentence2))
train_df3['lemmatized_text1'] = list(map(thePreprocessorLemma, train_df3.sentence1))
train_df3['lemmatized_text2'] = list(map(thePreprocessorLemma, train_df3.sentence2))

NameError: name 're' is not defined

In [None]:
# Syntactic features: one new column per similarity measure, computed row by row.
# Columns are added in the same order as before because 'cosine/length_ratio'
# reads two of the columns created above it.
train_df3['cosine_similarity_score'] = [
    counter_cosine_similarity(left, right)
    for left, right in zip(train_df3['Text_Cleaned1'], train_df3['Text_Cleaned2'])
]
train_df3['length_similarity'] = [
    length_similarity(left, right)
    for left, right in zip(train_df3['Text_Cleaned1'], train_df3['Text_Cleaned2'])
]
train_df3['overlap_score'] = [
    overlap_score(left, right)
    for left, right in zip(train_df3['Text_Cleaned1'], train_df3['Text_Cleaned2'])
]
train_df3['overlap2_score'] = [
    overlap2_score(left, right)
    for left, right in zip(train_df3['Text_Cleaned1'], train_df3['Text_Cleaned2'])
]
train_df3['cosine/length_ratio'] = [
    similarity_score(length_sim, cosine_sim)
    for length_sim, cosine_sim in zip(train_df3['length_similarity'], train_df3['cosine_similarity_score'])
]
train_df3['cosine_similarity_score2'] = [
    compute_cosine_similarity(left, right)
    for left, right in zip(train_df3['Text_Cleaned1'], train_df3['Text_Cleaned2'])
]
train_df3['jaccard_similarity_score'] = [
    compute_jaccard_similarity(left, right)
    for left, right in zip(train_df3['Text_Cleaned1'], train_df3['Text_Cleaned2'])
]
train_df3['lemma_jaccard_score'] = [
    compute_lemma_jaccard_similarity(left, right)
    for left, right in zip(train_df3['lemmatized_text1'], train_df3['lemmatized_text2'])
]
train_df3['overall_sim_score'] = [
    overall_similarity_combined(tok_left, tok_right, lem_left, lem_right)
    for tok_left, tok_right, lem_left, lem_right in zip(
        train_df3['Text_Cleaned1'],
        train_df3['Text_Cleaned2'],
        train_df3['lemmatized_text1'],
        train_df3['lemmatized_text2'],
    )
]

In [None]:
# Semantic features: the frame is cut into positional chunks of 800 rows that
# are scored in separate cells (presumably to keep each long-running cell
# manageable -- confirm).
# Each chunk is an explicit copy because a 'scores' column is assigned to it
# afterwards; writing to a bare slice triggers pandas' SettingWithCopyWarning.
pdf1 = train_df3.iloc[:800].copy()
pdf2 = train_df3.iloc[800:1600].copy()
pdf3 = train_df3.iloc[1600:2400].copy()
pdf4 = train_df3.iloc[2400:3200].copy()
# Open-ended final slice: a hard-coded end row would silently drop any rows
# beyond it if the input file ever grows.
pdf5 = train_df3.iloc[3200:].copy()

In [198]:
# One list of semantic scores per row of the first chunk.
pdf1['scores'] = [
    semantic_similarities(lemmas_a, lemmas_b)
    for lemmas_a, lemmas_b in zip(pdf1['lemmatized_text1'], pdf1['lemmatized_text2'])
]

In [199]:
# One list of semantic scores per row of the second chunk.
pdf2['scores'] = [
    semantic_similarities(lemmas_a, lemmas_b)
    for lemmas_a, lemmas_b in zip(pdf2['lemmatized_text1'], pdf2['lemmatized_text2'])
]

In [200]:
# One list of semantic scores per row of the third chunk.
pdf3['scores'] = [
    semantic_similarities(lemmas_a, lemmas_b)
    for lemmas_a, lemmas_b in zip(pdf3['lemmatized_text1'], pdf3['lemmatized_text2'])
]

In [201]:
# One list of semantic scores per row of the fourth chunk.
pdf4['scores'] = [
    semantic_similarities(lemmas_a, lemmas_b)
    for lemmas_a, lemmas_b in zip(pdf4['lemmatized_text1'], pdf4['lemmatized_text2'])
]

In [202]:
# One list of semantic scores per row of the fifth chunk.
pdf5['scores'] = [
    semantic_similarities(lemmas_a, lemmas_b)
    for lemmas_a, lemmas_b in zip(pdf5['lemmatized_text1'], pdf5['lemmatized_text2'])
]

In [203]:
# Stitch the scored chunks back together. pd.concat replaces the chained
# DataFrame.append calls (append was removed in pandas 2.0); like append, it
# keeps each chunk's original index labels.
train_df3 = pd.concat([pdf1, pdf2, pdf3, pdf4, pdf5])
# Expand the per-row 3-element 'scores' lists into three named columns.
# NOTE(review): semantic_similarities returns [WUP, path, combined], but the
# first two labels below are (path, wup), so those two columns appear to be
# mislabelled. The labels are deliberately left unchanged here because any
# other code that builds or consumes these columns (e.g. test-set
# preprocessing, saved models) must be switched at the same time.
train_df3[['overall_similarity_path_semantic', 'overall_similarity_wup_semantic', 'overall_similarity_combined_semantic']] = pd.DataFrame(train_df3.scores.tolist(), index= train_df3.index)
# The raw list column is no longer needed once it has been expanded.
train_df3.drop(columns=['scores'], inplace=True)
train_df3

Unnamed: 0,id,sentence1,sentence2,classification,Text_Cleaned1,Text_Cleaned2,lemmatized_text1,lemmatized_text2,cosine_similarity_score,length_similarity,overlap_score,overlap2_score,cosine/length_ratio,cosine_similarity_score2,jaccard_similarity_score,lemma_jaccard_score,overall_sim_score,overall_similarity_path_semantic,overall_similarity_wup_semantic,overall_similarity_combined_semantic
0,train_id_0,The Democratic candidates also began announcin...,The Democratic candidates also began announcin...,1,"[The, Democratic, candidates, also, began, ann...","[The, Democratic, candidates, also, began, ann...","[democratic, candidate, also, begin, announce,...","[democratic, candidate, also, begin, announce,...",0.882353,1.000000,0.000000,0.441176,0.882353,1.000000,0.789474,0.777778,0.855750,0.906994,0.902778,0.904886
1,train_id_1,The woman was exposed to the SARS virus while ...,The woman was exposed to the SARS virus while ...,1,"[The, woman, exposed, SARS, virus, hospital, h...","[The, woman, exposed, SARS, virus, hospital, h...","[woman, expose, sars, virus, hospital, health,...","[woman, expose, sars, virus, hospital, health-...",0.721688,0.900000,0.077778,0.368421,0.649519,0.962250,0.636364,0.600000,0.732871,0.914216,0.921569,0.917892
2,train_id_2,He said the problem needs to be corrected befo...,He said the prob lem needs to be corrected bef...,1,"[He, said, problem, needs, corrected, space, s...","[He, said, prob, lem, needs, corrected, space,...","[say, problem, need, correct, space, shuttle, ...","[say, prob, lem, need, correct, space, shuttle...",0.700000,1.000000,0.000000,0.350000,0.700000,1.000000,0.538462,0.636364,0.724942,0.792256,0.664021,0.721825
3,train_id_3,A representative for Phoenix-based U-Haul decl...,"Anthony Citrano , a representative for WhenU ,...",0,"[A, representative, Phoenix-based, U-Haul, dec...","[Anthony, Citrano, representative, WhenU, decl...","[representative, phoenix-based, u-haul, declin...","[anthony, citrano, representative, whenu, decl...",0.455842,0.636364,0.207792,0.222222,0.290081,0.797724,0.285714,0.307692,0.463710,0.613296,0.523529,0.561639
4,train_id_4,The biggest threat to order seemed to be looti...,The biggest threat to order seemed to be looti...,1,"[The, biggest, threat, order, seemed, looting,...","[The, biggest, threat, order, seemed, looting,...","[big, threat, order, seem, loot, crime, includ...","[big, threat, order, seem, loot, crime, includ...",0.739940,0.764706,0.199095,0.366667,0.565837,0.874475,0.578947,0.555556,0.669659,0.771062,0.737500,0.754281
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4072,train_id_4072,"Axelrod died in his sleep of heart failure , s...",Axelrod died of heart failure while asleep at ...,1,"[Axelrod, died, sleep, heart, failure, said, d...","[Axelrod, died, heart, failure, asleep, Los, A...","[axelrod, die, sleep, heart, failure, say, dau...","[axelrod, die, heart, failure, asleep, los, an...",0.805823,0.750000,0.222222,0.380952,0.604367,0.886405,0.583333,0.583333,0.684357,0.567460,0.493651,0.518707
4073,train_id_4073,"Saddam 's other son , Odai , surrendered Frida...","Hussein 's other son , Uday , surrendered yest...",1,"[Saddam, 's, son, Odai, surrendered, Friday, A...","[Hussein, 's, son, Uday, surrendered, yesterda...","[saddam, 's, son, odai, surrender, friday, ame...","[hussein, 's, son, uday, surrender, yesterday,...",0.714286,1.000000,0.000000,0.333333,0.714286,1.000000,0.466667,0.466667,0.644444,0.848856,0.847222,0.847222
4074,train_id_4074,If Senator Clinton does decide to run in 2008 ...,If Mrs Clinton does decide to contest the 2008...,1,"[If, Senator, Clinton, decide, run, 2008, anno...","[If, Mrs, Clinton, decide, contest, 2008, elec...","[senator, clinton, decide, run, 2008, announce...","[mr, clinton, decide, contest, 2008, election,...",0.800641,0.923077,0.064103,0.400000,0.739053,0.960769,0.666667,0.642857,0.756764,0.834569,0.789855,0.809704
4075,train_id_4075,"The Iranian refugee who sewed up his eyes , li...","An Iranian Kurd who stitched up his eyes , lip...",1,"[The, Iranian, refugee, sewed, eyes, lips, ear...","[An, Iranian, Kurd, stitched, eyes, lips, ears...","[iranian, refugee, sew, eye, lip, ear, protest...","[iranian, kurd, stitch, eye, lip, ear, protest...",0.518875,0.928571,0.038462,0.259259,0.481812,0.963624,0.350000,0.388889,0.567504,0.700889,0.562857,0.631617
