# Preprocessing of ML 4523 Datasets:

## Importing Libraries:

In [129]:
#Basic Imports:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet as wn
from nltk import word_tokenize
from nltk import pos_tag
import re
import math
from collections import Counter


# Preprocessing Imports
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import CountVectorizer
import sklearn.preprocessing as preproc
from sklearn.feature_extraction import text
import pickle
import warnings
warnings.filterwarnings("ignore")
from random import shuffle
import random
random.seed(123)
from utils.syntactic_similarity_measures import SyntacticMeasures
from utils.lesk_algorithm import Lesk
from utils.semantic_similarity_measures import SemanticMeasures
from utils.wordnet import GetWordnetPos
from pre_processor import Preprocess

## Functions:

In [2]:
# A list of contractions from http://stackoverflow.com/questions/19790188/expanding-english-language-contractions-in-python
contractions = { 
"ain't": "am not",
"aren't": "are not",
"can't": "cannot",
"can't've": "cannot have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he would",
"he'd've": "he would have",
"he'll": "he will",
"he's": "he is",
"how'd": "how did",
"how'll": "how will",
"how's": "how is",
"i'd": "i would",
"i'll": "i will",
"i'm": "i am",
"i've": "i have",
"isn't": "is not",
"it'd": "it would",
"it'll": "it will",
"it's": "it is",
"let's": "let us",
"ma'am": "madam",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"must've": "must have",
"mustn't": "must not",
"needn't": "need not",
"oughtn't": "ought not",
"shan't": "shall not",
"sha'n't": "shall not",
"she'd": "she would",
"she'll": "she will",
"she's": "she is",
"should've": "should have",
"shouldn't": "should not",
"that'd": "that would",
"that's": "that is",
"there'd": "there had",
"there's": "there is",
"they'd": "they would",
"they'll": "they will",
"they're": "they are",
"they've": "they have",
"wasn't": "was not",
"we'd": "we would",
"we'll": "we will",
"we're": "we are",
"we've": "we have",
"weren't": "were not",
"what'll": "what will",
"what're": "what are",
"what's": "what is",
"what've": "what have",
"where'd": "where did",
"where's": "where is",
"who'll": "who will",
"who's": "who is",
"won't": "will not",
"wouldn't": "would not",
"you'd": "you would",
"you'll": "you will",
"you're": "you are"
}

In [3]:
def clean_text(text, remove_stopwords = True):
    '''Remove unwanted characters, stopwords, and format the text to create fewer nulls word embeddings'''
    
    # Convert words to lower case
    text = text.lower()
    
    # Replace contractions with their longer forms 
    if True:
        text = text.split()
        new_text = []
        for word in text:
            if word in contractions:
                new_text.append(contractions[word])
            else:
                new_text.append(word)
        text = " ".join(new_text)
    
    # Format words and remove unwanted characters
    text = re.sub(r'https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)
    text = re.sub(r'\<a href', ' ', text)
    text = re.sub(r'&amp;', '', text) 
    text = re.sub(r'[_"\-;%()|+&=*%.,!?:#$@\[\]/]', ' ', text)
    text = re.sub(r'<br />', ' ', text)
    text = re.sub(r'\'', ' ', text)
    
    # remove stop words
    if remove_stopwords:
        text = text.split()
        stops = set(stopwords.words("english"))
        text = [w for w in text if not w in stops]
        text = " ".join(text)

    # Tokenize each word
    text =  nltk.WordPunctTokenizer().tokenize(text)
        
    return text

In [38]:
def counter_cosine_similarity(token1, token2):
    c1 = Counter(token1)
    c2 = Counter(token2)
    terms = set(c1).union(c2)
    dotprod = sum(c1.get(k, 0) * c2.get(k, 0) for k in terms)
    magA = math.sqrt(sum(c1.get(k, 0)**2 for k in terms))
    magB = math.sqrt(sum(c2.get(k, 0)**2 for k in terms))
    return dotprod / (magA * magB)

In [41]:
def length_similarity(token1, token2):
    c1 = Counter(token1)
    c2 = Counter(token2)
    lenc1 = sum(iter(c1.values()))
    lenc2 = sum(iter(c2.values()))
    lengthSim = min(lenc1, lenc2) / float(max(lenc1, lenc2))
    return lengthSim

In [46]:
def overlap_score(token1, token2):
    c1 = Counter(token1)
    c2 = Counter(token2)
    lenc1 = sum(iter(c1.values()))
    lenc2 = sum(iter(c2.values()))
    overlappingtermsCount = sum(((c1)&(c2)).values())
    overlap_score = abs((overlappingtermsCount/lenc1) - (overlappingtermsCount/lenc2))
    return overlap_score

In [43]:
def overlap2_score(token1, token2):
    c1 = Counter(token1)
    c2 = Counter(token2)
    lenc1 = sum(iter(c1.values()))
    lenc2 = sum(iter(c2.values()))
    overlappingtermsCount = sum(((c1)&(c2)).values())
    overlap2_score = (overlappingtermsCount/(lenc1+lenc2))
    return overlap2_score

In [44]:
def similarity_score(lengthSim,cosine_score):
    return lengthSim*cosine_score

In [5]:
def compute_cosine_similarity(token1, token2):
        """ compute cosine similarity """
        cosine_similarity = SyntacticMeasures.getCosineSimilarity(token1,token2)
        return cosine_similarity

In [6]:
def compute_jaccard_similarity(token1, token2):
        """ compute jaccard similarity"""
        jaccard_similarity = SyntacticMeasures.normal_jaccard_distance(token1,token2)
        return jaccard_similarity

In [7]:
def compute_lemma_jaccard_similarity(lemma1, lemma2):
        """ compute lemma jaccard similarity"""
        lemma_jaccard_similarity = SyntacticMeasures.lemma_jaccard_distance(lemma1,lemma2)
        return lemma_jaccard_similarity

In [8]:
def compute_combined_score_syn(r1, r2, r3):
        """ get the combined score"""
        return (r1+r2+r3)/ 3

In [9]:
def overall_similarity_combined(token1, token2, lemma1, lemma2):
        """ calculate combined similarity """

        R1 = compute_cosine_similarity(token1,token2)
        R2 = compute_jaccard_similarity(token1,token2)
        R3 = compute_lemma_jaccard_similarity(lemma1,lemma2)
        R = compute_combined_score_syn(R1,R2,R3)
        return R

In [10]:
 def get_lesk(ques):
        """ get each word meaning out of the given question"""
        lesk_obj = Lesk(ques)
        sentence_means = []
        for word in ques:
            sentence_means.append(lesk_obj.lesk(word, ques))
        return sentence_means

In [60]:
def semantic_similarities(token1, token2):
    sentence_means1 = get_lesk(token1)
    sentence_means2 = get_lesk(token2)
    
    RWUP = SemanticMeasures.computeWup(sentence_means1, sentence_means2)
    OverallWUP = SemanticMeasures.overallSim(sentence_means1, sentence_means2, RWUP)
    RSIM = SemanticMeasures.computePath(sentence_means1, sentence_means2)
    OverallSIM = SemanticMeasures.overallSim(sentence_means1, sentence_means2, RSIM)
    RCOMBINED = (RWUP + RSIM)/2
    OverallCombined = SemanticMeasures.overallSim(sentence_means1, sentence_means2, RCOMBINED)
    
    score_list = [OverallWUP,OverallSIM, OverallCombined]
    return score_list

## Preprocessing of Normal Training Dataset:

In [364]:
train_df = pd.read_csv("../Data/train_with_label.txt", delimiter = "r'\t", header = None, engine = 'python')
train_df = train_df[0].str.split("\t", expand=True)
train_df = train_df.rename(columns={0: "id", 1: "sentence1", 2: "sentence2", 3: "classification"})
train_df["classification"] = pd.to_numeric(train_df["classification"])
train_df.drop_duplicates(inplace = True)
train_df

lemm = nltk.stem.WordNetLemmatizer()

#Syntactic Features:
train_df['Text_Cleaned1'] = list(map(clean_text, train_df.sentence1))
train_df['lemmatized_text1'] = list(map(lambda word:list(map(lemm.lemmatize, word)),train_df.Text_Cleaned1))
train_df['Text_Cleaned2'] = list(map(clean_text, train_df.sentence2))
train_df['lemmatized_text2'] = list(map(lambda word:list(map(lemm.lemmatize, word)),train_df.Text_Cleaned2))
train_df['cosine_similarity_score'] = list(map(counter_cosine_similarity, train_df.Text_Cleaned1, train_df.Text_Cleaned2))
train_df['length_similarity'] = list(map(length_similarity, train_df.Text_Cleaned1, train_df.Text_Cleaned2))
train_df['overlap_score'] = list(map(overlap_score, train_df.Text_Cleaned1, train_df.Text_Cleaned2))
train_df['overlap2_score'] = list(map(overlap2_score, train_df.Text_Cleaned1, train_df.Text_Cleaned2))
train_df['cosine/length_ratio'] = list(map(similarity_score, train_df.length_similarity, train_df.cosine_similarity_score))
train_df['cosine_similarity_score2'] = list(map(compute_cosine_similarity, train_df.Text_Cleaned1, train_df.Text_Cleaned2))
train_df['jaccard_similarity_score'] = list(map(compute_jaccard_similarity, train_df.Text_Cleaned1, train_df.Text_Cleaned2))
train_df['lemma_jaccard_score'] = list(map(compute_lemma_jaccard_similarity, train_df.lemmatized_text1, train_df.lemmatized_text2))
train_df['overall_sim_score'] = list(map(overall_similarity_combined, train_df.Text_Cleaned1, train_df.Text_Cleaned2, train_df.lemmatized_text1, train_df.lemmatized_text2))

In [119]:
#Semantic Features:
df1 = train_df.iloc[:800]
df2 = train_df.iloc[800:1600]
df3 = train_df.iloc[1600:2400]
df4 = train_df.iloc[2400:3200]
df5 = train_df.iloc[3200:4077]

In [120]:
df1['scores'] = list(map(semantic_similarities, df1.lemmatized_text1, df1.lemmatized_text2))

In [121]:
df2['scores'] = list(map(semantic_similarities, df2.lemmatized_text1, df2.lemmatized_text2))

In [122]:
df3['scores'] = list(map(semantic_similarities, df3.lemmatized_text1, df3.lemmatized_text2))

In [123]:
df4['scores'] = list(map(semantic_similarities, df4.lemmatized_text1, df4.lemmatized_text2))

In [124]:
df5['scores'] = list(map(semantic_similarities, df5.lemmatized_text1, df5.lemmatized_text2))

In [125]:
train_df = df1.append(df2).append(df3).append(df4).append(df5)
train_df[['overall_similarity_path_semantic', 'overall_similarity_wup_semantic', 'overall_similarity_combined_semantic']] = pd.DataFrame(train_df.scores.tolist(), index= train_df.index)
train_df.drop(['scores'], axis=1, inplace=True)
train_df

Unnamed: 0,id,sentence1,sentence2,classification,Text_Cleaned1,lemmatized_text1,Text_Cleaned2,lemmatized_text2,cosine_similarity_score,length_similarity,overlap_score,overlap2_score,cosine/length_ratio,cosine_similarity_score2,jaccard_similarity_score,lemma_jaccard_score,overall_sim_score,overall_similarity_path_semantic,overall_similarity_wup_semantic,overall_similarity_combined_semantic
0,train_id_0,The Democratic candidates also began announcin...,The Democratic candidates also began announcin...,1,"[democratic, candidates, also, began, announci...","[democratic, candidate, also, began, announcin...","[democratic, candidates, also, began, announci...","[democratic, candidate, also, began, announcin...",0.909509,0.941176,0.055147,0.454545,0.856008,0.970143,0.833333,0.833333,0.878936,0.919913,0.915825,0.917869
1,train_id_1,The woman was exposed to the SARS virus while ...,The woman was exposed to the SARS virus while ...,1,"[woman, exposed, sars, virus, hospital, health...","[woman, exposed, sars, virus, hospital, health...","[woman, exposed, sars, virus, hospital, health...","[woman, exposed, sars, virus, hospital, health...",0.904534,0.888889,0.111111,0.470588,0.804030,0.954786,0.937500,0.937500,0.943262,0.870098,0.877451,0.873775
2,train_id_2,He said the problem needs to be corrected befo...,He said the prob lem needs to be corrected bef...,1,"[said, problem, needs, corrected, space, shutt...","[said, problem, need, corrected, space, shuttl...","[said, prob, lem, needs, corrected, space, shu...","[said, prob, lem, need, corrected, space, shut...",0.666667,1.000000,0.000000,0.333333,0.666667,1.000000,0.500000,0.636364,0.712121,0.825926,0.741799,0.780159
3,train_id_3,A representative for Phoenix-based U-Haul decl...,"Anthony Citrano , a representative for WhenU ,...",0,"[representative, phoenix, based, u, haul, decl...","[representative, phoenix, based, u, haul, decl...","[anthony, citrano, representative, whenu, decl...","[anthony, citrano, representative, whenu, decl...",0.455842,0.636364,0.207792,0.222222,0.290081,0.797724,0.285714,0.285714,0.456384,0.681597,0.526984,0.577681
4,train_id_4,The biggest threat to order seemed to be looti...,The biggest threat to order seemed to be looti...,1,"[biggest, threat, order, seemed, looting, crim...","[biggest, threat, order, seemed, looting, crim...","[biggest, threat, order, seemed, looting, crim...","[biggest, threat, order, seemed, looting, crim...",0.721688,0.750000,0.208333,0.357143,0.541266,0.866025,0.555556,0.555556,0.659046,0.818681,0.794643,0.806662
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4072,train_id_4072,"Axelrod died in his sleep of heart failure , s...",Axelrod died of heart failure while asleep at ...,1,"[axelrod, died, sleep, heart, failure, said, d...","[axelrod, died, sleep, heart, failure, said, d...","[axelrod, died, heart, failure, asleep, los, a...","[axelrod, died, heart, failure, asleep, los, a...",0.805823,0.750000,0.222222,0.380952,0.604367,0.886405,0.583333,0.583333,0.684357,0.534127,0.485714,0.504819
4073,train_id_4073,"Saddam 's other son , Odai , surrendered Frida...","Hussein 's other son , Uday , surrendered yest...",1,"[saddam, son, odai, surrendered, friday, ameri...","[saddam, son, odai, surrendered, friday, ameri...","[hussein, son, uday, surrendered, yesterday, a...","[hussein, son, uday, surrendered, yesterday, a...",0.600000,1.000000,0.000000,0.300000,0.600000,1.000000,0.428571,0.538462,0.655678,0.883333,0.883333,0.883333
4074,train_id_4074,If Senator Clinton does decide to run in 2008 ...,If Mrs Clinton does decide to contest the 2008...,1,"[senator, clinton, decide, run, 2008, cannot, ...","[senator, clinton, decide, run, 2008, cannot, ...","[mrs, clinton, decide, contest, 2008, election...","[mr, clinton, decide, contest, 2008, election,...",0.819892,0.923077,0.064103,0.400000,0.756823,0.968963,0.714286,0.714286,0.799178,0.791613,0.740000,0.758261
4075,train_id_4075,"The Iranian refugee who sewed up his eyes , li...","An Iranian Kurd who stitched up his eyes , lip...",1,"[iranian, refugee, sewed, eyes, lips, ears, pr...","[iranian, refugee, sewed, eye, lip, ear, prote...","[iranian, kurd, stitched, eyes, lips, ears, pr...","[iranian, kurd, stitched, eye, lip, ear, prote...",0.560449,0.923077,0.044872,0.280000,0.517337,0.960769,0.388889,0.388889,0.579516,0.700889,0.562857,0.631617


## Preprocessing of 50-50 Randomly Removing Majority Class Normal Training Dataset:

In [341]:
subset0 = train_df.loc[train_df['classification'] == 0]
subset1 = train_df.loc[train_df['classification'] == 1]
subset1Sample = subset1.sample(n=1039,random_state=42)
new_train_df = subset0.append(subset1Sample)

## Preprocessing of Normal Training Dataset with Outliers Removed:

In [317]:
def remove_outliers(df,column_name,lower,upper):
    removed_outliers = df[column_name].between(df[column_name].quantile(lower), df[column_name].quantile(upper))
    index_names = df[~removed_outliers].index # INVERT removed_outliers!!
    return df.drop(index_names)

In [342]:
# subset0_clean = remove_outliers(subset0, 'cosine_similarity_score', .05,.95)
# #subset0_clean = remove_outliers(subset0_clean, 'length_similarity', .05,.95)
subset0_clean = remove_outliers(subset0, 'overlap_score', .00,.90)
# subset0_clean = remove_outliers(subset0_clean, 'overlap2_score', .05,.95)
subset0_clean = remove_outliers(subset0_clean, 'cosine/length_ratio', .00,.90)
# #subset0_clean = remove_outliers(subset0_clean, 'cosine_similarity_score2', .05,.95)
subset0_clean = remove_outliers(subset0_clean, 'jaccard_similarity_score', .00,.90)
subset0_clean = remove_outliers(subset0_clean, 'lemma_jaccard_score', .00,.90)
subset0_clean = remove_outliers(subset0_clean, 'overall_sim_score', .00,.90)
# subset0_clean = remove_outliers(subset0_clean, 'overall_similarity_combined_semantic', .05,.95)
# subset0_clean = remove_outliers(subset0_clean, 'overall_similarity_wup_semantic', .05,.95)
# subset0_clean = remove_outliers(subset0_clean, 'overall_similarity_path_semantic', .05,.95)
# subset0_clean = remove_outliers(subset0_clean,'jaccard_distance_bigrams',.05,.95)
# subset0_clean = remove_outliers(subset0_clean,'cosine_similarity_bigrams',.05,.95)
# subset0_clean = remove_outliers(subset0_clean,'bigram_similarity',.05,.95)
# subset0_clean = remove_outliers(subset0_clean,'jaccard_distance_trigrams',.05,.95)
subset0_clean = remove_outliers(subset0_clean,'cosine_similarity_trigrams', .00,.90)
# subset0_clean = remove_outliers(subset0_clean,'trigram_similarity',.05,.95)
# subset0_clean = remove_outliers(subset0_clean,'jaccard_distance_quadgrams',.05,.95)
# subset0_clean = remove_outliers(subset0_clean,'cosine_similarity_quadgrams',.05,.95)
# subset0_clean = remove_outliers(subset0_clean,'quadgram_similarity',.05,.95)

len(subset0_clean)

640

In [343]:
# subset1_clean = remove_outliers(subset1, 'cosine_similarity_score', .05,.95)
# # subset1_clean = remove_outliers(subset1_clean, 'length_similarity', .05,.95)
subset1_clean = remove_outliers(subset1, 'overlap_score', .10,1.0)
# subset1_clean = remove_outliers(subset1_clean, 'overlap2_score', .05,.95)
subset1_clean = remove_outliers(subset1_clean, 'cosine/length_ratio', .10,1.0)
# #subset1_clean = remove_outliers(subset1_clean, 'cosine_similarity_score2', .05,.95)
subset1_clean = remove_outliers(subset1_clean, 'jaccard_similarity_score', .10,1.0)
subset1_clean = remove_outliers(subset1_clean, 'lemma_jaccard_score', .10,1.0)
subset1_clean = remove_outliers(subset1_clean, 'overall_sim_score', .10,1.0)
# subset1_clean = remove_outliers(subset1_clean, 'overall_similarity_combined_semantic', .05,.95)
# subset1_clean = remove_outliers(subset1_clean, 'overall_similarity_wup_semantic', .05,.95)
# subset1_clean = remove_outliers(subset1_clean, 'overall_similarity_path_semantic', .05,.95)
# subset1_clean = remove_outliers(subset1_clean,'jaccard_distance_bigrams',.05,.95)
# subset1_clean = remove_outliers(subset1_clean,'cosine_similarity_bigrams',.05,.95)
# subset1_clean = remove_outliers(subset1_clean,'bigram_similarity',.05,.95)
# subset1_clean = remove_outliers(subset1_clean,'jaccard_distance_trigrams',.05,.95)
subset1_clean = remove_outliers(subset1_clean,'cosine_similarity_trigrams',.10,1.0)
# subset1_clean = remove_outliers(subset1_clean,'trigram_similarity',.05,.95)
# subset1_clean = remove_outliers(subset1_clean,'jaccard_distance_quadgrams',.05,.95)
# subset1_clean = remove_outliers(subset1_clean,'cosine_similarity_quadgrams',.05,.95)
# subset1_clean = remove_outliers(subset1_clean,'quadgram_similarity',.05,.95)
len(subset1_clean)

1650

In [344]:
new_train_df_clean = subset0_clean.append(subset1_clean)
new_train_df_clean

Unnamed: 0,classification,lemma_jaccard_score,overall_sim_score,cosine/length_ratio,cosine_similarity_trigrams,jaccard_similarity_score,overlap_score
3,0,0.285714,0.456384,0.290081,0.149071,0.285714,0.792208
7,0,0.411765,0.530084,0.362209,0.204124,0.333333,0.828571
8,0,0.388889,0.551413,0.324459,0.267261,0.388889,0.801136
11,0,0.200000,0.431476,0.268328,0.000000,0.200000,0.925000
17,0,0.380952,0.472186,0.360060,0.000000,0.380952,0.700000
...,...,...,...,...,...,...,...
4063,1,0.833333,0.870857,0.890264,0.733333,0.833333,1.000000
4064,1,0.500000,0.643223,0.493007,0.335410,0.500000,0.883333
4072,1,0.583333,0.684357,0.604367,0.239046,0.583333,0.777778
4073,1,0.538462,0.655678,0.600000,0.375000,0.428571,1.000000


## Preprocessing of 50-50 Undersampled Normal Training Dataset Using ImbLearn:

## Preprocessing of Normal Training Dataset with Other Preprocessing Method:

In [154]:
def thePreprocessorNoLemma(token1):
    processor = Preprocess(token1)
    token = processor.preprocess_without_lemma()
    return token

In [155]:
def thePreprocessorLemma(lemma1):
    processor = Preprocess(lemma1)
    lemma_token = processor.preprocess_with_lemma()
    return lemma_token

In [160]:
train_df2 = pd.read_csv("../Data/train_with_label.txt", delimiter = "r'\t", header = None, engine = 'python')
train_df2 = train_df2[0].str.split("\t", expand=True)
train_df2 = train_df2.rename(columns={0: "id", 1: "sentence1", 2: "sentence2", 3: "classification"})
train_df2["classification"] = pd.to_numeric(train_df2["classification"])
train_df2.drop_duplicates(inplace = True)
train_df2

#Text Cleaning Features:
train_df2['Text_Cleaned1'] = list(map(thePreprocessorNoLemma, train_df2.sentence1))
train_df2['Text_Cleaned2'] = list(map(thePreprocessorNoLemma, train_df2.sentence2))
train_df2['lemmatized_text1'] = list(map(thePreprocessorLemma, train_df2.sentence1))
train_df2['lemmatized_text2'] = list(map(thePreprocessorLemma, train_df2.sentence2))

Unnamed: 0,id,sentence1,sentence2,classification,Text_Cleaned1,Text_Cleaned2,lemmatized_text1,lemmatized_text2
0,train_id_0,The Democratic candidates also began announcin...,The Democratic candidates also began announcin...,1,"[The, Democratic, candidates, also, began, ann...","[The, Democratic, candidates, also, began, ann...","[democratic, candidate, also, begin, announce,...","[democratic, candidate, also, begin, announce,..."
1,train_id_1,The woman was exposed to the SARS virus while ...,The woman was exposed to the SARS virus while ...,1,"[The, woman, exposed, SARS, virus, hospital, h...","[The, woman, exposed, SARS, virus, hospital, h...","[woman, expose, sars, virus, hospital, health,...","[woman, expose, sars, virus, hospital, health-..."
2,train_id_2,He said the problem needs to be corrected befo...,He said the prob lem needs to be corrected bef...,1,"[He, said, problem, needs, corrected, space, s...","[He, said, prob, lem, needs, corrected, space,...","[say, problem, need, correct, space, shuttle, ...","[say, prob, lem, need, correct, space, shuttle..."
3,train_id_3,A representative for Phoenix-based U-Haul decl...,"Anthony Citrano , a representative for WhenU ,...",0,"[A, representative, Phoenix-based, U-Haul, dec...","[Anthony, Citrano, representative, WhenU, decl...","[representative, phoenix-based, u-haul, declin...","[anthony, citrano, representative, whenu, decl..."
4,train_id_4,The biggest threat to order seemed to be looti...,The biggest threat to order seemed to be looti...,1,"[The, biggest, threat, order, seemed, looting,...","[The, biggest, threat, order, seemed, looting,...","[big, threat, order, seem, loot, crime, includ...","[big, threat, order, seem, loot, crime, includ..."
...,...,...,...,...,...,...,...,...
4072,train_id_4072,"Axelrod died in his sleep of heart failure , s...",Axelrod died of heart failure while asleep at ...,1,"[Axelrod, died, sleep, heart, failure, said, d...","[Axelrod, died, heart, failure, asleep, Los, A...","[axelrod, die, sleep, heart, failure, say, dau...","[axelrod, die, heart, failure, asleep, los, an..."
4073,train_id_4073,"Saddam 's other son , Odai , surrendered Frida...","Hussein 's other son , Uday , surrendered yest...",1,"[Saddam, 's, son, Odai, surrendered, Friday, A...","[Hussein, 's, son, Uday, surrendered, yesterda...","[saddam, 's, son, odai, surrender, friday, ame...","[hussein, 's, son, uday, surrender, yesterday,..."
4074,train_id_4074,If Senator Clinton does decide to run in 2008 ...,If Mrs Clinton does decide to contest the 2008...,1,"[If, Senator, Clinton, decide, run, 2008, anno...","[If, Mrs, Clinton, decide, contest, 2008, elec...","[senator, clinton, decide, run, 2008, announce...","[mr, clinton, decide, contest, 2008, election,..."
4075,train_id_4075,"The Iranian refugee who sewed up his eyes , li...","An Iranian Kurd who stitched up his eyes , lip...",1,"[The, Iranian, refugee, sewed, eyes, lips, ear...","[An, Iranian, Kurd, stitched, eyes, lips, ears...","[iranian, refugee, sew, eye, lip, ear, protest...","[iranian, kurd, stitch, eye, lip, ear, protest..."


In [161]:
#Syntactic Features:
train_df2['cosine_similarity_score'] = list(map(counter_cosine_similarity, train_df2.Text_Cleaned1, train_df2.Text_Cleaned2))
train_df2['length_similarity'] = list(map(length_similarity, train_df2.Text_Cleaned1, train_df2.Text_Cleaned2))
train_df2['overlap_score'] = list(map(overlap_score, train_df2.Text_Cleaned1, train_df2.Text_Cleaned2))
train_df2['overlap2_score'] = list(map(overlap2_score, train_df2.Text_Cleaned1, train_df2.Text_Cleaned2))
train_df2['cosine/length_ratio'] = list(map(similarity_score, train_df2.length_similarity, train_df2.cosine_similarity_score))
train_df2['cosine_similarity_score2'] = list(map(compute_cosine_similarity, train_df2.Text_Cleaned1, train_df2.Text_Cleaned2))
train_df2['jaccard_similarity_score'] = list(map(compute_jaccard_similarity, train_df2.Text_Cleaned1, train_df2.Text_Cleaned2))
train_df2['lemma_jaccard_score'] = list(map(compute_lemma_jaccard_similarity, train_df2.lemmatized_text1, train_df2.lemmatized_text2))
train_df2['overall_sim_score'] = list(map(overall_similarity_combined, train_df2.Text_Cleaned1, train_df2.Text_Cleaned2, train_df2.lemmatized_text1, train_df2.lemmatized_text2))

In [162]:
#Semantic Features:
pdf1 = train_df2.iloc[:800]
pdf2 = train_df2.iloc[800:1600]
pdf3 = train_df2.iloc[1600:2400]
pdf4 = train_df2.iloc[2400:3200]
pdf5 = train_df2.iloc[3200:4077]

In [163]:
pdf1['scores'] = list(map(semantic_similarities, pdf1.lemmatized_text1, pdf1.lemmatized_text2))

In [164]:
pdf2['scores'] = list(map(semantic_similarities, pdf2.lemmatized_text1, pdf2.lemmatized_text2))

In [166]:
pdf3['scores'] = list(map(semantic_similarities, pdf3.lemmatized_text1, pdf3.lemmatized_text2))

In [167]:
pdf4['scores'] = list(map(semantic_similarities, pdf4.lemmatized_text1, pdf4.lemmatized_text2))

In [168]:
pdf5['scores'] = list(map(semantic_similarities, pdf5.lemmatized_text1, pdf5.lemmatized_text2))

In [169]:
train_df2 = pdf1.append(pdf2).append(pdf3).append(pdf4).append(pdf5)
train_df2[['overall_similarity_path_semantic', 'overall_similarity_wup_semantic', 'overall_similarity_combined_semantic']] = pd.DataFrame(train_df2.scores.tolist(), index= train_df2.index)
train_df2.drop(['scores'], axis=1, inplace=True)
train_df2

Unnamed: 0,id,sentence1,sentence2,classification,Text_Cleaned1,Text_Cleaned2,lemmatized_text1,lemmatized_text2,cosine_similarity_score,length_similarity,overlap_score,overlap2_score,cosine/length_ratio,cosine_similarity_score2,jaccard_similarity_score,lemma_jaccard_score,overall_sim_score,overall_similarity_path_semantic,overall_similarity_wup_semantic,overall_similarity_combined_semantic
0,train_id_0,The Democratic candidates also began announcin...,The Democratic candidates also began announcin...,1,"[The, Democratic, candidates, also, began, ann...","[The, Democratic, candidates, also, began, ann...","[democratic, candidate, also, begin, announce,...","[democratic, candidate, also, begin, announce,...",0.882353,1.000000,0.000000,0.441176,0.882353,1.000000,0.789474,0.777778,0.855750,0.906994,0.902778,0.904886
1,train_id_1,The woman was exposed to the SARS virus while ...,The woman was exposed to the SARS virus while ...,1,"[The, woman, exposed, SARS, virus, hospital, h...","[The, woman, exposed, SARS, virus, hospital, h...","[woman, expose, sars, virus, hospital, health,...","[woman, expose, sars, virus, hospital, health-...",0.721688,0.900000,0.077778,0.368421,0.649519,0.962250,0.636364,0.600000,0.732871,0.914216,0.921569,0.917892
2,train_id_2,He said the problem needs to be corrected befo...,He said the prob lem needs to be corrected bef...,1,"[He, said, problem, needs, corrected, space, s...","[He, said, prob, lem, needs, corrected, space,...","[say, problem, need, correct, space, shuttle, ...","[say, prob, lem, need, correct, space, shuttle...",0.700000,1.000000,0.000000,0.350000,0.700000,1.000000,0.538462,0.636364,0.724942,0.792256,0.664021,0.721825
3,train_id_3,A representative for Phoenix-based U-Haul decl...,"Anthony Citrano , a representative for WhenU ,...",0,"[A, representative, Phoenix-based, U-Haul, dec...","[Anthony, Citrano, representative, WhenU, decl...","[representative, phoenix-based, u-haul, declin...","[anthony, citrano, representative, whenu, decl...",0.455842,0.636364,0.207792,0.222222,0.290081,0.797724,0.285714,0.307692,0.463710,0.613296,0.523529,0.561639
4,train_id_4,The biggest threat to order seemed to be looti...,The biggest threat to order seemed to be looti...,1,"[The, biggest, threat, order, seemed, looting,...","[The, biggest, threat, order, seemed, looting,...","[big, threat, order, seem, loot, crime, includ...","[big, threat, order, seem, loot, crime, includ...",0.739940,0.764706,0.199095,0.366667,0.565837,0.874475,0.578947,0.555556,0.669659,0.771062,0.737500,0.754281
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4072,train_id_4072,"Axelrod died in his sleep of heart failure , s...",Axelrod died of heart failure while asleep at ...,1,"[Axelrod, died, sleep, heart, failure, said, d...","[Axelrod, died, heart, failure, asleep, Los, A...","[axelrod, die, sleep, heart, failure, say, dau...","[axelrod, die, heart, failure, asleep, los, an...",0.805823,0.750000,0.222222,0.380952,0.604367,0.886405,0.583333,0.583333,0.684357,0.567460,0.493651,0.518707
4073,train_id_4073,"Saddam 's other son , Odai , surrendered Frida...","Hussein 's other son , Uday , surrendered yest...",1,"[Saddam, 's, son, Odai, surrendered, Friday, A...","[Hussein, 's, son, Uday, surrendered, yesterda...","[saddam, 's, son, odai, surrender, friday, ame...","[hussein, 's, son, uday, surrender, yesterday,...",0.714286,1.000000,0.000000,0.333333,0.714286,1.000000,0.466667,0.466667,0.644444,0.848856,0.847222,0.847222
4074,train_id_4074,If Senator Clinton does decide to run in 2008 ...,If Mrs Clinton does decide to contest the 2008...,1,"[If, Senator, Clinton, decide, run, 2008, anno...","[If, Mrs, Clinton, decide, contest, 2008, elec...","[senator, clinton, decide, run, 2008, announce...","[mr, clinton, decide, contest, 2008, election,...",0.800641,0.923077,0.064103,0.400000,0.739053,0.960769,0.666667,0.642857,0.756764,0.834569,0.789855,0.809704
4075,train_id_4075,"The Iranian refugee who sewed up his eyes , li...","An Iranian Kurd who stitched up his eyes , lip...",1,"[The, Iranian, refugee, sewed, eyes, lips, ear...","[An, Iranian, Kurd, stitched, eyes, lips, ears...","[iranian, refugee, sew, eye, lip, ear, protest...","[iranian, kurd, stitch, eye, lip, ear, protest...",0.518875,0.928571,0.038462,0.259259,0.481812,0.963624,0.350000,0.388889,0.567504,0.700889,0.562857,0.631617


## 50-50 Randomly Removing Majority Class Training Dataset with Other Preprocessing Method:

In [345]:
subset0_2 = train_df2.loc[train_df2['classification'] == 0]
subset1_2 = train_df2.loc[train_df2['classification'] == 1]
subset1Sample_2 = subset1_2.sample(n=1039,random_state=42)
new_train_df_2 = subset0_2.append(subset1Sample_2)

## Preprocessing of Normal Training Dataset with Other Preprocessing Method, Outliers Removed:

In [346]:
# subset0_clean_2 = remove_outliers(subset0_2, 'cosine_similarity_score', .05,.95)
# #subset0_clean_2 = remove_outliers(subset0_clean_2, 'length_similarity', .05,.95)
subset0_clean_2 = remove_outliers(subset0_2, 'overlap_score', .00,.90)
# subset0_clean_2 = remove_outliers(subset0_clean_2, 'overlap2_score', .05,.95)
subset0_clean_2 = remove_outliers(subset0_clean_2, 'cosine/length_ratio', .00,.90)
# #subset0_clean_2 = remove_outliers(subset0_clean_2, 'cosine_similarity_score2', .05,.95)
subset0_clean_2 = remove_outliers(subset0_clean_2, 'jaccard_similarity_score', .00,.90)
subset0_clean_2 = remove_outliers(subset0_clean_2, 'lemma_jaccard_score', .00,.90)
subset0_clean_2 = remove_outliers(subset0_clean_2, 'overall_sim_score', .00,.90)
# subset0_clean_2 = remove_outliers(subset0_clean_2, 'overall_similarity_combined_semantic', .05,.95)
# subset0_clean_2 = remove_outliers(subset0_clean_2, 'overall_similarity_wup_semantic', .05,.95)
# subset0_clean_2 = remove_outliers(subset0_clean_2, 'overall_similarity_path_semantic', .05,.95)
# subset0_clean_2 = remove_outliers(subset0_clean_2,'jaccard_distance_bigrams',.05,.95)
# subset0_clean_2 = remove_outliers(subset0_clean_2,'cosine_similarity_bigrams',.05,.95)
# subset0_clean_2 = remove_outliers(subset0_clean_2,'bigram_similarity',.05,.95)
# subset0_clean_2 = remove_outliers(subset0_clean_2,'jaccard_distance_trigrams',.05,.95)
subset0_clean_2 = remove_outliers(subset0_clean_2,'cosine_similarity_trigrams', .00,.90)
# subset0_clean_2 = remove_outliers(subset0_clean_2,'trigram_similarity',.05,.95)
# subset0_clean_2 = remove_outliers(subset0_clean_2,'jaccard_distance_quadgrams',.05,.95)
# subset0_clean_2 = remove_outliers(subset0_clean_2,'cosine_similarity_quadgrams',.05,.95)
# subset0_clean_2 = remove_outliers(subset0_clean_2,'quadgram_similarity',.05,.95)
len(subset0_clean_2)

628

In [347]:
# subset1_clean_2 = remove_outliers(subset1_2, 'cosine_similarity_score', .05,.95)
# #subset1_clean_2 = remove_outliers(subset1_clean_2, 'length_similarity', .05,.95)
subset1_clean_2 = remove_outliers(subset1_2, 'overlap_score', .10,1.0)
# subset1_clean_2 = remove_outliers(subset1_clean_2, 'overlap2_score', .05,.95)
subset1_clean_2 = remove_outliers(subset1_clean_2, 'cosine/length_ratio', .10,1.0)
# #subset1_clean_2 = remove_outliers(subset1_clean_2, 'cosine_similarity_score2', .05,.95)
subset1_clean_2 = remove_outliers(subset1_clean_2, 'jaccard_similarity_score', .10,1.0)
subset1_clean_2 = remove_outliers(subset1_clean_2, 'lemma_jaccard_score', .10,1.0)
subset1_clean_2 = remove_outliers(subset1_clean_2, 'overall_sim_score', .10,1.0)
# subset1_clean_2 = remove_outliers(subset1_clean_2, 'overall_similarity_combined_semantic', .05,.95)
# subset1_clean_2 = remove_outliers(subset1_clean_2, 'overall_similarity_wup_semantic', .05,.95)
# subset1_clean_2 = remove_outliers(subset1_clean_2, 'overall_similarity_path_semantic', .05,.95)
# subset1_clean_2 = remove_outliers(subset1_clean_2,'jaccard_distance_bigrams',.05,.95)
# subset1_clean_2 = remove_outliers(subset1_clean_2,'cosine_similarity_bigrams',.05,.95)
# subset1_clean_2 = remove_outliers(subset1_clean_2,'bigram_similarity',.05,.95)
# subset1_clean_2 = remove_outliers(subset1_clean_2,'jaccard_distance_trigrams',.05,.95)
subset1_clean_2 = remove_outliers(subset1_clean_2,'cosine_similarity_trigrams',.10,1.0)
# subset1_clean_2 = remove_outliers(subset1_clean_2,'trigram_similarity',.05,.95)
# subset1_clean_2 = remove_outliers(subset1_clean_2,'jaccard_distance_quadgrams',.05,.95)
# subset1_clean_2 = remove_outliers(subset1_clean_2,'cosine_similarity_quadgrams',.05,.95)
# subset1_clean_2 = remove_outliers(subset1_clean_2,'quadgram_similarity',.05,.95)
len(subset1_clean_2)

1641

In [348]:
new_train_df_clean_2 = subset0_clean_2.append(subset1_clean_2)
new_train_df_clean_2

Unnamed: 0,classification,lemma_jaccard_score,overall_sim_score,cosine/length_ratio,cosine_similarity_trigrams,jaccard_similarity_score,overlap_score
3,0,0.307692,0.463710,0.290081,0.158114,0.285714,0.792208
7,0,0.312500,0.514166,0.404796,0.119523,0.352941,0.861538
8,0,0.421053,0.590351,0.442105,0.226134,0.450000,0.812030
11,0,0.307692,0.481292,0.284605,0.154303,0.187500,0.966667
17,0,0.315789,0.457632,0.274986,0.000000,0.350000,0.688889
...,...,...,...,...,...,...,...
4062,1,0.611111,0.736979,0.726184,0.240192,0.631579,0.950000
4063,1,0.812500,0.854779,0.876714,0.692308,0.812500,1.000000
4072,1,0.583333,0.684357,0.604367,0.239046,0.583333,0.777778
4073,1,0.466667,0.644444,0.714286,0.200000,0.466667,1.000000


## Preprocessing of Missing Training Dataset:

In [136]:
missing_train = pd.read_csv("../Data/train_with_label.txt", error_bad_lines = False, warn_bad_lines = False, engine = 'python', header = None, sep='\t', )
missing_train = missing_train.rename(columns={0: "id", 1: "sentence1", 2: "sentence2", 3: "classification"})
missing_train["classification"] = pd.to_numeric(missing_train["classification"])
missing_train.drop_duplicates(inplace = True)

Unnamed: 0,id,sentence1,sentence2,classification
0,train_id_0,The Democratic candidates also began announcin...,The Democratic candidates also began announcin...,1.0
1,train_id_1,The woman was exposed to the SARS virus while ...,The woman was exposed to the SARS virus while ...,1.0
2,train_id_2,He said the problem needs to be corrected befo...,He said the prob lem needs to be corrected bef...,1.0
3,train_id_3,A representative for Phoenix-based U-Haul decl...,"Anthony Citrano , a representative for WhenU ,...",0.0
4,train_id_4,The biggest threat to order seemed to be looti...,The biggest threat to order seemed to be looti...,1.0
...,...,...,...,...
3488,train_id_4072,"Axelrod died in his sleep of heart failure , s...",Axelrod died of heart failure while asleep at ...,1.0
3489,train_id_4073,"Saddam 's other son , Odai , surrendered Frida...","Hussein 's other son , Uday , surrendered yest...",1.0
3490,train_id_4074,If Senator Clinton does decide to run in 2008 ...,If Mrs Clinton does decide to contest the 2008...,1.0
3491,train_id_4075,"The Iranian refugee who sewed up his eyes , li...","An Iranian Kurd who stitched up his eyes , lip...",1.0


In [137]:
missing = pd.concat([train_df,missing_train]).drop_duplicates(subset = ['id','id'], keep=False)
missing

Unnamed: 0,id,sentence1,sentence2,classification,Text_Cleaned1,lemmatized_text1,Text_Cleaned2,lemmatized_text2,cosine_similarity_score,length_similarity,overlap_score,overlap2_score,cosine/length_ratio,cosine_similarity_score2,jaccard_similarity_score,lemma_jaccard_score,overall_sim_score,overall_similarity_path_semantic,overall_similarity_wup_semantic,overall_similarity_combined_semantic
8,train_id_8,""" We see the First Amendment to protect religi...",""" We put the call out , "" said the Rev. Patric...",0.0,"[see, first, amendment, protect, religious, li...","[see, first, amendment, protect, religious, li...","[put, call, said, rev, patrick, j, mahoney, di...","[put, call, said, rev, patrick, j, mahoney, di...",0.471940,0.687500,0.198864,0.259259,0.324459,0.876460,0.388889,0.388889,0.551413,0.736895,0.584127,0.653810
16,train_id_16,""" Due to economic and creative realities , man...",""" Due to economic and creative realities , man...",1.0,"[due, economic, creative, realities, many, key...","[due, economic, creative, reality, many, key, ...","[due, economic, creative, realities, many, key...","[due, economic, creative, reality, many, key, ...",0.741249,0.928571,0.054945,0.370370,0.688303,0.963624,0.588235,0.588235,0.713365,0.696502,0.622399,0.651733
19,train_id_19,""" Tomorrow at the Mission Inn , I have the opp...",""" I have the opportunity to congratulate the g...",0.0,"[tomorrow, mission, inn, opportunity, congratu...","[tomorrow, mission, inn, opportunity, congratu...","[opportunity, congratulate, governor, elect, g...","[opportunity, congratulate, governor, elect, g...",0.737865,0.900000,0.077778,0.368421,0.664078,0.948683,0.583333,0.583333,0.705117,0.907483,0.729825,0.818654
34,train_id_34,""" These despicable acts were committed by kill...",These despicable acts were committed by killer...,0.0,"[despicable, acts, committed, killers, whose, ...","[despicable, act, committed, killer, whose, fa...","[despicable, acts, committed, killers, whose, ...","[despicable, act, committed, killer, whose, fa...",0.788241,0.933333,0.052381,0.379310,0.735691,0.970143,0.588235,0.588235,0.715538,0.778325,0.764368,0.769622
35,train_id_35,""" It is about a third of what I owe in the wor...","It ain 't coming to me , but it 's only about ...",0.0,"[third, owe, world, told, reporters]","[third, owe, world, told, reporter]","[coming, third, owe, world]","[coming, third, owe, world]",0.670820,0.800000,0.150000,0.333333,0.536656,0.894427,0.500000,0.500000,0.631476,0.651852,0.519136,0.581397
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4018,train_id_4018,""" We put a lot of effort and energy into impro...",""" We 've put a lot of effort and energy into i...",0.0,"[put, lot, effort, energy, improving, patching...","[put, lot, effort, energy, improving, patching...","[put, lot, effort, energy, improving, patching...","[put, lot, effort, energy, improving, patching...",0.769800,0.750000,0.222222,0.380952,0.577350,0.866025,0.615385,0.615385,0.698932,0.959064,0.853175,0.905458
4022,train_id_4022,""" At this point , Mr. Brando announced : ' Som...","Brando said that "" somebody ought to put a bul...",1.0,"[point, mr, brando, announced, somebody, ought...","[point, mr, brando, announced, somebody, ought...","[brando, said, somebody, ought, put, bullet, h...","[brando, said, somebody, ought, put, bullet, h...",0.603023,0.818182,0.121212,0.300000,0.493382,0.904534,0.428571,0.428571,0.587226,0.578291,0.556667,0.561429
4033,train_id_4033,""" Saddam is gone , but we want the ( U.S. ) oc...",""" Saddam is gone , but we want the ( U.S. ) oc...",1.0,"[saddam, gone, want, u, occupation, end, said,...","[saddam, gone, want, u, occupation, end, said,...","[saddam, gone, want, u, occupation, end]","[saddam, gone, want, u, occupation, end]",0.738549,0.545455,0.454545,0.352941,0.402845,0.738549,0.545455,0.545455,0.609819,0.927493,0.782680,0.855086
4034,train_id_4034,"Dr. Anthony Fauci , director of the National I...",""" We have been somewhat lucky , "" said Dr. Ant...",0.0,"[dr, anthony, fauci, director, national, insti...","[dr, anthony, fauci, director, national, insti...","[somewhat, lucky, said, dr, anthony, fauci, di...","[somewhat, lucky, said, dr, anthony, fauci, di...",0.821584,0.833333,0.150000,0.409091,0.684653,0.912871,0.692308,0.692308,0.765829,0.607792,0.571212,0.587987


In [138]:
# We now let missing_train equal the difference between train_df and missing so that we do not need to repeat all the preprocessing:
missing_train = pd.concat([train_df,missing]).drop_duplicates(subset = ['id','id'], keep=False)
missing_train

Unnamed: 0,id,sentence1,sentence2,classification,Text_Cleaned1,lemmatized_text1,Text_Cleaned2,lemmatized_text2,cosine_similarity_score,length_similarity,overlap_score,overlap2_score,cosine/length_ratio,cosine_similarity_score2,jaccard_similarity_score,lemma_jaccard_score,overall_sim_score,overall_similarity_path_semantic,overall_similarity_wup_semantic,overall_similarity_combined_semantic
0,train_id_0,The Democratic candidates also began announcin...,The Democratic candidates also began announcin...,1.0,"[democratic, candidates, also, began, announci...","[democratic, candidate, also, began, announcin...","[democratic, candidates, also, began, announci...","[democratic, candidate, also, began, announcin...",0.909509,0.941176,0.055147,0.454545,0.856008,0.970143,0.833333,0.833333,0.878936,0.919913,0.915825,0.917869
1,train_id_1,The woman was exposed to the SARS virus while ...,The woman was exposed to the SARS virus while ...,1.0,"[woman, exposed, sars, virus, hospital, health...","[woman, exposed, sars, virus, hospital, health...","[woman, exposed, sars, virus, hospital, health...","[woman, exposed, sars, virus, hospital, health...",0.904534,0.888889,0.111111,0.470588,0.804030,0.954786,0.937500,0.937500,0.943262,0.870098,0.877451,0.873775
2,train_id_2,He said the problem needs to be corrected befo...,He said the prob lem needs to be corrected bef...,1.0,"[said, problem, needs, corrected, space, shutt...","[said, problem, need, corrected, space, shuttl...","[said, prob, lem, needs, corrected, space, shu...","[said, prob, lem, need, corrected, space, shut...",0.666667,1.000000,0.000000,0.333333,0.666667,1.000000,0.500000,0.636364,0.712121,0.825926,0.741799,0.780159
3,train_id_3,A representative for Phoenix-based U-Haul decl...,"Anthony Citrano , a representative for WhenU ,...",0.0,"[representative, phoenix, based, u, haul, decl...","[representative, phoenix, based, u, haul, decl...","[anthony, citrano, representative, whenu, decl...","[anthony, citrano, representative, whenu, decl...",0.455842,0.636364,0.207792,0.222222,0.290081,0.797724,0.285714,0.285714,0.456384,0.681597,0.526984,0.577681
4,train_id_4,The biggest threat to order seemed to be looti...,The biggest threat to order seemed to be looti...,1.0,"[biggest, threat, order, seemed, looting, crim...","[biggest, threat, order, seemed, looting, crim...","[biggest, threat, order, seemed, looting, crim...","[biggest, threat, order, seemed, looting, crim...",0.721688,0.750000,0.208333,0.357143,0.541266,0.866025,0.555556,0.555556,0.659046,0.818681,0.794643,0.806662
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4072,train_id_4072,"Axelrod died in his sleep of heart failure , s...",Axelrod died of heart failure while asleep at ...,1.0,"[axelrod, died, sleep, heart, failure, said, d...","[axelrod, died, sleep, heart, failure, said, d...","[axelrod, died, heart, failure, asleep, los, a...","[axelrod, died, heart, failure, asleep, los, a...",0.805823,0.750000,0.222222,0.380952,0.604367,0.886405,0.583333,0.583333,0.684357,0.534127,0.485714,0.504819
4073,train_id_4073,"Saddam 's other son , Odai , surrendered Frida...","Hussein 's other son , Uday , surrendered yest...",1.0,"[saddam, son, odai, surrendered, friday, ameri...","[saddam, son, odai, surrendered, friday, ameri...","[hussein, son, uday, surrendered, yesterday, a...","[hussein, son, uday, surrendered, yesterday, a...",0.600000,1.000000,0.000000,0.300000,0.600000,1.000000,0.428571,0.538462,0.655678,0.883333,0.883333,0.883333
4074,train_id_4074,If Senator Clinton does decide to run in 2008 ...,If Mrs Clinton does decide to contest the 2008...,1.0,"[senator, clinton, decide, run, 2008, cannot, ...","[senator, clinton, decide, run, 2008, cannot, ...","[mrs, clinton, decide, contest, 2008, election...","[mr, clinton, decide, contest, 2008, election,...",0.819892,0.923077,0.064103,0.400000,0.756823,0.968963,0.714286,0.714286,0.799178,0.791613,0.740000,0.758261
4075,train_id_4075,"The Iranian refugee who sewed up his eyes , li...","An Iranian Kurd who stitched up his eyes , lip...",1.0,"[iranian, refugee, sewed, eyes, lips, ears, pr...","[iranian, refugee, sewed, eye, lip, ear, prote...","[iranian, kurd, stitched, eyes, lips, ears, pr...","[iranian, kurd, stitched, eye, lip, ear, prote...",0.560449,0.923077,0.044872,0.280000,0.517337,0.960769,0.388889,0.388889,0.579516,0.700889,0.562857,0.631617


## Preprocessing of Missing Training Dataset with Outliers Removed:

In [349]:
subset0Missing = missing_train.loc[missing_train['classification'] == 0]
subset1Missing = missing_train.loc[missing_train['classification'] == 1]

In [350]:
# subset0_clean_Missing = remove_outliers(subset0Missing, 'cosine_similarity_score', .05,.95)
# #subset0_clean_Missing = remove_outliers(subset0_clean_Missing, 'length_similarity', .05,.95)
subset0_clean_Missing = remove_outliers(subset0Missing, 'overlap_score', .00,.90)
# subset0_clean_Missing = remove_outliers(subset0_clean_Missing, 'overlap2_score', .05,.95)
subset0_clean_Missing = remove_outliers(subset0_clean_Missing, 'cosine/length_ratio', .00,.90)
# #subset0_clean_Missing = remove_outliers(subset0_clean_Missing, 'cosine_similarity_score2', .05,.95)
subset0_clean_Missing = remove_outliers(subset0_clean_Missing, 'jaccard_similarity_score', .00,.90)
subset0_clean_Missing = remove_outliers(subset0_clean_Missing, 'lemma_jaccard_score', .00,.90)
subset0_clean_Missing = remove_outliers(subset0_clean_Missing, 'overall_sim_score', .00,.90)
# subset0_clean_Missing = remove_outliers(subset0_clean_Missing, 'overall_similarity_combined_semantic', .05,.95)
# subset0_clean_Missing = remove_outliers(subset0_clean_Missing, 'overall_similarity_wup_semantic', .05,.95)
# subset0_clean_Missing = remove_outliers(subset0_clean_Missing, 'overall_similarity_path_semantic', .05,.95)
# subset0_clean_Missing = remove_outliers(subset0_clean_Missing,'jaccard_distance_bigrams',.05,.95)
# subset0_clean_Missing = remove_outliers(subset0_clean_Missing,'cosine_similarity_bigrams',.05,.95)
# subset0_clean_Missing = remove_outliers(subset0_clean_Missing,'bigram_similarity',.05,.95)
# subset0_clean_Missing = remove_outliers(subset0_clean_Missing,'jaccard_distance_trigrams',.05,.95)
subset0_clean_Missing = remove_outliers(subset0_clean_Missing,'cosine_similarity_trigrams', .00,.90)
# subset0_clean_Missing = remove_outliers(subset0_clean_Missing,'trigram_similarity',.05,.95)
# subset0_clean_Missing = remove_outliers(subset0_clean_Missing,'jaccard_distance_quadgrams',.05,.95)
# subset0_clean_Missing = remove_outliers(subset0_clean_Missing,'cosine_similarity_quadgrams',.05,.95)
# subset0_clean_Missing = remove_outliers(subset0_clean_Missing,'quadgram_similarity',.05,.95)
len(subset0_clean_Missing)

562

In [351]:
# subset1_clean_Missing = remove_outliers(subset1Missing, 'cosine_similarity_score', .05,.95)
# #subset1_clean_Missing = remove_outliers(subset1_clean_Missing, 'length_similarity', .05,.95)
subset1_clean_Missing = remove_outliers(subset1Missing, 'overlap_score', .10,1.0)
# subset1_clean_Missing = remove_outliers(subset1_clean_Missing, 'overlap2_score', .05,.95)
subset1_clean_Missing = remove_outliers(subset1_clean_Missing, 'cosine/length_ratio', .10,1.0)
# #subset1_clean_Missing = remove_outliers(subset1_clean_Missing, 'cosine_similarity_score2', .05,.95)
subset1_clean_Missing = remove_outliers(subset1_clean_Missing, 'jaccard_similarity_score', .10,1.0)
subset1_clean_Missing = remove_outliers(subset1_clean_Missing, 'lemma_jaccard_score', .10,1.0)
subset1_clean_Missing = remove_outliers(subset1_clean_Missing, 'overall_sim_score', .10,1.0)
# subset1_clean_Missing = remove_outliers(subset1_clean_Missing, 'overall_similarity_combined_semantic', .05,.95)
# subset1_clean_Missing = remove_outliers(subset1_clean_Missing, 'overall_similarity_wup_semantic', .05,.95)
# subset1_clean_Missing = remove_outliers(subset1_clean_Missing, 'overall_similarity_path_semantic', .05,.95)
# subset1_clean_Missing = remove_outliers(subset1_clean_Missing,'jaccard_distance_bigrams',.05,.95)
# subset1_clean_Missing = remove_outliers(subset1_clean_Missing,'cosine_similarity_bigrams',.05,.95)
# subset1_clean_Missing = remove_outliers(subset1_clean_Missing,'bigram_similarity',.05,.95)
# subset1_clean_Missing = remove_outliers(subset1_clean_Missing,'jaccard_distance_trigrams',.05,.95)
subset1_clean_Missing = remove_outliers(subset1_clean_Missing,'cosine_similarity_trigrams',.10,1.0)
# subset1_clean_Missing = remove_outliers(subset1_clean_Missing,'trigram_similarity',.05,.95)
# subset1_clean_Missing = remove_outliers(subset1_clean_Missing,'jaccard_distance_quadgrams',.05,.95)
# subset1_clean_Missing = remove_outliers(subset1_clean_Missing,'cosine_similarity_quadgrams',.05,.95)
# subset1_clean_Missing = remove_outliers(subset1_clean_Missing,'quadgram_similarity',.05,.95)
len(subset1_clean_Missing)

1392

In [352]:
new_train_df_clean_Missing = subset0_clean_Missing.append(subset1_clean_Missing)
new_train_df_clean_Missing

Unnamed: 0,classification,lemma_jaccard_score,overall_sim_score,cosine/length_ratio,cosine_similarity_trigrams,jaccard_similarity_score,overlap_score
3,0.0,0.285714,0.456384,0.290081,0.149071,0.285714,0.792208
7,0.0,0.411765,0.530084,0.362209,0.204124,0.333333,0.828571
10,0.0,0.200000,0.431476,0.268328,0.000000,0.200000,0.925000
15,0.0,0.380952,0.472186,0.360060,0.000000,0.380952,0.700000
19,0.0,0.428571,0.513111,0.341096,0.314270,0.428571,0.631818
...,...,...,...,...,...,...,...
3479,1.0,0.833333,0.870857,0.890264,0.733333,0.833333,1.000000
3480,1.0,0.500000,0.643223,0.493007,0.335410,0.500000,0.883333
3488,1.0,0.583333,0.684357,0.604367,0.239046,0.583333,0.777778
3489,1.0,0.538462,0.655678,0.600000,0.375000,0.428571,1.000000


## Preprocessing of 50-50 Randomly Removing Majority Class for Missing Training Dataset:

In [143]:
missing_train['classification'].value_counts()/missing_train.shape[0]

1.0    0.74091
0.0    0.25909
Name: classification, dtype: float64

In [144]:
print(len(subset0Missing))
print(len(subset1Missing))

905
2588


In [146]:
subset1SampleMissing = subset1Missing.sample(n=905,random_state=42)
new_train_df_Missing = subset0Missing.append(subset1SampleMissing)
new_train_df_Missing

Unnamed: 0,id,sentence1,sentence2,classification,Text_Cleaned1,lemmatized_text1,Text_Cleaned2,lemmatized_text2,cosine_similarity_score,length_similarity,overlap_score,overlap2_score,cosine/length_ratio,cosine_similarity_score2,jaccard_similarity_score,lemma_jaccard_score,overall_sim_score,overall_similarity_path_semantic,overall_similarity_wup_semantic,overall_similarity_combined_semantic
3,train_id_3,A representative for Phoenix-based U-Haul decl...,"Anthony Citrano , a representative for WhenU ,...",0.0,"[representative, phoenix, based, u, haul, decl...","[representative, phoenix, based, u, haul, decl...","[anthony, citrano, representative, whenu, decl...","[anthony, citrano, representative, whenu, decl...",0.455842,0.636364,0.207792,0.222222,0.290081,0.797724,0.285714,0.285714,0.456384,0.681597,0.526984,0.577681
5,train_id_5,Crews worked to install a new culvert and prep...,Crews worked to install a new culvert and repa...,0.0,"[crews, worked, install, new, culvert, prepare...","[crew, worked, install, new, culvert, prepare,...","[crews, worked, install, new, culvert, repave,...","[crew, worked, install, new, culvert, repave, ...",0.784465,0.722222,0.256410,0.387097,0.566558,0.849837,0.631579,0.631579,0.704331,0.880645,0.860215,0.870430
7,train_id_7,"It will cost about $ 20,000 per eight-week cou...","It will cost about $ 20,000 per average course...",0.0,"[cost, 20, 000, per, eight, week, course, trea...","[cost, 20, 000, per, eight, week, course, trea...","[cost, 20, 000, per, average, course, treatmen...","[cost, 20, 000, per, average, course, treatmen...",0.507093,0.714286,0.171429,0.250000,0.362209,0.845154,0.333333,0.411765,0.530084,0.706279,0.581481,0.637763
11,train_id_11,A federal judge ruled that the monument violat...,The federal courts have ruled that the monumen...,0.0,"[federal, judge, ruled, monument, violated, la...","[federal, judge, ruled, monument, violated, la...","[federal, courts, ruled, monument, violates, c...","[federal, court, ruled, monument, violates, co...",0.335410,0.800000,0.075000,0.166667,0.268328,0.894427,0.200000,0.200000,0.431476,0.600000,0.475926,0.537963
17,train_id_17,The benchmark 10-year note US10YT = RR slipped...,The yield on the 10-year Treasury note rose to...,0.0,"[benchmark, 10, year, note, us10yt, rr, slippe...","[benchmark, 10, year, note, us10yt, rr, slippe...","[yield, 10, year, treasury, note, rose, 4, 46,...","[yield, 10, year, treasury, note, rose, 4, 46,...",0.600099,0.600000,0.300000,0.281250,0.360060,0.654654,0.380952,0.380952,0.472186,0.878325,0.732465,0.805395
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3050,train_id_3050,"By 10 p.m. , Claudette was centered about 320 ...","Early Monday , the center of Claudette was abo...",1.0,"[10, p, claudette, centered, 320, miles, east,...","[10, p, claudette, centered, 320, mile, east, ...","[early, monday, center, claudette, 300, miles,...","[early, monday, center, claudette, 300, mile, ...",0.731925,0.947368,0.038012,0.351351,0.693403,0.927105,0.521739,0.590909,0.679918,0.858281,0.801351,0.829816
98,train_id_98,Market sentiment was subdued after Internation...,Market sentiment was also cautious after Inter...,1.0,"[market, sentiment, subdued, international, bu...","[market, sentiment, subdued, international, bu...","[market, sentiment, also, cautious, internatio...","[market, sentiment, also, cautious, internatio...",0.723339,0.894737,0.080495,0.361111,0.647198,0.945905,0.565217,0.565217,0.692113,0.860529,0.773677,0.815251
607,train_id_607,Clijsters was simply too complete and powerful...,Clijsters was simply too powerful for Spanish ...,1.0,"[clijsters, simply, complete, powerful, spanis...","[clijsters, simply, complete, powerful, spanis...","[clijsters, simply, powerful, spanish, veteran...","[clijsters, simply, powerful, spanish, veteran...",0.868599,0.857143,0.130952,0.423077,0.744513,0.868599,0.714286,0.714286,0.765723,0.837912,0.819414,0.826127
1505,train_id_1505,"The companies , Chiron and Aventis Pasteur , t...",Chiron and Aventis Pasteur together made about...,1.0,"[companies, chiron, aventis, pasteur, together...","[company, chiron, aventis, pasteur, together, ...","[chiron, aventis, pasteur, together, made, 80,...","[chiron, aventis, pasteur, together, made, 80,...",0.751469,0.882353,0.094118,0.375000,0.663061,0.939336,0.600000,0.600000,0.713112,0.744528,0.704315,0.715877


## Preprocessing of Normal Training Dataset with Other Preprocessing Method Version 3:

In [194]:
train_df3 = pd.read_csv("../Data/train_with_label.txt", delimiter = "r'\t", header = None, engine = 'python')
train_df3 = train_df3[0].str.split("\t", expand=True)
train_df3 = train_df3.rename(columns={0: "id", 1: "sentence1", 2: "sentence2", 3: "classification"})
train_df3["classification"] = pd.to_numeric(train_df3["classification"])
train_df3.drop_duplicates(inplace = True)
train_df3

#Text Cleaning Features:
train_df3['Text_Cleaned1'] = list(map(thePreprocessorNoLemma, train_df3.sentence1))
train_df3['Text_Cleaned2'] = list(map(thePreprocessorNoLemma, train_df3.sentence2))
train_df3['lemmatized_text1'] = list(map(thePreprocessorLemma, train_df3.sentence1))
train_df3['lemmatized_text2'] = list(map(thePreprocessorLemma, train_df3.sentence2))

In [196]:
#Syntactic Features:
train_df3['cosine_similarity_score'] = list(map(counter_cosine_similarity, train_df3.Text_Cleaned1, train_df3.Text_Cleaned2))
train_df3['length_similarity'] = list(map(length_similarity, train_df3.Text_Cleaned1, train_df3.Text_Cleaned2))
train_df3['overlap_score'] = list(map(overlap_score, train_df3.Text_Cleaned1, train_df3.Text_Cleaned2))
train_df3['overlap2_score'] = list(map(overlap2_score, train_df3.Text_Cleaned1, train_df3.Text_Cleaned2))
train_df3['cosine/length_ratio'] = list(map(similarity_score, train_df3.length_similarity, train_df3.cosine_similarity_score))
train_df3['cosine_similarity_score2'] = list(map(compute_cosine_similarity, train_df3.Text_Cleaned1, train_df3.Text_Cleaned2))
train_df3['jaccard_similarity_score'] = list(map(compute_jaccard_similarity, train_df3.Text_Cleaned1, train_df3.Text_Cleaned2))
train_df3['lemma_jaccard_score'] = list(map(compute_lemma_jaccard_similarity, train_df3.lemmatized_text1, train_df3.lemmatized_text2))
train_df3['overall_sim_score'] = list(map(overall_similarity_combined, train_df3.Text_Cleaned1, train_df3.Text_Cleaned2, train_df3.lemmatized_text1, train_df3.lemmatized_text2))

In [197]:
#Semantic Features:
pdf1 = train_df3.iloc[:800]
pdf2 = train_df3.iloc[800:1600]
pdf3 = train_df3.iloc[1600:2400]
pdf4 = train_df3.iloc[2400:3200]
pdf5 = train_df3.iloc[3200:4077]

In [198]:
pdf1['scores'] = list(map(semantic_similarities, pdf1.lemmatized_text1, pdf1.lemmatized_text2))

In [199]:
pdf2['scores'] = list(map(semantic_similarities, pdf2.lemmatized_text1, pdf2.lemmatized_text2))

In [200]:
pdf3['scores'] = list(map(semantic_similarities, pdf3.lemmatized_text1, pdf3.lemmatized_text2))

In [201]:
pdf4['scores'] = list(map(semantic_similarities, pdf4.lemmatized_text1, pdf4.lemmatized_text2))

In [202]:
pdf5['scores'] = list(map(semantic_similarities, pdf5.lemmatized_text1, pdf5.lemmatized_text2))

In [203]:
train_df3 = pdf1.append(pdf2).append(pdf3).append(pdf4).append(pdf5)
train_df3[['overall_similarity_path_semantic', 'overall_similarity_wup_semantic', 'overall_similarity_combined_semantic']] = pd.DataFrame(train_df3.scores.tolist(), index= train_df3.index)
train_df3.drop(['scores'], axis=1, inplace=True)
train_df3

Unnamed: 0,id,sentence1,sentence2,classification,Text_Cleaned1,Text_Cleaned2,lemmatized_text1,lemmatized_text2,cosine_similarity_score,length_similarity,overlap_score,overlap2_score,cosine/length_ratio,cosine_similarity_score2,jaccard_similarity_score,lemma_jaccard_score,overall_sim_score,overall_similarity_path_semantic,overall_similarity_wup_semantic,overall_similarity_combined_semantic
0,train_id_0,The Democratic candidates also began announcin...,The Democratic candidates also began announcin...,1,"[The, Democratic, candidates, also, began, ann...","[The, Democratic, candidates, also, began, ann...","[democratic, candidate, also, begin, announce,...","[democratic, candidate, also, begin, announce,...",0.882353,1.000000,0.000000,0.441176,0.882353,1.000000,0.789474,0.777778,0.855750,0.906994,0.902778,0.904886
1,train_id_1,The woman was exposed to the SARS virus while ...,The woman was exposed to the SARS virus while ...,1,"[The, woman, exposed, SARS, virus, hospital, h...","[The, woman, exposed, SARS, virus, hospital, h...","[woman, expose, sars, virus, hospital, health,...","[woman, expose, sars, virus, hospital, health-...",0.721688,0.900000,0.077778,0.368421,0.649519,0.962250,0.636364,0.600000,0.732871,0.914216,0.921569,0.917892
2,train_id_2,He said the problem needs to be corrected befo...,He said the prob lem needs to be corrected bef...,1,"[He, said, problem, needs, corrected, space, s...","[He, said, prob, lem, needs, corrected, space,...","[say, problem, need, correct, space, shuttle, ...","[say, prob, lem, need, correct, space, shuttle...",0.700000,1.000000,0.000000,0.350000,0.700000,1.000000,0.538462,0.636364,0.724942,0.792256,0.664021,0.721825
3,train_id_3,A representative for Phoenix-based U-Haul decl...,"Anthony Citrano , a representative for WhenU ,...",0,"[A, representative, Phoenix-based, U-Haul, dec...","[Anthony, Citrano, representative, WhenU, decl...","[representative, phoenix-based, u-haul, declin...","[anthony, citrano, representative, whenu, decl...",0.455842,0.636364,0.207792,0.222222,0.290081,0.797724,0.285714,0.307692,0.463710,0.613296,0.523529,0.561639
4,train_id_4,The biggest threat to order seemed to be looti...,The biggest threat to order seemed to be looti...,1,"[The, biggest, threat, order, seemed, looting,...","[The, biggest, threat, order, seemed, looting,...","[big, threat, order, seem, loot, crime, includ...","[big, threat, order, seem, loot, crime, includ...",0.739940,0.764706,0.199095,0.366667,0.565837,0.874475,0.578947,0.555556,0.669659,0.771062,0.737500,0.754281
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4072,train_id_4072,"Axelrod died in his sleep of heart failure , s...",Axelrod died of heart failure while asleep at ...,1,"[Axelrod, died, sleep, heart, failure, said, d...","[Axelrod, died, heart, failure, asleep, Los, A...","[axelrod, die, sleep, heart, failure, say, dau...","[axelrod, die, heart, failure, asleep, los, an...",0.805823,0.750000,0.222222,0.380952,0.604367,0.886405,0.583333,0.583333,0.684357,0.567460,0.493651,0.518707
4073,train_id_4073,"Saddam 's other son , Odai , surrendered Frida...","Hussein 's other son , Uday , surrendered yest...",1,"[Saddam, 's, son, Odai, surrendered, Friday, A...","[Hussein, 's, son, Uday, surrendered, yesterda...","[saddam, 's, son, odai, surrender, friday, ame...","[hussein, 's, son, uday, surrender, yesterday,...",0.714286,1.000000,0.000000,0.333333,0.714286,1.000000,0.466667,0.466667,0.644444,0.848856,0.847222,0.847222
4074,train_id_4074,If Senator Clinton does decide to run in 2008 ...,If Mrs Clinton does decide to contest the 2008...,1,"[If, Senator, Clinton, decide, run, 2008, anno...","[If, Mrs, Clinton, decide, contest, 2008, elec...","[senator, clinton, decide, run, 2008, announce...","[mr, clinton, decide, contest, 2008, election,...",0.800641,0.923077,0.064103,0.400000,0.739053,0.960769,0.666667,0.642857,0.756764,0.834569,0.789855,0.809704
4075,train_id_4075,"The Iranian refugee who sewed up his eyes , li...","An Iranian Kurd who stitched up his eyes , lip...",1,"[The, Iranian, refugee, sewed, eyes, lips, ear...","[An, Iranian, Kurd, stitched, eyes, lips, ears...","[iranian, refugee, sew, eye, lip, ear, protest...","[iranian, kurd, stitch, eye, lip, ear, protest...",0.518875,0.928571,0.038462,0.259259,0.481812,0.963624,0.350000,0.388889,0.567504,0.700889,0.562857,0.631617


## 50-50 Randomly Removing Majority Class Training Dataset with Other Preprocessing Method Version 3:

In [353]:
subset0_3 = train_df3.loc[train_df3['classification'] == 0]
subset1_3 = train_df3.loc[train_df3['classification'] == 1]
subset1Sample_3 = subset1_3.sample(n=1039,random_state=42)
new_train_df_3 = subset0_3.append(subset1Sample_3)

## Preprocessing of Normal Training Dataset with Other Preprocessing Method Version 3, Outliers Removed:

In [354]:
# subset0_clean_3 = remove_outliers(subset0_3, 'cosine_similarity_score', .05,.95)
# #subset0_clean_3 = remove_outliers(subset0_clean_3, 'length_similarity', .05,.95)
subset0_clean_3 = remove_outliers(subset0_3, 'overlap_score', .00,.90)
# subset0_clean_3 = remove_outliers(subset0_clean_3, 'overlap2_score', .05,.95)
subset0_clean_3 = remove_outliers(subset0_clean_3, 'cosine/length_ratio', .00,.90)
# #subset0_clean_3 = remove_outliers(subset0_clean_3, 'cosine_similarity_score2', .05,.95)
subset0_clean_3 = remove_outliers(subset0_clean_3, 'jaccard_similarity_score', .00,.90)
subset0_clean_3 = remove_outliers(subset0_clean_3, 'lemma_jaccard_score', .00,.90)
subset0_clean_3 = remove_outliers(subset0_clean_3, 'overall_sim_score', .00,.90)
# subset0_clean_3 = remove_outliers(subset0_clean_3, 'overall_similarity_combined_semantic', .05,.95)
# subset0_clean_3 = remove_outliers(subset0_clean_3, 'overall_similarity_wup_semantic', .05,.95)
# subset0_clean_3 = remove_outliers(subset0_clean_3, 'overall_similarity_path_semantic', .05,.95)
# subset0_clean_3 = remove_outliers(subset0_clean_3,'jaccard_distance_bigrams',.05,.95)
# subset0_clean_3 = remove_outliers(subset0_clean_3,'cosine_similarity_bigrams',.05,.95)
# subset0_clean_3 = remove_outliers(subset0_clean_3,'bigram_similarity',.05,.95)
# subset0_clean_3 = remove_outliers(subset0_clean_3,'jaccard_distance_trigrams',.05,.95)
subset0_clean_3 = remove_outliers(subset0_clean_3,'cosine_similarity_trigrams', .00,.90)
# subset0_clean_3 = remove_outliers(subset0_clean_3,'trigram_similarity',.05,.95)
# subset0_clean_3 = remove_outliers(subset0_clean_3,'jaccard_distance_quadgrams',.05,.95)
# subset0_clean_3 = remove_outliers(subset0_clean_3,'cosine_similarity_quadgrams',.05,.95)
# subset0_clean_3 = remove_outliers(subset0_clean_3,'quadgram_similarity',.05,.95)
len(subset0_clean_3)

628

In [355]:
# subset1_clean_3 = remove_outliers(subset1_3, 'cosine_similarity_score', .05,.95)
# #subset1_clean_3 = remove_outliers(subset1_clean_3, 'length_similarity', .05,.95)
subset1_clean_3 = remove_outliers(subset1_3, 'overlap_score', .10,1.0)
# subset1_clean_3 = remove_outliers(subset1_clean_3, 'overlap2_score', .05,.95)
subset1_clean_3 = remove_outliers(subset1_clean_3, 'cosine/length_ratio', .10,1.0)
# #subset1_clean_3 = remove_outliers(subset1_clean_3, 'cosine_similarity_score2', .05,.95)
subset1_clean_3 = remove_outliers(subset1_clean_3, 'jaccard_similarity_score', .10,1.0)
subset1_clean_3 = remove_outliers(subset1_clean_3, 'lemma_jaccard_score', .10,1.0)
subset1_clean_3 = remove_outliers(subset1_clean_3, 'overall_sim_score', .10,1.0)
# subset1_clean_3 = remove_outliers(subset1_clean_3, 'overall_similarity_combined_semantic', .05,.95)
# subset1_clean_3 = remove_outliers(subset1_clean_3, 'overall_similarity_wup_semantic', .05,.95)
# subset1_clean_3 = remove_outliers(subset1_clean_3, 'overall_similarity_path_semantic', .05,.95)
# subset1_clean_3 = remove_outliers(subset1_clean_3,'jaccard_distance_bigrams',.05,.95)
# subset1_clean_3 = remove_outliers(subset1_clean_3,'cosine_similarity_bigrams',.05,.95)
# subset1_clean_3 = remove_outliers(subset1_clean_3,'bigram_similarity',.05,.95)
# subset1_clean_3 = remove_outliers(subset1_clean_3,'jaccard_distance_trigrams',.05,.95)
subset1_clean_3 = remove_outliers(subset1_clean_3,'cosine_similarity_trigrams',.10,1.0)
# subset1_clean_3 = remove_outliers(subset1_clean_3,'trigram_similarity',.05,.95)
# subset1_clean_3 = remove_outliers(subset1_clean_3,'jaccard_distance_quadgrams',.05,.95)
# subset1_clean_3 = remove_outliers(subset1_clean_3,'cosine_similarity_quadgrams',.05,.95)
# subset1_clean_3 = remove_outliers(subset1_clean_3,'quadgram_similarity',.05,.95)
len(subset1_clean_3)

1641

In [356]:
new_train_df_clean_3 = subset0_clean_3.append(subset1_clean_3)
new_train_df_clean_3

Unnamed: 0,classification,lemma_jaccard_score,overall_sim_score,cosine/length_ratio,cosine_similarity_trigrams,jaccard_similarity_score,overlap_score
3,0,0.307692,0.463710,0.290081,0.158114,0.285714,0.792208
7,0,0.312500,0.514166,0.404796,0.119523,0.352941,0.861538
8,0,0.421053,0.590351,0.442105,0.226134,0.450000,0.812030
11,0,0.307692,0.481292,0.284605,0.154303,0.187500,0.966667
17,0,0.315789,0.457632,0.274986,0.000000,0.350000,0.688889
...,...,...,...,...,...,...,...
4062,1,0.611111,0.736979,0.726184,0.240192,0.631579,0.950000
4063,1,0.812500,0.854779,0.876714,0.692308,0.812500,1.000000
4072,1,0.583333,0.684357,0.604367,0.239046,0.583333,0.777778
4073,1,0.466667,0.644444,0.714286,0.200000,0.466667,1.000000


## Preprocessing of 50-50 Randomly Removing Majority Class and Outlier Removal for Normal Training Dataset:

In [357]:
subset1SampleNormalOutlier = subset1_clean.sample(n=len(subset0_clean), random_state=42)
new_train_df_clean_randomUnder = subset0_clean.append(subset1SampleNormalOutlier)
new_train_df_clean_randomUnder

Unnamed: 0,classification,lemma_jaccard_score,overall_sim_score,cosine/length_ratio,cosine_similarity_trigrams,jaccard_similarity_score,overlap_score
3,0,0.285714,0.456384,0.290081,0.149071,0.285714,0.792208
7,0,0.411765,0.530084,0.362209,0.204124,0.333333,0.828571
8,0,0.388889,0.551413,0.324459,0.267261,0.388889,0.801136
11,0,0.200000,0.431476,0.268328,0.000000,0.200000,0.925000
17,0,0.380952,0.472186,0.360060,0.000000,0.380952,0.700000
...,...,...,...,...,...,...,...
101,1,0.615385,0.698932,0.577350,0.717137,0.615385,0.777778
3964,1,0.500000,0.639955,0.566072,0.201008,0.500000,0.888112
333,1,0.529412,0.638391,0.513809,0.462250,0.529412,0.781818
3889,1,0.875000,0.890619,0.865181,0.759072,0.823529,0.944853


## Preprocessing of 50-50 Randomly Removing Majority Class and Outlier Removal for Training Dataset with Other Preprocessing:

In [358]:
subset1SampleOutlier_2 = subset1_clean_2.sample(n=len(subset0_clean_2), random_state=42)
new_train_df_2_clean_randomUnder = subset0_clean_2.append(subset1SampleOutlier_2)
new_train_df_2_clean_randomUnder

Unnamed: 0,classification,lemma_jaccard_score,overall_sim_score,cosine/length_ratio,cosine_similarity_trigrams,jaccard_similarity_score,overlap_score
3,0,0.307692,0.463710,0.290081,0.158114,0.285714,0.792208
7,0,0.312500,0.514166,0.404796,0.119523,0.352941,0.861538
8,0,0.421053,0.590351,0.442105,0.226134,0.450000,0.812030
11,0,0.307692,0.481292,0.284605,0.154303,0.187500,0.966667
17,0,0.315789,0.457632,0.274986,0.000000,0.350000,0.688889
...,...,...,...,...,...,...,...
3761,1,0.692308,0.745518,0.653754,0.714435,0.642857,0.805195
1511,1,0.642857,0.704281,0.622276,0.288675,0.562500,0.833333
725,1,0.571429,0.685243,0.608581,0.223607,0.571429,0.866667
3299,1,0.846154,0.866794,0.833008,0.381385,0.857143,0.934066


## Preprocessing of 50-50 Randomly Removing Majority Class and Outlier Removal for Training Dataset with Other Preprocessing Version 3:

In [359]:
subset1SampleOutlier_3 = subset1_clean_3.sample(n=len(subset0_clean_3), random_state=42)
new_train_df_3_clean_randomUnder = subset0_clean_3.append(subset1SampleOutlier_3)
new_train_df_3_clean_randomUnder

Unnamed: 0,classification,lemma_jaccard_score,overall_sim_score,cosine/length_ratio,cosine_similarity_trigrams,jaccard_similarity_score,overlap_score
3,0,0.307692,0.463710,0.290081,0.158114,0.285714,0.792208
7,0,0.312500,0.514166,0.404796,0.119523,0.352941,0.861538
8,0,0.421053,0.590351,0.442105,0.226134,0.450000,0.812030
11,0,0.307692,0.481292,0.284605,0.154303,0.187500,0.966667
17,0,0.315789,0.457632,0.274986,0.000000,0.350000,0.688889
...,...,...,...,...,...,...,...
3761,1,0.692308,0.745518,0.653754,0.714435,0.642857,0.805195
1511,1,0.642857,0.704281,0.622276,0.288675,0.562500,0.833333
725,1,0.571429,0.685243,0.608581,0.223607,0.571429,0.866667
3299,1,0.846154,0.866794,0.833008,0.381385,0.857143,0.934066


## Preprocessing of 50-50 Randomly Removing Majority Class and Outlier Removal for Missing Training Dataset:

In [360]:
subset1SampleMissingOutlier = subset1_clean_Missing.sample(n=len(subset0_clean_Missing), random_state=42)
missing_clean_randomUnder = subset0_clean_Missing.append(subset1SampleMissingOutlier)
missing_clean_randomUnder

Unnamed: 0,classification,lemma_jaccard_score,overall_sim_score,cosine/length_ratio,cosine_similarity_trigrams,jaccard_similarity_score,overlap_score
3,0.0,0.285714,0.456384,0.290081,0.149071,0.285714,0.792208
7,0.0,0.411765,0.530084,0.362209,0.204124,0.333333,0.828571
10,0.0,0.200000,0.431476,0.268328,0.000000,0.200000,0.925000
15,0.0,0.380952,0.472186,0.360060,0.000000,0.380952,0.700000
19,0.0,0.428571,0.513111,0.341096,0.314270,0.428571,0.631818
...,...,...,...,...,...,...,...
2252,1.0,0.588235,0.690299,0.596285,0.526235,0.588235,0.833333
2108,1.0,0.578947,0.697770,0.643097,0.231455,0.578947,0.901786
1436,1.0,0.764706,0.821609,0.760024,0.385758,0.764706,0.883929
3180,1.0,0.733333,0.822222,0.875000,0.500000,0.733333,1.000000


## N-Grams Features:

In [251]:
from nltk.util import ngrams 
def produce_ngrams(token,n):
    output = list(ngrams(token, n))
    return output
def produce_bigrams(token,n=2):
    output = list(ngrams(token, n))
    return output
def produce_trigrams(token,n=3):
    output = list(ngrams(token, n))
    return output
def produce_quadgrams(token,n=4):
    output = list(ngrams(token, n))
    return output
def jaccard_distance_ngrams(a, b):
    """Calculate the jaccard distance between sets A and B"""
    a = set(a)
    b = set(b)
    return 1.0 * len(a&b)/len(a|b)
def cosine_similarity_ngrams(a, b):
    vec1 = Counter(a)
    vec2 = Counter(b)
    
    intersection = set(vec1.keys()) & set(vec2.keys())
    numerator = sum([vec1[x] * vec2[x] for x in intersection])

    sum1 = sum([vec1[x]**2 for x in vec1.keys()])
    sum2 = sum([vec2[x]**2 for x in vec2.keys()])
    denominator = math.sqrt(sum1) * math.sqrt(sum2)

    if not denominator:
        return 0.0
    return float(numerator) / denominator
def ngram_similarity(a,b):
    vec1 = Counter(a)
    vec2 = Counter(b)
    intersection = set(vec1.keys()) & set(vec2.keys())
    numerator = len(intersection)
    denominator = len(vec1) + len(vec2)
    return numerator/denominator

In [275]:
dfs = [train_df,new_train_df,new_train_df_clean,subset0,subset1,train_df2,subset0_2,subset1_2,new_train_df_2,new_train_df_clean_2,missing_train,missing,new_train_df_clean_Missing,new_train_df_Missing,dev_df,new_train_df_clean_randomUnder,new_train_df_2_clean_randomUnder,missing_clean_randomUnder, train_df3,new_train_df_3,new_train_df_clean_3,new_train_df_3_clean_randomUnder]
for df in dfs:
    df['bigrams1'] = list(map(produce_bigrams, df.lemmatized_text1))
    df['bigrams2'] = list(map(produce_bigrams, df.lemmatized_text2))
    df['jaccard_distance_bigrams'] = list(map(jaccard_distance_ngrams, df.bigrams1, df.bigrams2))
    df['cosine_similarity_bigrams'] = list(map(cosine_similarity_ngrams, df.bigrams1, df.bigrams2))
    df['bigram_similarity'] = list(map(ngram_similarity, df.bigrams1, df.bigrams2))
    
    df['trigrams1'] = list(map(produce_trigrams, df.lemmatized_text1))
    df['trigrams2'] = list(map(produce_trigrams, df.lemmatized_text2))
    df['jaccard_distance_trigrams'] = list(map(jaccard_distance_ngrams, df.trigrams1, df.trigrams2))
    df['cosine_similarity_trigrams'] = list(map(cosine_similarity_ngrams, df.trigrams1, df.trigrams2))
    df['trigram_similarity'] = list(map(ngram_similarity, df.trigrams1, df.trigrams2))
    
    df['quadgrams1'] = list(map(produce_quadgrams, df.lemmatized_text1))
    df['quadgrams2'] = list(map(produce_quadgrams, df.lemmatized_text2))
    df['jaccard_distance_quadgrams'] = list(map(jaccard_distance_ngrams, df.quadgrams1, df.quadgrams2))
    df['cosine_similarity_quadgrams'] = list(map(cosine_similarity_ngrams, df.quadgrams1, df.quadgrams2))
    df['quadgram_similarity'] = list(map(ngram_similarity, df.quadgrams1, df.quadgrams2))
    
    df.drop(['bigrams1', 'bigrams2', 'trigrams1', 'trigrams2', 'quadgrams1', 'quadgrams2'], axis=1, inplace=True)

## Adjustments to Datasets:

In [276]:
# Inverting overlap and overlap2 scores:
dfs = [train_df,new_train_df,new_train_df_clean,subset0,subset1,train_df2,subset0_2,subset1_2,new_train_df_2,new_train_df_clean_2,missing_train,missing,new_train_df_clean_Missing,new_train_df_Missing,dev_df,new_train_df_clean_randomUnder,new_train_df_2_clean_randomUnder,missing_clean_randomUnder, train_df3,new_train_df_3,new_train_df_clean_3,new_train_df_3_clean_randomUnder]
for df in dfs:
    df['overlap_score'] = list(map(overlap_score, df.Text_Cleaned1, df.Text_Cleaned2))
    df['overlap2_score'] = list(map(overlap2_score, df.Text_Cleaned1, df.Text_Cleaned2))
    df['overlap_score'] = 1 - df['overlap_score']
    df['overlap2_score'] = 1 - df['overlap2_score']
    #df.drop(['length_similarity', 'cosine_similarity_score2'], axis=1, inplace=True)
    #df.drop(['jaccard_distance_bigrams','cosine_similarity_bigrams','bigram_similarity','jaccard_distance_trigrams','cosine_similarity_trigrams','trigram_similarity','jaccard_distance_quadgrams','cosine_similarity_quadgrams','quadgram_similarity'], axis=1, inplace=True)

In [280]:
# Normalizing Semantic Features:
for df in dfs:
    min_max_scaler = preproc.MinMaxScaler()
    df[['overall_similarity_path_semantic', 'overall_similarity_wup_semantic', 'overall_similarity_combined_semantic']] = min_max_scaler.fit_transform(df[['overall_similarity_path_semantic', 'overall_similarity_wup_semantic', 'overall_similarity_combined_semantic']])

### Development Dataset Pre-Processing:

In [269]:
dev_df = pd.read_csv("../Data/dev_with_label.txt", delimiter = "r'\t", header = None, engine = 'python')
dev_df = dev_df[0].str.split("\t", expand=True)
dev_df = dev_df.rename(columns={0: "id", 1: "sentence1", 2: "sentence2", 3: "classification"})
dev_df["classification"] = pd.to_numeric(dev_df["classification"])
dev_df.drop_duplicates(inplace = True)
dev_df

lemm = nltk.stem.WordNetLemmatizer()

#Syntactic Features:
dev_df['Text_Cleaned1'] = list(map(clean_text, dev_df.sentence1))
dev_df['lemmatized_text1'] = list(map(lambda word:list(map(lemm.lemmatize, word)),dev_df.Text_Cleaned1))
dev_df['Text_Cleaned2'] = list(map(clean_text, dev_df.sentence2))
dev_df['lemmatized_text2'] = list(map(lambda word:list(map(lemm.lemmatize, word)),dev_df.Text_Cleaned2))
dev_df['cosine_similarity_score'] = list(map(counter_cosine_similarity, dev_df.Text_Cleaned1, dev_df.Text_Cleaned2))
dev_df['length_similarity'] = list(map(length_similarity, dev_df.Text_Cleaned1, dev_df.Text_Cleaned2))
dev_df['overlap_score'] = list(map(overlap_score, dev_df.Text_Cleaned1, dev_df.Text_Cleaned2))
dev_df['overlap2_score'] = list(map(overlap2_score, dev_df.Text_Cleaned1, dev_df.Text_Cleaned2))
dev_df['cosine/length_ratio'] = list(map(similarity_score, dev_df.length_similarity, dev_df.cosine_similarity_score))
dev_df['cosine_similarity_score2'] = list(map(compute_cosine_similarity, dev_df.Text_Cleaned1, dev_df.Text_Cleaned2))
dev_df['jaccard_similarity_score'] = list(map(compute_jaccard_similarity, dev_df.Text_Cleaned1, dev_df.Text_Cleaned2))
dev_df['lemma_jaccard_score'] = list(map(compute_lemma_jaccard_similarity, dev_df.lemmatized_text1, dev_df.lemmatized_text2))
dev_df['overall_sim_score'] = list(map(overall_similarity_combined, dev_df.Text_Cleaned1, dev_df.Text_Cleaned2, dev_df.lemmatized_text1, dev_df.lemmatized_text2))

In [271]:
dev_df['scores'] = list(map(semantic_similarities, dev_df.lemmatized_text1, dev_df.lemmatized_text2))
dev_df[['overall_similarity_path_semantic', 'overall_similarity_wup_semantic', 'overall_similarity_combined_semantic']] = pd.DataFrame(dev_df.scores.tolist(), index= dev_df.index)
dev_df.drop(['scores'], axis=1, inplace=True)
dev_df

Unnamed: 0,id,sentence1,sentence2,classification,Text_Cleaned1,lemmatized_text1,Text_Cleaned2,lemmatized_text2,cosine_similarity_score,length_similarity,overlap_score,overlap2_score,cosine/length_ratio,cosine_similarity_score2,jaccard_similarity_score,lemma_jaccard_score,overall_sim_score,overall_similarity_path_semantic,overall_similarity_wup_semantic,overall_similarity_combined_semantic
0,dev_id_0,Local police authorities are treating the expl...,Acting New Haven Police Chief Francisco Ortiz ...,0,"[local, police, authorities, treating, explosi...","[local, police, authority, treating, explosion...","[acting, new, police, chief, francisco, ortiz,...","[acting, new, police, chief, francisco, ortiz,...",0.534522,0.750000,0.138889,0.238095,0.400892,0.890871,0.333333,0.333333,0.519179,0.584580,0.482766,0.526301
1,dev_id_1,The report shows that drugs sold in Canadian p...,The report shows that drugs sold in Canadian p...,1,"[report, shows, drugs, sold, canadian, pharmac...","[report, show, drug, sold, canadian, pharmacy,...","[report, shows, drugs, sold, canadian, pharmac...","[report, show, drug, sold, canadian, pharmacy,...",0.802955,0.823529,0.151261,0.387097,0.661257,0.860309,0.705882,0.705882,0.757358,0.855122,0.803763,0.828098
2,dev_id_2,The transition is slated to begin no later tha...,A two-week transition period will begin no lat...,1,"[transition, slated, begin, later, june, 7, da...","[transition, slated, begin, later, june, 7, da...","[two, week, transition, period, begin, later, ...","[two, week, transition, period, begin, later, ...",0.625000,1.000000,0.000000,0.312500,0.625000,1.000000,0.454545,0.454545,0.636364,0.743750,0.684524,0.713095
3,dev_id_3,"Like Viacom , GE -- parent of NBC -- is also s...","Like Viacom , General Electric is seen as a le...",1,"[like, viacom, ge, parent, nbc, also, seen, le...","[like, viacom, ge, parent, nbc, also, seen, le...","[like, viacom, general, electric, seen, less, ...","[like, viacom, general, electric, seen, le, en...",0.725241,0.785714,0.175325,0.360000,0.569832,0.886405,0.562500,0.600000,0.682968,0.796260,0.749333,0.770524
4,dev_id_4,"Last month , 62 Spanish peacekeepers died when...","In another disaster , 62 Spanish peacekeepers ...",1,"[last, month, 62, spanish, peacekeepers, died,...","[last, month, 62, spanish, peacekeeper, died, ...","[another, disaster, 62, spanish, peacekeepers,...","[another, disaster, 62, spanish, peacekeeper, ...",0.585369,0.846154,0.097902,0.291667,0.495313,0.919866,0.411765,0.411765,0.581132,0.680159,0.603472,0.641237
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
719,dev_id_719,"He is a brother to three-year-old Mia , from K...","Winslet , 28 , has a three-year-old daughter M...",0,"[brother, three, year, old, mia, kate, first, ...","[brother, three, year, old, mia, kate, first, ...","[winslet, 28, three, year, old, daughter, mia,...","[winslet, 28, three, year, old, daughter, mia,...",0.694365,0.857143,0.107143,0.346154,0.595170,0.925820,0.529412,0.529412,0.661548,0.638645,0.598718,0.614469
720,dev_id_720,Some 175 million shares traded on the Big Boar...,Some 1.6 billion shares traded on the Big Boar...,0,"[175, million, shares, traded, big, board, 7, ...","[175, million, share, traded, big, board, 7, p...","[1, 6, billion, shares, traded, big, board, 17...","[1, 6, billion, share, traded, big, board, 17,...",0.462910,0.857143,0.071429,0.230769,0.396780,0.925820,0.300000,0.300000,0.508607,0.720496,0.592949,0.656723
721,dev_id_721,Mr Berlusconi is accused of bribing judges to ...,Mr Berlusconi is accused of bribing judges to ...,1,"[mr, berlusconi, accused, bribing, judges, inf...","[mr, berlusconi, accused, bribing, judge, infl...","[mr, berlusconi, accused, bribing, judges, inf...","[mr, berlusconi, accused, bribing, judge, infl...",0.716115,0.866667,0.102564,0.357143,0.620633,0.930949,0.555556,0.555556,0.680687,0.717347,0.706633,0.711990
722,dev_id_722,"He added that those "" are not solely American ...",""" These are not solely American principles nor...",1,"[added, solely, american, principles, exclusiv...","[added, solely, american, principle, exclusive...","[solely, american, principles, exclusively, we...","[solely, american, principle, exclusively, wes...",0.771517,0.857143,0.119048,0.384615,0.661300,0.925820,0.625000,0.625000,0.725273,0.813187,0.794872,0.804029


In [272]:
# dev_df['overlap_score'] = list(map(overlap_score, dev_df.Text_Cleaned1, dev_df.Text_Cleaned2))
# dev_df['overlap2_score'] = list(map(overlap2_score, dev_df.Text_Cleaned1, dev_df.Text_Cleaned2))
# dev_df['overlap_score'] = 1 - dev_df['overlap_score']
# dev_df['overlap2_score'] = 1 - dev_df['overlap2_score']
# dev_df.drop(['length_similarity', 'cosine_similarity_score2'], axis=1, inplace=True)
# dev_df

Unnamed: 0,id,sentence1,sentence2,classification,Text_Cleaned1,lemmatized_text1,Text_Cleaned2,lemmatized_text2,cosine_similarity_score,overlap_score,overlap2_score,cosine/length_ratio,jaccard_similarity_score,lemma_jaccard_score,overall_sim_score,overall_similarity_path_semantic,overall_similarity_wup_semantic,overall_similarity_combined_semantic
0,dev_id_0,Local police authorities are treating the expl...,Acting New Haven Police Chief Francisco Ortiz ...,0,"[local, police, authorities, treating, explosi...","[local, police, authority, treating, explosion...","[acting, new, police, chief, francisco, ortiz,...","[acting, new, police, chief, francisco, ortiz,...",0.534522,0.861111,0.761905,0.400892,0.333333,0.333333,0.519179,0.584580,0.482766,0.526301
1,dev_id_1,The report shows that drugs sold in Canadian p...,The report shows that drugs sold in Canadian p...,1,"[report, shows, drugs, sold, canadian, pharmac...","[report, show, drug, sold, canadian, pharmacy,...","[report, shows, drugs, sold, canadian, pharmac...","[report, show, drug, sold, canadian, pharmacy,...",0.802955,0.848739,0.612903,0.661257,0.705882,0.705882,0.757358,0.855122,0.803763,0.828098
2,dev_id_2,The transition is slated to begin no later tha...,A two-week transition period will begin no lat...,1,"[transition, slated, begin, later, june, 7, da...","[transition, slated, begin, later, june, 7, da...","[two, week, transition, period, begin, later, ...","[two, week, transition, period, begin, later, ...",0.625000,1.000000,0.687500,0.625000,0.454545,0.454545,0.636364,0.743750,0.684524,0.713095
3,dev_id_3,"Like Viacom , GE -- parent of NBC -- is also s...","Like Viacom , General Electric is seen as a le...",1,"[like, viacom, ge, parent, nbc, also, seen, le...","[like, viacom, ge, parent, nbc, also, seen, le...","[like, viacom, general, electric, seen, less, ...","[like, viacom, general, electric, seen, le, en...",0.725241,0.824675,0.640000,0.569832,0.562500,0.600000,0.682968,0.796260,0.749333,0.770524
4,dev_id_4,"Last month , 62 Spanish peacekeepers died when...","In another disaster , 62 Spanish peacekeepers ...",1,"[last, month, 62, spanish, peacekeepers, died,...","[last, month, 62, spanish, peacekeeper, died, ...","[another, disaster, 62, spanish, peacekeepers,...","[another, disaster, 62, spanish, peacekeeper, ...",0.585369,0.902098,0.708333,0.495313,0.411765,0.411765,0.581132,0.680159,0.603472,0.641237
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
719,dev_id_719,"He is a brother to three-year-old Mia , from K...","Winslet , 28 , has a three-year-old daughter M...",0,"[brother, three, year, old, mia, kate, first, ...","[brother, three, year, old, mia, kate, first, ...","[winslet, 28, three, year, old, daughter, mia,...","[winslet, 28, three, year, old, daughter, mia,...",0.694365,0.892857,0.653846,0.595170,0.529412,0.529412,0.661548,0.638645,0.598718,0.614469
720,dev_id_720,Some 175 million shares traded on the Big Boar...,Some 1.6 billion shares traded on the Big Boar...,0,"[175, million, shares, traded, big, board, 7, ...","[175, million, share, traded, big, board, 7, p...","[1, 6, billion, shares, traded, big, board, 17...","[1, 6, billion, share, traded, big, board, 17,...",0.462910,0.928571,0.769231,0.396780,0.300000,0.300000,0.508607,0.720496,0.592949,0.656723
721,dev_id_721,Mr Berlusconi is accused of bribing judges to ...,Mr Berlusconi is accused of bribing judges to ...,1,"[mr, berlusconi, accused, bribing, judges, inf...","[mr, berlusconi, accused, bribing, judge, infl...","[mr, berlusconi, accused, bribing, judges, inf...","[mr, berlusconi, accused, bribing, judge, infl...",0.716115,0.897436,0.642857,0.620633,0.555556,0.555556,0.680687,0.717347,0.706633,0.711990
722,dev_id_722,"He added that those "" are not solely American ...",""" These are not solely American principles nor...",1,"[added, solely, american, principles, exclusiv...","[added, solely, american, principle, exclusive...","[solely, american, principles, exclusively, we...","[solely, american, principle, exclusively, wes...",0.771517,0.880952,0.615385,0.661300,0.625000,0.625000,0.725273,0.813187,0.794872,0.804029


## Exporting Pre-Processed Data Files:

In [361]:
train_df.to_csv('../Data2/train_df.csv', index = False)
new_train_df.to_csv('../Data2/train_df_randomUnder.csv', index = False)
new_train_df_clean.to_csv('../Data2/train_df_no_outliers.csv', index = False)
subset0.to_csv('../Data2/subset0.csv', index = False)
subset1.to_csv('../Data2/subset1.csv', index = False)
train_df2.to_csv('../Data2/train_df2.csv', index = False)
subset0_2.to_csv('../Data2/subset0_2.csv', index = False)
subset1_2.to_csv('../Data2/subset1_2.csv', index = False)
new_train_df_2.to_csv('../Data2/train_df2_randomUnder.csv', index = False)
new_train_df_clean_2.to_csv('../Data2/train_df2_no_outliers.csv', index = False)
missing_train.to_csv('../Data2/missing_train.csv', index = False)
missing.to_csv('../Data2/Missing.csv', index = False)
new_train_df_clean_Missing.to_csv('../Data2/missing_train_df_no_outliers.csv', index = False)
new_train_df_Missing.to_csv('../Data2/missing_train_df_randomUnder.csv', index = False)
dev_df.to_csv('../Data2/dev_df.csv', index = False)

new_train_df_clean_randomUnder.to_csv('../Data2/train_df_no_outliers_randomUnder.csv', index = False)
new_train_df_2_clean_randomUnder.to_csv('../Data2/train_df2_no_outliers_randomUnder.csv', index = False)
missing_clean_randomUnder.to_csv('../Data2/missing_train_df_no_outliers_randomUnder.csv', index = False)
train_df3.to_csv('../Data2/train_df_3.csv', index = False)
new_train_df_3.to_csv('../Data2/train_df3_randomUnder.csv', index = False)
new_train_df_clean_3.to_csv('../Data2/train_df3_no_outliers.csv', index = False)
new_train_df_3_clean_randomUnder.to_csv('../Data2/train_df3_no_outliers_randomUnder.csv', index = False)

In [312]:
train_df = pd.read_csv('../Data/train_df.csv')
new_train_df = pd.read_csv('../Data/train_df_randomUnder.csv')
new_train_df_clean = pd.read_csv('../Data/train_df_no_outliers.csv')
subset0 = pd.read_csv('../Data/subset0.csv')
subset1 = pd.read_csv('../Data/subset1.csv')
train_df2 = pd.read_csv('../Data/train_df2.csv')
subset0_2 = pd.read_csv('../Data/subset0_2.csv')
subset1_2 = pd.read_csv('../Data/subset1_2.csv')
new_train_df_2 = pd.read_csv('../Data/train_df2_randomUnder.csv')
new_train_df_clean_2 = pd.read_csv('../Data/train_df2_no_outliers.csv')
missing_train = pd.read_csv('../Data/missing_train.csv')
new_train_df_clean_Missing = pd.read_csv('../Data/missing_train_df_no_outliers.csv')
new_train_df_Missing = pd.read_csv('../Data/missing_train_df_randomUnder.csv')
dev_df = pd.read_csv('../Data/dev_df.csv')

new_train_df_clean_randomUnder = pd.read_csv('../Data/train_df_no_outliers_randomUnder.csv')
new_train_df_2_clean_randomUnder = pd.read_csv('../Data/train_df2_no_outliers_randomUnder.csv')
missing_clean_randomUnder = pd.read_csv('../Data/missing_train_df_no_outliers_randomUnder.csv')
train_df3 = pd.read_csv('../Data/train_df_3.csv')
new_train_df_3 = pd.read_csv('../Data/train_df3_randomUnder.csv')
new_train_df_clean_3 = pd.read_csv('../Data/train_df3_no_outliers.csv')
new_train_df_3_clean_randomUnder = pd.read_csv('../Data/train_df3_no_outliers_randomUnder.csv')

In [314]:
train_df = train_df[['classification','lemma_jaccard_score', 'overall_sim_score', 'cosine/length_ratio', 'cosine_similarity_trigrams', 'jaccard_similarity_score', 'overlap_score']]
new_train_df = new_train_df[['classification','lemma_jaccard_score', 'overall_sim_score', 'cosine/length_ratio', 'cosine_similarity_trigrams', 'jaccard_similarity_score', 'overlap_score']]
new_train_df_clean = new_train_df_clean[['classification','lemma_jaccard_score', 'overall_sim_score', 'cosine/length_ratio', 'cosine_similarity_trigrams', 'jaccard_similarity_score', 'overlap_score']]
subset0 = subset0[['classification','lemma_jaccard_score', 'overall_sim_score', 'cosine/length_ratio', 'cosine_similarity_trigrams', 'jaccard_similarity_score', 'overlap_score']]
subset1 = subset1[['classification','lemma_jaccard_score', 'overall_sim_score', 'cosine/length_ratio', 'cosine_similarity_trigrams', 'jaccard_similarity_score', 'overlap_score']]
train_df2 = train_df2[['classification','lemma_jaccard_score', 'overall_sim_score', 'cosine/length_ratio', 'cosine_similarity_trigrams', 'jaccard_similarity_score', 'overlap_score']]
subset0_2 = subset0_2[['classification','lemma_jaccard_score', 'overall_sim_score', 'cosine/length_ratio', 'cosine_similarity_trigrams', 'jaccard_similarity_score', 'overlap_score']]
subset1_2 = subset1_2[['classification','lemma_jaccard_score', 'overall_sim_score', 'cosine/length_ratio', 'cosine_similarity_trigrams', 'jaccard_similarity_score', 'overlap_score']]
new_train_df_2 = new_train_df_2[['classification','lemma_jaccard_score', 'overall_sim_score', 'cosine/length_ratio', 'cosine_similarity_trigrams', 'jaccard_similarity_score', 'overlap_score']]
new_train_df_clean_2 = new_train_df_clean_2[['classification','lemma_jaccard_score', 'overall_sim_score', 'cosine/length_ratio', 'cosine_similarity_trigrams', 'jaccard_similarity_score', 'overlap_score']]
missing_train = missing_train[['classification','lemma_jaccard_score', 'overall_sim_score', 'cosine/length_ratio', 'cosine_similarity_trigrams', 'jaccard_similarity_score', 'overlap_score']]
new_train_df_clean_Missing = new_train_df_clean_Missing[['classification','lemma_jaccard_score', 'overall_sim_score', 'cosine/length_ratio', 'cosine_similarity_trigrams', 'jaccard_similarity_score', 'overlap_score']]
new_train_df_Missing = new_train_df_Missing[['classification','lemma_jaccard_score', 'overall_sim_score', 'cosine/length_ratio', 'cosine_similarity_trigrams', 'jaccard_similarity_score', 'overlap_score']]
dev_df = dev_df[['classification','lemma_jaccard_score', 'overall_sim_score', 'cosine/length_ratio', 'cosine_similarity_trigrams', 'jaccard_similarity_score', 'overlap_score']]

new_train_df_clean_randomUnder = new_train_df_clean_randomUnder[['classification','lemma_jaccard_score', 'overall_sim_score', 'cosine/length_ratio', 'cosine_similarity_trigrams', 'jaccard_similarity_score', 'overlap_score']]
new_train_df_2_clean_randomUnder = new_train_df_2_clean_randomUnder[['classification','lemma_jaccard_score', 'overall_sim_score', 'cosine/length_ratio', 'cosine_similarity_trigrams', 'jaccard_similarity_score', 'overlap_score']]
missing_clean_randomUnder = missing_clean_randomUnder[['classification','lemma_jaccard_score', 'overall_sim_score', 'cosine/length_ratio', 'cosine_similarity_trigrams', 'jaccard_similarity_score', 'overlap_score']]
train_df3 = train_df3[['classification','lemma_jaccard_score', 'overall_sim_score', 'cosine/length_ratio', 'cosine_similarity_trigrams', 'jaccard_similarity_score', 'overlap_score']]
new_train_df_3 = new_train_df_3[['classification','lemma_jaccard_score', 'overall_sim_score', 'cosine/length_ratio', 'cosine_similarity_trigrams', 'jaccard_similarity_score', 'overlap_score']]
new_train_df_clean_3 = new_train_df_clean_3[['classification','lemma_jaccard_score', 'overall_sim_score', 'cosine/length_ratio', 'cosine_similarity_trigrams', 'jaccard_similarity_score', 'overlap_score']]
new_train_df_3_clean_randomUnder = new_train_df_3_clean_randomUnder[['classification','lemma_jaccard_score', 'overall_sim_score', 'cosine/length_ratio', 'cosine_similarity_trigrams', 'jaccard_similarity_score', 'overlap_score']]

In [363]:
train_df.columns

Index(['classification', 'lemma_jaccard_score', 'overall_sim_score',
       'cosine/length_ratio', 'cosine_similarity_trigrams',
       'jaccard_similarity_score', 'overlap_score'],
      dtype='object')

In [365]:
type(train_df['lemmatized_text1'][0])

list

In [366]:
train_df

Unnamed: 0,id,sentence1,sentence2,classification,Text_Cleaned1,lemmatized_text1,Text_Cleaned2,lemmatized_text2,cosine_similarity_score,length_similarity,overlap_score,overlap2_score,cosine/length_ratio,cosine_similarity_score2,jaccard_similarity_score,lemma_jaccard_score,overall_sim_score
0,train_id_0,The Democratic candidates also began announcin...,The Democratic candidates also began announcin...,1,"[democratic, candidates, also, began, announci...","[democratic, candidate, also, began, announcin...","[democratic, candidates, also, began, announci...","[democratic, candidate, also, began, announcin...",0.909509,0.941176,0.055147,0.454545,0.856008,0.970143,0.833333,0.833333,0.878936
1,train_id_1,The woman was exposed to the SARS virus while ...,The woman was exposed to the SARS virus while ...,1,"[woman, exposed, sars, virus, hospital, health...","[woman, exposed, sars, virus, hospital, health...","[woman, exposed, sars, virus, hospital, health...","[woman, exposed, sars, virus, hospital, health...",0.904534,0.888889,0.111111,0.470588,0.804030,0.954786,0.937500,0.937500,0.943262
2,train_id_2,He said the problem needs to be corrected befo...,He said the prob lem needs to be corrected bef...,1,"[said, problem, needs, corrected, space, shutt...","[said, problem, need, corrected, space, shuttl...","[said, prob, lem, needs, corrected, space, shu...","[said, prob, lem, need, corrected, space, shut...",0.666667,1.000000,0.000000,0.333333,0.666667,1.000000,0.500000,0.636364,0.712121
3,train_id_3,A representative for Phoenix-based U-Haul decl...,"Anthony Citrano , a representative for WhenU ,...",0,"[representative, phoenix, based, u, haul, decl...","[representative, phoenix, based, u, haul, decl...","[anthony, citrano, representative, whenu, decl...","[anthony, citrano, representative, whenu, decl...",0.455842,0.636364,0.207792,0.222222,0.290081,0.797724,0.285714,0.285714,0.456384
4,train_id_4,The biggest threat to order seemed to be looti...,The biggest threat to order seemed to be looti...,1,"[biggest, threat, order, seemed, looting, crim...","[biggest, threat, order, seemed, looting, crim...","[biggest, threat, order, seemed, looting, crim...","[biggest, threat, order, seemed, looting, crim...",0.721688,0.750000,0.208333,0.357143,0.541266,0.866025,0.555556,0.555556,0.659046
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4072,train_id_4072,"Axelrod died in his sleep of heart failure , s...",Axelrod died of heart failure while asleep at ...,1,"[axelrod, died, sleep, heart, failure, said, d...","[axelrod, died, sleep, heart, failure, said, d...","[axelrod, died, heart, failure, asleep, los, a...","[axelrod, died, heart, failure, asleep, los, a...",0.805823,0.750000,0.222222,0.380952,0.604367,0.886405,0.583333,0.583333,0.684357
4073,train_id_4073,"Saddam 's other son , Odai , surrendered Frida...","Hussein 's other son , Uday , surrendered yest...",1,"[saddam, son, odai, surrendered, friday, ameri...","[saddam, son, odai, surrendered, friday, ameri...","[hussein, son, uday, surrendered, yesterday, a...","[hussein, son, uday, surrendered, yesterday, a...",0.600000,1.000000,0.000000,0.300000,0.600000,1.000000,0.428571,0.538462,0.655678
4074,train_id_4074,If Senator Clinton does decide to run in 2008 ...,If Mrs Clinton does decide to contest the 2008...,1,"[senator, clinton, decide, run, 2008, cannot, ...","[senator, clinton, decide, run, 2008, cannot, ...","[mrs, clinton, decide, contest, 2008, election...","[mr, clinton, decide, contest, 2008, election,...",0.819892,0.923077,0.064103,0.400000,0.756823,0.968963,0.714286,0.714286,0.799178
4075,train_id_4075,"The Iranian refugee who sewed up his eyes , li...","An Iranian Kurd who stitched up his eyes , lip...",1,"[iranian, refugee, sewed, eyes, lips, ears, pr...","[iranian, refugee, sewed, eye, lip, ear, prote...","[iranian, kurd, stitched, eyes, lips, ears, pr...","[iranian, kurd, stitched, eye, lip, ear, prote...",0.560449,0.923077,0.044872,0.280000,0.517337,0.960769,0.388889,0.388889,0.579516
