In [442]:
# mount to google drive
# from google.colab import drive
# drive.mount('/content/gdrive')

import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

from sklearn import svm
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from scipy.spatial import distance

Mounted at /content/gdrive


In [444]:


def readIn(filePath, train):
  with open(filePath) as f:
      lines = f.readlines()
  

  e = pd.DataFrame({"sample": lines})


  e["id"] = e["sample"].str.split('\t').str[0]
  e["sentence_1"] = e["sample"].str.split('\t').str[1]
  e["sentence_2"] = e["sample"].str.split('\t').str[2]

  # PUNCTUATION
  e["sentence_1"] = e["sentence_1"].str.replace(r'[^\w\s]', '', regex=True)
  e["sentence_2"] = e["sentence_2"].str.replace(r'[^\w\s]', '', regex=True)

  # STRIP
  e["sentence_1"] = e["sentence_1"].str.strip()
  e["sentence_2"] = e["sentence_2"].str.strip()

  if(train):
    e["golden_label"] = e["sample"].str.split('\t').str[3]
    e["golden_label"] = e["golden_label"].str.split('\n').str[0]
    e["golden_label"] = pd.to_numeric(e["golden_label"])
    e = e.filter(['id','sentence_1', 'sentence_2', 'golden_label'], axis=1)
    return e

  e = e.filter(['id','sentence_1', 'sentence_2'], axis=1)
  return e


trainS = readIn('/content/sample_data/midterm/train_with_label.txt', True)
devS = readIn('/content/sample_data/midterm/dev_with_label.txt', True)
testS = readIn('/content/sample_data/midterm/test_without_label.txt', False)


In [445]:

# https://nikoskalikis.medium.com/text-similarity-euclidian-distance-vs-cosine-similarity-3a1167f686a
# Euclidean Distance
def euclideanDistance(sentence1, sentence2):

    texts = [sentence1, sentence2]
    count_vectorizer = CountVectorizer(stop_words='english')
    count_vectorizer = CountVectorizer()
    matrix = count_vectorizer.fit_transform(texts)

    table = matrix.todense()
    df = pd.DataFrame(table, 
                      columns=count_vectorizer.get_feature_names(), 
                      index=['sentence_1', 'sentence_2'])
    
    dist = distance.euclidean(df.iloc[0].to_list(), df.iloc[1].to_list())

    return dist

# Cosine similarity
def cosineSim(sentence1, sentence2):
  texts = [sentence1, sentence2]
  count_vectorizer = CountVectorizer(stop_words='english')
  count_vectorizer = CountVectorizer()
  matrix = count_vectorizer.fit_transform(texts)

  table = matrix.todense()
  df = pd.DataFrame(table, columns=count_vectorizer.get_feature_names(), index=['sentence_1', 'sentence_2'])
  
  values = cosine_similarity(df, df)
  return values[0][1]


# Containment value based on n-gram value
def calculateContainment(sentence1, sentence2, n):

    counts = CountVectorizer(analyzer='word', ngram_range=(n,n))
    ngrams = counts.fit_transform([sentence1, sentence2])
    ngram_array = ngrams.toarray()
    intersection_list = np.amin(ngram_array, axis=0)
    intersection = np.sum(intersection_list)

    answer_idx = 0
    answer_cnt = np.sum(ngram_array[answer_idx])

    containment_val =  intersection / answer_cnt

    return containment_val


# LCS
def lcsNorm(sentence1, sentence2):
    a_text = sentence1.split()
    s_text = sentence2.split()
    
    n = len(a_text)
    m = len(s_text)
    
    # create an m x n matrix
    matrix_lcs = np.zeros((m+1,n+1), dtype=int)
    
    # iterate through each word in the source text looking for a match against the answer text
    for i, s_word in enumerate(s_text, start=1):
        for j, a_word in enumerate(a_text, start=1):
            # match: diagonal addition
            if a_word == s_word:
                matrix_lcs[i][j] = matrix_lcs[i-1][j-1] + 1
            else:
            # no match: max of top/left values
                matrix_lcs[i][j] = max(matrix_lcs[i-1][j], matrix_lcs[i][j-1])
    
    # normalize lcs = (last value in the m x n matrix) / (length of the answer text)
    normalized_lcs = matrix_lcs[m][n] / n

    return normalized_lcs

# Minimum editing distance
def minimumEditDistance(sentence1, sentence2):
    lensum = float(len(sentence1) + len(sentence2))
    if len(sentence1) > len(sentence2):
        sentence1, sentence2 = sentence2, sentence1
    distances = range(len(sentence1) + 1)
    for index2, char2 in enumerate(sentence2):
        # sentence2 > sentence1
        newDistances = [index2 + 1]
        for index1, char1 in enumerate(sentence1):
            if char1 == char2:
                newDistances.append(distances[index1])
            else:
                newDistances.append(1 + min((distances[index1],  # delete
                                             distances[index1 + 1],  # insert
                                             newDistances[-1])))  # exchange
        distances = newDistances
    mindist = distances[-1]
    ratio = (lensum - mindist) / lensum
    return ratio

# https://towardsdatascience.com/identifying-duplicate-questions-on-quora-top-12-on-kaggle-4c1cf93f1c30
# words shared between two sentences
def normalized_word_common(row):
    w1 = set(map(lambda word: word.lower().strip(), row['sentence_1'].split(" ")))
    w2 = set(map(lambda word: word.lower().strip(), row['sentence_2'].split(" ")))    
    return 1.0 * len(w1 & w2)

# words in total between two sentences
def normalized_word_total(row):
    w1 = set(map(lambda word: word.lower().strip(), row['sentence_1'].split(" ")))
    w2 = set(map(lambda word: word.lower().strip(), row['sentence_2'].split(" ")))    
    return 1.0 * (len(w1) + len(w2))

# words shared / total words 
def normalized_word_share(row):
    w1 = set(map(lambda word: word.lower().strip(), row['sentence_1'].split(" ")))
    w2 = set(map(lambda word: word.lower().strip(), row['sentence_2'].split(" ")))    
    return 1.0 * len(w1 & w2)/(len(w1) + len(w2))



In [446]:
def createFeatures(data_f, train):
  features = pd.DataFrame()
  if(train):
    features["golden_label"] = data_f["golden_label"]

  features['sentence1_len'] = data_f.apply(lambda x: len(x.sentence_1), axis=1)
  features['sentence2_len'] = data_f.apply(lambda x: len(x.sentence_2), axis=1)
  features['sentence_len_diff'] = data_f.apply(lambda x: abs(len(x.sentence_1)-len(x.sentence_2)), axis=1)

  features['sentence1_words'] = data_f['sentence_1'].apply(lambda row: len(row.split(" ")))
  features['sentence2_words'] = data_f['sentence_2'].apply(lambda row: len(row.split(" ")))

  features['word_common'] = data_f.apply(lambda x: normalized_word_common(x), axis=1)
  features['word_total'] = data_f.apply(lambda x: normalized_word_total(x), axis=1)
  features['word_share'] = features.apply(lambda x: x.word_common / x.word_total, axis=1)

  features['minimumED'] = data_f.apply(lambda x: minimumEditDistance(x.sentence_1, x.sentence_2), axis=1)
  features['euclid_dist'] = data_f.apply(lambda x: euclideanDistance(x.sentence_1, x.sentence_2), axis=1)
  features['cos_sim'] = data_f.apply(lambda x: cosineSim(x.sentence_1, x.sentence_2), axis=1)
  features['lcsNorm'] = data_f.apply(lambda x: lcsNorm(x.sentence_1, x.sentence_2), axis=1)


  n_range = 6
  for i in range(1, n_range):
      column_name = "containment_" + str(i)
      print(column_name)
      features[column_name] = data_f.apply(lambda x: calculateContainment(x.sentence_1, x.sentence_2, i), axis=1)
  return features



In [447]:
# Create features and display
trainFeatures = createFeatures(trainS, True)
devFeatures = createFeatures(devS, True)
testFeatures = createFeatures(testS, False)

display(trainFeatures)
display(devFeatures)
display(testFeatures)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m

Function get_feature_names is deprecated; get_feature_names is deprecated in 1.0 and will be removed in 1.2. Please use get_feature_names_out instead.


Function get_feature_names is deprecated; get_feature_names is deprecated in 1.0 and will be removed in 1.2. Please use get_feature_names_out instead.


Function get_feature_names is deprecated; get_feature_names is deprecated in 1.0 and will be removed in 1.2. Please use get_feature_names_out instead.


Function get_feature_names is deprecated; get_feature_names is deprecated in 1.0 and will be removed in 1.2. Please use get_feature_names_out instead.


Function get_feature_names is deprecated; get_feature_names is deprecated in 1.0 and will be removed in 1.2. Please use get_feature_names_out instead.


Function get_feature_names is deprecated; get_feature_names is deprecated in 1.0 and will be removed in 1.2. Please use get_feature_names_out instead.


Function get_fea

containment_1
containment_2
containment_3
containment_4
containment_5



Function get_feature_names is deprecated; get_feature_names is deprecated in 1.0 and will be removed in 1.2. Please use get_feature_names_out instead.


Function get_feature_names is deprecated; get_feature_names is deprecated in 1.0 and will be removed in 1.2. Please use get_feature_names_out instead.


Function get_feature_names is deprecated; get_feature_names is deprecated in 1.0 and will be removed in 1.2. Please use get_feature_names_out instead.


Function get_feature_names is deprecated; get_feature_names is deprecated in 1.0 and will be removed in 1.2. Please use get_feature_names_out instead.


Function get_feature_names is deprecated; get_feature_names is deprecated in 1.0 and will be removed in 1.2. Please use get_feature_names_out instead.


Function get_feature_names is deprecated; get_feature_names is deprecated in 1.0 and will be removed in 1.2. Please use get_feature_names_out instead.


Function get_feature_names is deprecated; get_feature_names is deprecated in 1.0 

containment_1
containment_2
containment_3
containment_4
containment_5



Function get_feature_names is deprecated; get_feature_names is deprecated in 1.0 and will be removed in 1.2. Please use get_feature_names_out instead.


Function get_feature_names is deprecated; get_feature_names is deprecated in 1.0 and will be removed in 1.2. Please use get_feature_names_out instead.


Function get_feature_names is deprecated; get_feature_names is deprecated in 1.0 and will be removed in 1.2. Please use get_feature_names_out instead.


Function get_feature_names is deprecated; get_feature_names is deprecated in 1.0 and will be removed in 1.2. Please use get_feature_names_out instead.


Function get_feature_names is deprecated; get_feature_names is deprecated in 1.0 and will be removed in 1.2. Please use get_feature_names_out instead.


Function get_feature_names is deprecated; get_feature_names is deprecated in 1.0 and will be removed in 1.2. Please use get_feature_names_out instead.


Function get_feature_names is deprecated; get_feature_names is deprecated in 1.0 

containment_1
containment_2
containment_3
containment_4
containment_5


Unnamed: 0,golden_label,sentence1_len,sentence2_len,sentence_len_diff,sentence1_words,sentence2_words,word_common,word_total,word_share,minimumED,euclid_dist,cos_sim,lcsNorm,containment_1,containment_2,containment_3,containment_4,containment_5
0,1,163,168,5,22,24,18.0,43.0,0.418605,0.927492,2.645751,0.875595,0.863636,0.904762,0.800000,0.736842,0.666667,0.588235
1,1,160,157,3,34,31,26.0,54.0,0.481481,0.990536,1.732051,0.958603,0.933333,0.925926,0.884615,0.840000,0.791667,0.739130
2,1,96,85,11,18,16,12.0,31.0,0.387097,0.911602,2.828427,0.804030,0.722222,0.722222,0.588235,0.500000,0.400000,0.357143
3,0,111,76,35,18,14,9.0,30.0,0.300000,0.620321,3.316625,0.639602,0.444444,0.500000,0.400000,0.285714,0.230769,0.166667
4,1,135,157,22,25,29,18.0,46.0,0.391304,0.842466,4.000000,0.778981,0.791667,0.791667,0.695652,0.681818,0.666667,0.650000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4072,1,75,99,24,15,19,10.0,28.0,0.357143,0.787356,2.828427,0.793884,0.769231,0.846154,0.583333,0.363636,0.200000,0.111111
4073,1,108,112,4,23,23,17.0,40.0,0.425000,0.931818,2.449490,0.823529,0.850000,0.823529,0.687500,0.600000,0.571429,0.538462
4074,1,143,153,10,27,28,22.0,50.0,0.440000,0.915541,2.645751,0.878410,0.880000,0.880000,0.750000,0.695652,0.636364,0.571429
4075,1,142,122,20,29,23,15.0,48.0,0.312500,0.768939,5.099020,0.587957,0.464286,0.500000,0.296296,0.269231,0.240000,0.208333


Unnamed: 0,golden_label,sentence1_len,sentence2_len,sentence_len_diff,sentence1_words,sentence2_words,word_common,word_total,word_share,minimumED,euclid_dist,cos_sim,lcsNorm,containment_1,containment_2,containment_3,containment_4,containment_5
0,0,103,106,3,17,17,8.0,33.0,0.242424,0.631579,4.242641,0.471405,0.470588,0.437500,0.333333,0.285714,0.230769,0.166667
1,1,145,180,35,24,30,18.0,46.0,0.391304,0.846154,3.464102,0.831632,0.826087,0.863636,0.809524,0.750000,0.736842,0.722222
2,1,67,59,8,14,11,7.0,25.0,0.280000,0.738095,3.000000,0.577350,0.538462,0.500000,0.363636,0.300000,0.222222,0.125000
3,1,117,99,18,25,17,14.0,39.0,0.358974,0.847222,3.741657,0.646058,0.590909,0.571429,0.400000,0.263158,0.166667,0.117647
4,1,103,116,13,18,21,12.0,37.0,0.324324,0.789954,3.872983,0.643596,0.588235,0.647059,0.437500,0.333333,0.214286,0.153846
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
719,0,97,104,7,19,19,9.0,34.0,0.264706,0.776119,4.242641,0.438357,0.470588,0.466667,0.214286,0.153846,0.083333,0.000000
720,0,99,103,4,21,19,11.0,37.0,0.297297,0.871287,3.741657,0.631579,0.550000,0.588235,0.375000,0.266667,0.214286,0.153846
721,1,112,127,15,19,22,14.0,38.0,0.368421,0.799163,3.162278,0.765384,0.684211,0.736842,0.500000,0.411765,0.375000,0.333333
722,1,89,89,0,16,15,10.0,27.0,0.370370,0.808989,2.645751,0.774597,0.714286,0.714286,0.692308,0.666667,0.636364,0.600000


Unnamed: 0,sentence1_len,sentence2_len,sentence_len_diff,sentence1_words,sentence2_words,word_common,word_total,word_share,minimumED,euclid_dist,cos_sim,lcsNorm,containment_1,containment_2,containment_3,containment_4,containment_5
0,81,114,33,14,18,11.0,28.0,0.392857,0.758974,2.449490,0.804400,0.636364,0.909091,0.600000,0.333333,0.250000,0.142857
1,172,166,6,25,27,19.0,48.0,0.395833,0.893491,3.000000,0.852574,0.840000,0.833333,0.739130,0.681818,0.619048,0.550000
2,141,121,20,25,22,12.0,40.0,0.300000,0.744275,4.000000,0.619751,0.571429,0.550000,0.368421,0.166667,0.058824,0.000000
3,64,59,5,13,12,5.0,24.0,0.208333,0.560976,3.605551,0.381385,0.363636,0.363636,0.300000,0.222222,0.125000,0.000000
4,93,126,33,18,25,13.0,39.0,0.333333,0.803653,3.872983,0.712052,0.705882,0.764706,0.562500,0.400000,0.214286,0.076923
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,78,74,4,16,15,6.0,28.0,0.214286,0.657895,3.741657,0.418121,0.384615,0.454545,0.200000,0.111111,0.000000,0.000000
996,62,96,34,10,16,7.0,25.0,0.280000,0.702532,3.605551,0.536875,0.600000,0.700000,0.333333,0.125000,0.000000,0.000000
997,124,114,10,23,23,10.0,36.0,0.277778,0.806723,4.242641,0.625000,0.550000,0.550000,0.421053,0.388889,0.352941,0.312500
998,90,87,3,18,15,7.0,29.0,0.241379,0.706215,3.741657,0.562500,0.466667,0.500000,0.230769,0.083333,0.000000,0.000000


In [448]:

# Create correlation matrix for just Features to determine different models to test
corr_matrix = trainFeatures.corr().abs().round(2)

# display shows all of a dataframe
display(corr_matrix)

Unnamed: 0,golden_label,sentence1_len,sentence2_len,sentence_len_diff,sentence1_words,sentence2_words,word_common,word_total,word_share,minimumED,euclid_dist,cos_sim,lcsNorm,containment_1,containment_2,containment_3,containment_4,containment_5
golden_label,1.0,0.17,0.17,0.15,0.15,0.15,0.32,0.17,0.4,0.34,0.32,0.39,0.33,0.36,0.28,0.23,0.19,0.17
sentence1_len,0.17,1.0,0.75,0.12,0.9,0.65,0.76,0.85,0.46,0.4,0.05,0.47,0.31,0.27,0.37,0.39,0.37,0.36
sentence2_len,0.17,0.75,1.0,0.12,0.66,0.9,0.76,0.86,0.45,0.4,0.06,0.47,0.55,0.54,0.53,0.51,0.49,0.46
sentence_len_diff,0.15,0.12,0.12,1.0,0.1,0.1,0.05,0.11,0.01,0.18,0.15,0.03,0.07,0.02,0.08,0.1,0.11,0.11
sentence1_words,0.15,0.9,0.66,0.1,1.0,0.71,0.8,0.88,0.5,0.41,0.03,0.5,0.33,0.28,0.39,0.41,0.4,0.39
sentence2_words,0.15,0.65,0.9,0.1,0.71,1.0,0.8,0.88,0.5,0.41,0.04,0.49,0.6,0.58,0.57,0.55,0.52,0.5
word_common,0.32,0.76,0.76,0.05,0.8,0.8,1.0,0.88,0.85,0.62,0.38,0.8,0.73,0.75,0.76,0.72,0.69,0.65
word_total,0.17,0.85,0.86,0.11,0.88,0.88,0.88,1.0,0.52,0.45,0.05,0.5,0.49,0.45,0.5,0.5,0.48,0.46
word_share,0.4,0.46,0.45,0.01,0.5,0.5,0.85,0.52,1.0,0.63,0.72,0.94,0.8,0.88,0.84,0.77,0.72,0.67
minimumED,0.34,0.4,0.4,0.18,0.41,0.41,0.62,0.45,0.63,1.0,0.4,0.61,0.73,0.57,0.63,0.61,0.58,0.55


In [449]:
selected_features = ["golden_label","lcsNorm", "minimumED", "containment_1","containment_3","sentence1_len", "sentence2_len", "sentence_len_diff", "euclid_dist", "cos_sim", "word_common", "word_total", "word_share", "sentence1_words", "sentence2_words"]
selected_featuresTest = ["lcsNorm", "minimumED", "containment_1","containment_3","sentence1_len", "sentence2_len", "sentence_len_diff", "euclid_dist", "cos_sim", "word_common", "word_total", "word_share", "sentence1_words", "sentence2_words"]

trainFeaturesU = trainFeatures.filter(selected_features, axis=1)
devFeaturesU = devFeatures.filter(selected_features, axis=1)
testFeaturesU = testFeatures.filter(selected_featuresTest, axis=1)

In [451]:

normalizer = StandardScaler()

from sklearn.svm import LinearSVC
train_y = trainFeaturesU.iloc[:,0]
train_x = trainFeaturesU.iloc[:,1:]
train_x = normalizer.fit_transform(train_x)

dev_y = devFeaturesU.iloc[:,0]
dev_x = devFeaturesU.iloc[:,1:]
dev_x = normalizer.transform(dev_x)

pipe = make_pipeline(StandardScaler(), SVC(kernel='linear', C=1, class_weight='balanced'))
pipe.fit(train_x, train_y)

dev_y_pred = pipe.predict(dev_x)

acc = accuracy_score(dev_y, dev_y_pred)

print("SVM acc",acc)


SVM acc 0.7320441988950276


In [452]:


test_x = testFeaturesU.iloc[:,0:]
test_x = normalizer.fit_transform(test_x)

test_y_preds = pipe.predict(test_x)

ids = testS['id'].to_list()
prediction = pd.DataFrame()
prediction["id"] = testS['id']
prediction["output"] = test_y_preds

prediction.to_csv('/content/sample_data/midterm/ChristineCho_test_result.txt', sep=' ', index=False, header=False)