In [1]:
# Library imports
import pandas as pd
import matplotlib.pyplot as plt

from nltk import PorterStemmer, WordNetLemmatizer
import codecs

import torch
from torch.utils.data import DataLoader, Dataset, Sampler
from tqdm import tqdm
import torch.nn as nn

import re

import numpy as np
from numpy import dot
from numpy.linalg import norm



# import torch.optim as optim
# import torch.nn.functional as F

def fix_seed(seed=420.69):
  torch.manual_seed(seed)
  torch.cuda.manual_seed(seed)
#   np.random.seed(seed)
#   random.seed(seed)

fix_seed()

data_path = '../data'
embeddings_path = '../word_embeddings'


In [2]:
if not torch.cuda.is_available():
  print('WARNING: You may want to change the runtime to GPU for faster training!')
  DEVICE = 'cpu'
else:
  DEVICE = 'cuda:0'

print(DEVICE)

cuda:0


# Importing the Data

(taken form binary-classification.ipynb)

In [3]:
# Import Data
train_data_path = f'{data_path}/dontpatronizeme_pcl.tsv'
test_data_path  = f'{data_path}/task4_test.tsv'

train_data = pd.read_csv(train_data_path, delimiter='\t', skiprows=4, header=None, names=['par_id','art_id','keyword','country_code', 'text','label'])
test_data  = pd.read_csv(test_data_path,  delimiter='\t', skiprows=4, header=None, names=['par_id','art_id','keyword','country_code', 'text'])

train_data = train_data.drop(['art_id'], axis=1)
test_data = test_data.drop(['art_id'], axis=1)

In [4]:
# Concatenate label information to train data
dev_label_path   = f'{data_path}/dev_semeval_parids-labels.csv'
train_label_path = f'{data_path}/train_semeval_parids-labels.csv'

dev_label   = pd.read_csv(dev_label_path, delimiter=',')
train_label = pd.read_csv(train_label_path, delimiter=',')

detailed_labels = pd.concat([dev_label, train_label], ignore_index=True, join='inner', names=['simple', 'detailed'])
train_data = pd.merge(train_data, detailed_labels, on='par_id')
train_data = train_data.rename(columns={'label_x': 'label', 'label_y': 'label_detailed'})

train_data = train_data.drop('par_id', axis=1)
test_data = test_data.drop('par_id', axis=1)

In [5]:
# Add Binary Classification column for ease of checking
train_data.loc[:, 'is_patronizing'] = False
train_data.loc[train_data['label'].isin([2,3,4]), 'is_patronizing'] = True

In [6]:
train_data = train_data.dropna(subset = 'text')
test_data = test_data.dropna(subset = 'text')

# Pre-processing ideas

## Synonym word replacement

- first article: https://towardsdatascience.com/data-preprocessing-in-nlp-c371d53ba3e0
- how to get list of synonyms: https://stackoverflow.com/questions/19258652/how-to-get-synonyms-from-nltk-wordnet-python

In [71]:
# For getting a list of synonyms for a word

from itertools import chain
from nltk.corpus import wordnet

source_word_for_synonym = 'desperate'

synonyms = wordnet.synsets(source_word_for_synonym)
lemmas = set(chain.from_iterable([word.lemma_names() for word in synonyms])) # returns a sorted list of synonyms (including the source word also)

lemmas_list = list(lemmas)
lemmas_list

['do-or-die', 'heroic', 'dire', 'desperate', 'despairing']

In [78]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

# Calculate cosine similarity
vectorizer = CountVectorizer().fit_transform(lemmas_list)
cosine_similarities = cosine_similarity(vectorizer)

print(cosine_similarities)

print(cosine_similarities[0, 1:])

# Find the index of the word with the highest similarity (excluding the source word itself)
most_similar_index = cosine_similarities[0, 1:].argmax()

# Get the most similar word
most_similar_word = lemmas_list[most_similar_index]

print(f"The most similar word to '{source_word_for_synonym}' is '{most_similar_word}' with a cosine similarity of {cosine_similarities[0, most_similar_index + 1]:.2f}")


[[1. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0.]
 [0. 0. 1. 0. 0.]
 [0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 1.]]
[0. 0. 0. 0.]
The most similar word to 'desperate' is 'do-or-die' with a cosine similarity of 0.00


Example from https://stackoverflow.com/questions/73791396/python-cosine-similarity-between-sentences-with-synonyms

In [81]:
import math
import re
from collections import Counter

WORD = re.compile(r"\w+")    
def get_cosine(vec1, vec2):
    intersection = set(vec1.keys()) & set(vec2.keys())
    numerator = sum([vec1[x] * vec2[x] for x in intersection])

    sum1 = sum([vec1[x] ** 2 for x in list(vec1.keys())])
    sum2 = sum([vec2[x] ** 2 for x in list(vec2.keys())])
    denominator = math.sqrt(sum1) * math.sqrt(sum2)

    if not denominator:
        return 0.0
    else:
        return float(numerator) / denominator    
def text_to_vector(text):
    words = WORD.findall(text)
    return Counter(words)

synonyms = {"India": "Hindustan",
            "USA": "America",}    

def map_synon(text):
    return ' '.join([w if not w in synonyms  else synonyms[w] for w in text.split(' ')])
text2 = "I live in India"
text2 = map_synon(text2)

sentences = ["He belongs to USA", 
            "Hindustan is synonym of my country name",
            "USA and America is same",
            "You live in a great country.",
            "All countries are great to live",]    
cosinetolist = []


for i in sentences:
    vector1 = text_to_vector(map_synon(i))
    vector2 = text_to_vector(text2) 
    cosine = get_cosine(vector1, vector2)
    cosinetolist.append((cosine,i,))

l = cosinetolist

for r in l:
    print(r)

(0.0, 'He belongs to USA')
(0.1889822365046136, 'Hindustan is synonym of my country name')
(0.0, 'USA and America is same')
(0.4082482904638631, 'You live in a great country.')
(0.20412414523193154, 'All countries are great to live')


From https://www.machinelearningplus.com/nlp/cosine-similarity/?utm_content=cmp-true

In [84]:
# Define the documents
doc_trump = "Mr. Trump became president after winning the political election. Though he lost the support of some republican friends, Trump is friends with President Putin"

doc_election = "President Trump says Putin had no political interference is the election outcome. He says it was a witchhunt by political parties. He claimed President Putin is a friend who had nothing to do with the election"

doc_putin = "Post elections, Vladimir Putin became President of Russia. President Putin had served as the Prime Minister earlier in his political career"

documents = [doc_trump, doc_election, doc_putin]

# Scikit Learn
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

# Create the Document Term Matrix
count_vectorizer = CountVectorizer(stop_words='english')
count_vectorizer = CountVectorizer()
sparse_matrix = count_vectorizer.fit_transform(documents)

# OPTIONAL: Convert Sparse Matrix to Pandas Dataframe if you want to see the word frequencies.
doc_term_matrix = sparse_matrix.todense()
df = pd.DataFrame(doc_term_matrix, 
                  columns=count_vectorizer.get_feature_names(), 
                  index=['doc_trump', 'doc_election', 'doc_putin'])
df

# Compute Cosine Similarity
from sklearn.metrics.pairwise import cosine_similarity
print(cosine_similarity(df, df))
#> [[ 1.          0.48927489  0.37139068]
#>  [ 0.48927489  1.          0.38829014]
#>  [ 0.37139068  0.38829014  1.        ]]

AttributeError: 'CountVectorizer' object has no attribute 'get_feature_names'