# Shared Task
## Preparations

In [67]:
import pandas as pd
import spacy
import os
import csv
import nltk

nltk.download('wordnet')
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

from nltk.corpus import wordnet as wn
from tqdm import tqdm_notebook as tqdm
from pprint import pprint
from spacy import displacy
from enum import Enum

nlp = spacy.load('en_core_web_lg')
pd.set_option('display.max_colwidth',-1)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\syim\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


**LOAD DATA**

In [2]:
folder = '../Documents/ELEXIS/codalab/public_dat/train'

#all_data = pd.read_csv('../Documents/ELEXIS/codalab/public_dat/train/english_kd.tsv',sep='\t', header=None)
all_data = pd.read_csv(folder + '/' + 'english_kd.tsv', sep='\t', header=None)
print(all_data)

             0            1                                     2  \
0    off        preposition  away from and no longer touching       
1    off        preposition  away from and no longer touching       
2    off        preposition  away from and no longer touching       
3    off        preposition  in a position away from                
4    off        preposition  in a position away from                
..   ...                ...                      ...                
551  on         preposition  immediately following                  
552  one        number       the number 1                           
553  one        number       the number 1                           
554  offspring  noun         a person's child or an animal's baby   
555  offspring  noun         a person's child or an animal's baby   

                                                  3         4  
0    away from; down from                            narrower  
1    not wanting or allowed to have (food e

In [3]:
def add_column_names(df):
    column_names = ['word','pos','def1','def2','relation']
    df.columns = column_names
    
def load_data(file_path):
    loaded_data = pd.read_csv(file_path, sep='\t', header=None)
    add_column_names(loaded_data)
    
    return loaded_data

def load_training_data(folder):
    all_data = {}
    
    for filename in os.listdir(folder):
        if filename.endswith(".tsv"):
            all_data[filename.split('.')[0]] = load_data(folder + '/' + filename)
    
    return all_data

def train_classifier(data):
    for lang in data:
        print(lang)
        
        analyze_by_class(data[lang])

        train_set, test_set = prepare_data(data[lang])
        
        classifier = nltk.NaiveBayesClassifier.train(train_set)
        
        print(nltk.classify.accuracy(classifier, test_set))
        classifier.show_most_informative_features(5)
        print('\n')

def analyze_by_class(dataset):
    separated = dict()
    
    for vector in dataset:
        class_value = vector[1]
        if (class_value not in separated):
            separated[class_value] = list()
        separated[class_value].append(vector[0])

    for s in separated:
        print(s, len(separated[s]))

    return separated

def prepare_data(dataset):
    featuresets = [(find_features(row), label) for (row, label) in dataset]
    f = int(len(featuresets) / 5)
    print(f)
    train_set, test_set = featuresets[f:], featuresets[:f]
    print(len(train_set), len(test_set))
    return train_set, test_set



def find_features(row):
    features = {}
    features['first_word_same'] = (row['def1'].split(' ')[0].lower() == row['def2'].split(' ')[0].lower())
    features['len difference'] = abs(len(row['def1'].split(' ')) - len(row['def2'].split(' ')[0]))

    wordmatch = 0
    for word in row['def1'].split(' ')[0].lower():
        if word in row['def2'].lower():
            wordmatch+=1

    features['wordmatch'] = wordmatch

    features['synsets'] = len(wn.synsets(row['lemma'])) #for specific pos e.g. wn.synsets('dog', pos=wn.VERB)

    #if features['synsets'] == 0:
    #    print('no synset for ',row['lemma']) TODO MULTILIGNUAL WN

    return features

all_data = load_training_data(folder)
en_data = all_data['english_kd']
print(set(en_data['relation']))



{'broader', 'narrower', 'none', 'related', 'exact'}


**Add Text Classifier to the pipeline**

**Print only narrower relations**

In [4]:
#df['def1']=df['def1'].str.wrap(20)
is_narrower = en_data['relation']=='narrower'
print(en_data[is_narrower])

         word          pos  \
0    off       preposition   
3    off       preposition   
15   off       adverb        
47   off       adverb        
67   off       adverb        
113  offer     verb          
114  offer     verb          
133  officer   adjective     
138  official  adjective     
143  offline   adjective     
152  oil       noun          
153  oil       noun          
241  only      conjunction   
242  onward    adverb        
243  onward    adverb        
244  opaque    adjective     
245  opaque    adjective     
319  open      verb          
321  open      verb          
323  open      verb          
328  office    noun          
338  oily      adjective     
385  on        preposition   
465  on        preposition   

                                                                                      def1  \
0    away from and no longer touching                                                        
3    in a position away from                                 

**Run Spacy NLP Pipeline**

In [5]:
def spacyDocForVec(vec):
    doc_list = []
    
    for doc in tqdm(vec):
        pr = nlp(doc)
        doc_list.append(pr)
    
    return doc_list
    
doc_list = spacyDocForVec(en_data['def1'])
doc_list2 = spacyDocForVec(en_data['def2'])

HBox(children=(IntProgress(value=0, max=556), HTML(value='')))




HBox(children=(IntProgress(value=0, max=556), HTML(value='')))




In [6]:
#df['def1_nlp']=doc_list
#df['def2_nlp']=doc_list2

In [7]:
t1 = doc_list2[0]
for token in doc_list2[0]:
    print(token.text, "| lemma:", token.lemma_, "| norm:" , token.norm_, "| pos:" ,token.pos_, "| tag:", token.tag_, "| dep:", token.dep_, "| sentiment:", token.sentiment)

doc_list2[0]

away | lemma: away | norm: away | pos: ADV | tag: RB | dep: ROOT | sentiment: 0.0
from | lemma: from | norm: from | pos: ADP | tag: IN | dep: prep | sentiment: 0.0
; | lemma: ; | norm: ; | pos: PUNCT | tag: : | dep: punct | sentiment: 0.0
down | lemma: down | norm: down | pos: ADV | tag: RB | dep: advmod | sentiment: 0.0
from | lemma: from | norm: from | pos: ADP | tag: IN | dep: prep | sentiment: 0.0


away from; down from

In [8]:
en_data
frame = pd.DataFrame({'doc1': doc_list, 'doc2': doc_list2})
en_data['doc1'] = frame['doc1']
en_data['doc2'] = frame['doc2']

In [54]:
def similarityVector(row):
    doc1 = row['doc1']
    doc2 = row['doc2']
    
    return doc1.similarity(doc2)
    
def first_word_same(row):
     return (row['def1'].split(' ')[0].lower() == row['def2'].split(' ')[0].lower())

def difference_in_length(row):
    return abs(len(row['def1'].split(' ')) - len(row['def2'].split(' ')[0]))


features = pd.DataFrame()

similarities = []
first_word = []
length = []

for i, row in en_data.iterrows():
    similarities.append(similarityVector(row))
    first_word.append(first_word_same(row))
    length.append(difference_in_length(row)) 

features['similarities'] = preprocessing.scale(similarities)
features['first_word_same'] = first_word
features['length'] = preprocessing.scale(length)

features

Unnamed: 0,similarities,first_word_same,length
0,1.105435,True,-0.491229
1,0.597624,False,-0.118619
2,0.323444,False,-0.118619
3,0.976754,False,-0.863840
4,0.051192,False,-0.491229
...,...,...,...
551,-0.417336,False,0.999212
552,1.472181,True,-1.236450
553,0.885584,True,-1.236450
554,1.067770,False,-0.863840


In [55]:
#en_data['similarities'] = similarities
labels = en_data['relation']

en_data.columns

Index(['word', 'pos', 'def1', 'def2', 'relation', 'doc1', 'doc2'], dtype='object')

In [74]:
X_train, X_valid, y_train, y_valid = train_test_split(features, labels, test_size = 0.2)

LR = LogisticRegression().fit(X_train, y_train)
print(LR.predict(X_valid))

print(LR.score(X_valid, y_valid))

## Linear kernal won't work very well, experiment with nonlinear ones.
SVM = svm.LinearSVC()
SVM.fit(X_train, y_train)
print(SVM.predict(X_valid))
print(round(SVM.score(X_valid,y_valid), 4))

RF = RandomForestClassifier(max_depth=3, random_state=0)
RF.fit(X_train, y_train)
print(RF.predict(X_valid))
print(round(RF.score(X_valid,y_valid), 4))




['none' 'none' 'none' 'none' 'none' 'none' 'none' 'none' 'none' 'none'
 'none' 'none' 'none' 'none' 'none' 'none' 'none' 'none' 'none' 'none'
 'none' 'none' 'none' 'none' 'none' 'none' 'none' 'none' 'none' 'none'
 'none' 'none' 'none' 'none' 'none' 'none' 'none' 'none' 'none' 'none'
 'none' 'none' 'none' 'none' 'none' 'none' 'none' 'none' 'none' 'none'
 'none' 'none' 'none' 'none' 'none' 'none' 'none' 'none' 'none' 'none'
 'none' 'none' 'none' 'none' 'none' 'none' 'none' 'none' 'none' 'none'
 'none' 'none' 'none' 'none' 'none' 'none' 'exact' 'none' 'none' 'none'
 'none' 'none' 'none' 'none' 'none' 'none' 'none' 'none' 'none' 'none'
 'none' 'none' 'none' 'none' 'none' 'none' 'none' 'none' 'none' 'none'
 'none' 'none' 'none' 'none' 'none' 'none' 'none' 'none' 'none' 'none'
 'none' 'none']
0.7589285714285714
['none' 'none' 'none' 'none' 'none' 'none' 'none' 'none' 'none' 'none'
 'none' 'none' 'none' 'none' 'none' 'none' 'none' 'none' 'none' 'none'
 'none' 'none' 'none' 'none' 'none' 'none



['none' 'none' 'none' 'none' 'none' 'none' 'none' 'none' 'none' 'none'
 'none' 'none' 'none' 'none' 'none' 'none' 'none' 'none' 'none' 'none'
 'none' 'none' 'none' 'none' 'none' 'none' 'none' 'none' 'none' 'none'
 'none' 'none' 'none' 'none' 'none' 'none' 'none' 'none' 'none' 'none'
 'none' 'none' 'none' 'none' 'none' 'none' 'none' 'none' 'none' 'none'
 'none' 'none' 'none' 'none' 'none' 'none' 'none' 'none' 'none' 'none'
 'none' 'none' 'none' 'none' 'none' 'none' 'none' 'none' 'none' 'none'
 'none' 'none' 'none' 'none' 'none' 'none' 'exact' 'exact' 'none' 'none'
 'none' 'none' 'none' 'none' 'none' 'none' 'none' 'none' 'none' 'exact'
 'none' 'none' 'none' 'none' 'none' 'none' 'none' 'none' 'none' 'none'
 'none' 'none' 'none' 'none' 'none' 'none' 'none' 'none' 'none' 'none'
 'none' 'none']
0.7679


In [23]:
if "textcat" not in nlp.pipe_names:
    textcat = nlp.create_pipe(
        "textcat", config={"exclusive_classes": True, "architecture": "simple_cnn"}
    )
    nlp.add_pipe(textcat, last=True)
    # otherwise, get it, so we can add labels to it
else:
    textcat = nlp.get_pipe("textcat")
    
textcat.add_label('related')
textcat.add_label('exact')
textcat.add_label('broader')
textcat.add_label('narrower')
textcat.add_label('none')




# TODO
## Experiment with NLP Pipeline

In [16]:
def lemmatizer(doc):
        # This takes in a doc of tokens from the NER and lemmatizes them. 
    # Pronouns (like "I" and "you" get lemmatized to '-PRON-', so I'm removing those.
    doc = [token.lemma_ for token in doc if token.lemma_ != '-PRON-']
    doc = u' '.join(doc)
    return nlp.make_doc(doc)

def remove_stopwords(doc):
    # This will remove stopwords and punctuation.
    # Use token.text to return strings, which we'll need for Gensim.
    doc = [token.text for token in doc if token.is_stop != True and token.is_punct != True]
    return doc

# The add_pipe function appends our functions to the default pipeline.
#nlp.add_pipe(lemmatizer,name='lemmatizer',after='ner')
#nlp.add_pipe(remove_stopwords, name="stopwords", last=True)