In [25]:
import pandas as pd
with open('train/english_nuig.tsv', 'rb') as en:
    res = pd.read_csv(en, sep='\t', header=None)


In [26]:
res

Unnamed: 0,0,1,2,3,4
0,squall,verb,blow in a squall,"to cry out; to scream or cry violently, as a w...",none
1,squall,verb,"make high-pitched, whiney noises","to cry out; to scream or cry violently, as a w...",none
2,squall,verb,utter a sudden loud cry,"to cry out; to scream or cry violently, as a w...",exact
3,tumid,adjective,abnormally distended especially by fluids or gas,"swelled, enlarged, or distended;",exact
4,tumid,adjective,abnormally distended especially by fluids or gas,rising above the level; protuberant.,none
...,...,...,...,...,...
8332,offer,verb,mount or put up,to make an attempt; to make an essay or a tria...,none
8333,offer,verb,mount or put up,to present in words; to proffer; to make a pro...,none
8334,offer,verb,mount or put up,to attempt; to undertake.,none
8335,offer,verb,mount or put up,"to bid, as a price, reward, or wages;",none


In [28]:
def related(rel):
    if rel in ['broader','narrower','exact']:
        rel = 'related'
    else:
        rel = 'none'
        
    return rel

        


In [29]:
res[4] = res[4].map(lambda rel: related(rel))

In [30]:
res

Unnamed: 0,0,1,2,3,4
0,squall,verb,blow in a squall,"to cry out; to scream or cry violently, as a w...",none
1,squall,verb,"make high-pitched, whiney noises","to cry out; to scream or cry violently, as a w...",none
2,squall,verb,utter a sudden loud cry,"to cry out; to scream or cry violently, as a w...",related
3,tumid,adjective,abnormally distended especially by fluids or gas,"swelled, enlarged, or distended;",related
4,tumid,adjective,abnormally distended especially by fluids or gas,rising above the level; protuberant.,none
...,...,...,...,...,...
8332,offer,verb,mount or put up,to make an attempt; to make an essay or a tria...,none
8333,offer,verb,mount or put up,to present in words; to proffer; to make a pro...,none
8334,offer,verb,mount or put up,to attempt; to undertake.,none
8335,offer,verb,mount or put up,"to bid, as a price, reward, or wages;",none


In [31]:
res.to_csv('english_nuig_binary.tsv',sep='\t', index = False, header=False)

In [85]:
import os
os.chdir('/Users/seungbinyim/Development/repos/codalab/Codalab')
from classifier_config import ClassifierConfig
from feature_extractor import FeatureExtractor
from sklearn.tree import DecisionTreeClassifier
from model_trainer import ModelTrainer
from wsa_classifier import WordSenseAlignmentClassifier
from copy import deepcopy
from spacy_wordnet.wordnet_annotator import WordnetAnnotator
from nltk.corpus import wordnet as wn
import warnings
warnings.filterwarnings('ignore')
from sklearn.ensemble import RandomForestClassifier

In [86]:
english_config = ClassifierConfig('en_core_web_lg', "english", 'data/train', balancing_strategy="none",
                                      testset_ratio=0.0, with_wordnet= True, dataset='english_nuig', logger = 'en_nuig')
feature_extractor = FeatureExtractor() \
        .diff_pos_count() \
        .ont_hot_pos() \
        .matching_lemma() \
        .count_each_pos() \
        .avg_count_synsets() \
        .difference_in_length()\
        .similarity_diff_to_target()\
        .max_dependency_tree_depth() \
        .target_word_synset_count()\
        .token_count_norm_diff()\
        .semicol_count()\
    #.elmo_similarity()

rf = {
    'estimator': RandomForestClassifier(),
    'parameters': {
        'class_weight': ['balanced', 'balanced_subsample', ],
        'max_depth': [10, 20],
        'max_features': ['auto', 'sqrt', 'log2', None],
        'min_samples_leaf': [2],
        'min_samples_split': [5, 10],
        'n_estimators': [300, 800],
        'n_jobs':[8]
    }
}

model_trainer = ModelTrainer(english_config.testset_ratio, english_config.logger)
model_trainer.add_estimators([rf])
english_classifier = WordSenseAlignmentClassifier(english_config, feature_extractor, model_trainer)


In [73]:
data = english_classifier.load_data()

In [74]:
data_binary = deepcopy(data._data)
data_binary['relation'] = data_binary['relation'].map(lambda rel: related(rel))


In [77]:
feats = feature_extractor.extract(data_binary, ['len_diff','pos_diff'])

In [87]:
models = model_trainer.train(feats, data_binary['relation'],with_testset=False)

# Tuning hyper-parameters for precision

Performing grid search...
parameters:
{'class_weight': ['balanced', 'balanced_subsample'],
 'max_depth': [10, 20],
 'max_features': ['auto', 'sqrt', 'log2', None],
 'min_samples_leaf': [2],
 'min_samples_split': [5, 10],
 'n_estimators': [300, 800],
 'n_jobs': [8]}
Fitting 5 folds for each of 64 candidates, totalling 320 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:   14.8s
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:  3.4min
[Parallel(n_jobs=-1)]: Done 320 out of 320 | elapsed:  6.9min finished


# Tuning hyper-parameters for recall

Performing grid search...
parameters:
{'class_weight': ['balanced', 'balanced_subsample'],
 'max_depth': [10, 20],
 'max_features': ['auto', 'sqrt', 'log2', None],
 'min_samples_leaf': [2],
 'min_samples_split': [5, 10],
 'n_estimators': [300, 800],
 'n_jobs': [8]}
Fitting 5 folds for each of 64 candidates, totalling 320 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:   18.6s
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:  3.6min
[Parallel(n_jobs=-1)]: Done 320 out of 320 | elapsed:  7.2min finished


# Tuning hyper-parameters for f1

Performing grid search...
parameters:
{'class_weight': ['balanced', 'balanced_subsample'],
 'max_depth': [10, 20],
 'max_features': ['auto', 'sqrt', 'log2', None],
 'min_samples_leaf': [2],
 'min_samples_split': [5, 10],
 'n_estimators': [300, 800],
 'n_jobs': [8]}
Fitting 5 folds for each of 64 candidates, totalling 320 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:   20.1s
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:  3.8min
[Parallel(n_jobs=-1)]: Done 320 out of 320 | elapsed:  7.5min finished


In [88]:
models

[RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight='balanced',
                        criterion='gini', max_depth=10, max_features='log2',
                        max_leaf_nodes=None, max_samples=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=2, min_samples_split=10,
                        min_weight_fraction_leaf=0.0, n_estimators=800, n_jobs=8,
                        oob_score=False, random_state=None, verbose=0,
                        warm_start=False),
 RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight='balanced',
                        criterion='gini', max_depth=20, max_features='sqrt',
                        max_leaf_nodes=None, max_samples=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=2, min_samples_split=5,
                        min_weight_fraction_leaf=0.0, n_estimators=800, n_jobs=8,

In [89]:
best_model = models[2]

In [98]:
probs = best_model.predict_proba(feats)
probs_df = pd.DataFrame(probs)
related_vec = probs_df[1]
related_vec.name = 'relatedness'
feats_related = feats.join(related_vec)
feats_related

Unnamed: 0,first_word_same,similarities,pos_diff,cos_tfidf,verb,adverb,adjective,noun,lemma_match_normalized,AUX,...,len_diff,simdiff_to_target,max_depth_deptree_1,max_depth_deptree_2,target_word_synset_count,token_count_norm_diff,semicol_count1_norm,semicol_count2_norm,semicol_diff,relatedness
0,False,0.690449,-1.047162,0.000000,1.0,0.0,0.0,0.0,0.000000,0.200626,...,-0.752110,0.690925,3,5,3,-0.770526,0.0,1.107833,-1.107833,0.052124
1,False,0.717846,-0.689953,0.000000,1.0,0.0,0.0,0.0,0.000000,0.200626,...,-0.752110,-0.042160,3,5,3,-0.413021,0.0,1.107833,-1.107833,0.082992
2,False,0.788656,-1.047162,0.273604,1.0,0.0,0.0,0.0,0.076923,0.200626,...,-0.533028,0.013535,2,5,3,-0.651358,0.0,1.107833,-1.107833,0.827039
3,False,0.684154,1.453301,0.296092,0.0,0.0,1.0,0.0,0.125000,0.200626,...,-0.971192,0.000000,3,2,3,0.470411,0.0,0.553917,-0.553917,0.780044
4,False,0.600150,0.381674,0.000000,0.0,0.0,1.0,0.0,0.000000,0.200626,...,-0.971192,0.000000,3,4,3,0.470411,0.0,0.553917,-0.553917,0.061014
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8332,False,0.771537,-0.332744,0.000000,1.0,0.0,0.0,0.0,0.000000,0.200626,...,-0.752110,-0.139813,2,4,13,-0.406760,0.0,1.107833,-1.107833,0.020648
8333,False,0.736976,-1.047162,0.000000,1.0,0.0,0.0,0.0,0.000000,0.200626,...,-0.752110,-0.826633,2,4,13,-1.498058,0.0,3.323500,-3.323500,0.050839
8334,False,0.682163,1.096092,0.000000,1.0,0.0,0.0,0.0,0.000000,0.200626,...,-0.752110,-0.205281,2,2,13,0.164873,0.0,0.553917,-0.553917,0.035871
8335,False,0.671622,-0.332744,0.000000,1.0,0.0,0.0,0.0,0.000000,0.200626,...,-0.752110,-0.324619,2,4,13,-0.146927,0.0,0.553917,-0.553917,0.057794


In [150]:
models_5class = model_trainer.train(feats_related, data._data['relation'],with_testset=False)

# Tuning hyper-parameters for precision

Performing grid search...
parameters:
{'class_weight': ['balanced', 'balanced_subsample'],
 'max_depth': [10, 20],
 'max_features': ['auto', 'sqrt', 'log2', None],
 'min_samples_leaf': [2],
 'min_samples_split': [5, 10],
 'n_estimators': [300, 800],
 'n_jobs': [8]}
Fitting 5 folds for each of 64 candidates, totalling 320 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:   12.2s
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:  2.6min
[Parallel(n_jobs=-1)]: Done 320 out of 320 | elapsed:  5.5min finished


# Tuning hyper-parameters for recall

Performing grid search...
parameters:
{'class_weight': ['balanced', 'balanced_subsample'],
 'max_depth': [10, 20],
 'max_features': ['auto', 'sqrt', 'log2', None],
 'min_samples_leaf': [2],
 'min_samples_split': [5, 10],
 'n_estimators': [300, 800],
 'n_jobs': [8]}
Fitting 5 folds for each of 64 candidates, totalling 320 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:   17.2s
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:  2.9min
[Parallel(n_jobs=-1)]: Done 320 out of 320 | elapsed:  6.1min finished


# Tuning hyper-parameters for f1

Performing grid search...
parameters:
{'class_weight': ['balanced', 'balanced_subsample'],
 'max_depth': [10, 20],
 'max_features': ['auto', 'sqrt', 'log2', None],
 'min_samples_leaf': [2],
 'min_samples_split': [5, 10],
 'n_estimators': [300, 800],
 'n_jobs': [8]}
Fitting 5 folds for each of 64 candidates, totalling 320 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:   18.0s
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:  3.1min
[Parallel(n_jobs=-1)]: Done 320 out of 320 | elapsed:  6.5min finished


In [114]:
models_5class


[RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                        class_weight='balanced_subsample', criterion='gini',
                        max_depth=10, max_features='auto', max_leaf_nodes=None,
                        max_samples=None, min_impurity_decrease=0.0,
                        min_impurity_split=None, min_samples_leaf=2,
                        min_samples_split=5, min_weight_fraction_leaf=0.0,
                        n_estimators=300, n_jobs=8, oob_score=False,
                        random_state=None, verbose=0, warm_start=False),
 RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                        class_weight='balanced_subsample', criterion='gini',
                        max_depth=20, max_features='log2', max_leaf_nodes=None,
                        max_samples=None, min_impurity_decrease=0.0,
                        min_impurity_split=None, min_samples_leaf=2,
                        min_samples_split=5, min_weight_fraction_leaf=0.0,
        

In [115]:
test_config = ClassifierConfig('en_core_web_lg', "english", 'data/test', balancing_strategy="none",
                                      testset_ratio=0.0, with_wordnet= True, dataset='english_nuig', logger = 'en_nuig', is_testdata=True)
test_feature_extractor = FeatureExtractor() \
        .diff_pos_count() \
        .ont_hot_pos() \
        .matching_lemma() \
        .count_each_pos() \
        .avg_count_synsets() \
        .difference_in_length()\
        .similarity_diff_to_target()\
        .max_dependency_tree_depth() \
        .target_word_synset_count()\
        .token_count_norm_diff()\
        .semicol_count()\
    #.elmo_similarity()
rf = {
    'estimator': RandomForestClassifier(),
    'parameters': {
        'class_weight': ['balanced', 'balanced_subsample'],
        'max_depth': [10, 20],
        'max_features': ['auto', 'sqrt', 'log2'],
        'min_samples_leaf': [2],
        'min_samples_split': [5, 10],
        'n_estimators': [300, 800],
        'n_jobs':[8]
    }
}


test_model_trainer = ModelTrainer(test_config.testset_ratio, test_config.logger)
test_model_trainer.add_estimators([rf])
test_classifier = WordSenseAlignmentClassifier(test_config, test_feature_extractor, test_model_trainer)

In [121]:
testdata =test_classifier.load_data().get_preprocessed_data()

In [123]:
test_features = test_feature_extractor.extract(testdata, feats_to_scale = ['len_diff', 'pos_diff'])

In [124]:
testset, empty = test_model_trainer.split_data(test_features, 0.0)

In [127]:
#Predict with Binary classifier
test_binary = best_model.predict_proba(testset)

In [130]:
test_binary_rel_vec = pd.DataFrame(test_binary)[1]
test_binary_rel_vec
test_binary_rel_vec.name = 'relatedness'
test_binary_feat = testset.join(test_binary_rel_vec)

In [131]:
test_binary_feat

Unnamed: 0,first_word_same,similarities,pos_diff,cos_tfidf,verb,adverb,adjective,noun,lemma_match_normalized,AUX,...,len_diff,simdiff_to_target,max_depth_deptree_1,max_depth_deptree_2,target_word_synset_count,token_count_norm_diff,semicol_count1_norm,semicol_count2_norm,semicol_diff,relatedness
0,True,0.641546,0.686678,0.000000,0.0,0.0,1.0,0.0,0.000000,0.195073,...,-1.221630,-0.246587,1,3,3,0.001345,0.000000,1.827548,-1.827548,0.247555
1,True,0.731661,0.686678,0.347267,0.0,0.0,1.0,0.0,0.166667,0.195073,...,-0.582008,-0.190916,2,3,3,0.349914,0.000000,1.827548,-1.827548,0.683896
2,False,0.751481,2.716855,0.000000,0.0,0.0,1.0,0.0,0.000000,1.505194,...,1.336856,-0.051665,7,3,3,1.511811,6.181818,1.827548,4.354271,0.139167
3,False,0.791688,0.348315,0.000000,0.0,0.0,0.0,1.0,0.000000,0.195073,...,1.123649,0.000000,5,3,2,0.882605,0.000000,1.827548,-1.827548,0.409512
4,False,0.808421,0.348315,0.000000,0.0,0.0,0.0,1.0,0.000000,0.195073,...,0.697234,0.000000,6,3,2,0.998795,6.181818,1.827548,4.354271,0.404687
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
539,False,0.827427,-1.681863,0.000000,1.0,0.0,0.0,0.0,0.000000,-2.425169,...,0.697234,-0.097543,5,8,10,-1.499702,0.000000,0.609183,-0.609183,0.043939
540,False,0.798985,0.348315,0.000000,1.0,0.0,0.0,0.0,0.000000,0.195073,...,0.697234,-0.211726,5,3,10,0.831656,0.000000,0.609183,-0.609183,0.048368
541,False,0.890995,-1.005137,0.113132,1.0,0.0,0.0,0.0,0.038462,-1.115048,...,0.697234,-0.663050,5,10,10,-1.598908,0.000000,0.000000,0.000000,0.225948
542,False,0.691457,-0.666774,0.131586,0.0,0.0,0.0,1.0,0.050000,0.195073,...,-0.368801,-0.170843,3,5,2,-0.890169,0.000000,0.609183,-0.609183,0.458067


In [140]:
test_binary_feat

Unnamed: 0,first_word_same,similarities,pos_diff,cos_tfidf,verb,adverb,adjective,noun,lemma_match_normalized,AUX,...,len_diff,simdiff_to_target,max_depth_deptree_1,max_depth_deptree_2,target_word_synset_count,token_count_norm_diff,semicol_count1_norm,semicol_count2_norm,semicol_diff,relatedness
0,True,0.641546,0.686678,0.000000,0.0,0.0,1.0,0.0,0.000000,0.195073,...,-1.221630,-0.246587,1,3,3,0.001345,0.000000,1.827548,-1.827548,0.247555
1,True,0.731661,0.686678,0.347267,0.0,0.0,1.0,0.0,0.166667,0.195073,...,-0.582008,-0.190916,2,3,3,0.349914,0.000000,1.827548,-1.827548,0.683896
2,False,0.751481,2.716855,0.000000,0.0,0.0,1.0,0.0,0.000000,1.505194,...,1.336856,-0.051665,7,3,3,1.511811,6.181818,1.827548,4.354271,0.139167
3,False,0.791688,0.348315,0.000000,0.0,0.0,0.0,1.0,0.000000,0.195073,...,1.123649,0.000000,5,3,2,0.882605,0.000000,1.827548,-1.827548,0.409512
4,False,0.808421,0.348315,0.000000,0.0,0.0,0.0,1.0,0.000000,0.195073,...,0.697234,0.000000,6,3,2,0.998795,6.181818,1.827548,4.354271,0.404687
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
539,False,0.827427,-1.681863,0.000000,1.0,0.0,0.0,0.0,0.000000,-2.425169,...,0.697234,-0.097543,5,8,10,-1.499702,0.000000,0.609183,-0.609183,0.043939
540,False,0.798985,0.348315,0.000000,1.0,0.0,0.0,0.0,0.000000,0.195073,...,0.697234,-0.211726,5,3,10,0.831656,0.000000,0.609183,-0.609183,0.048368
541,False,0.890995,-1.005137,0.113132,1.0,0.0,0.0,0.0,0.038462,-1.115048,...,0.697234,-0.663050,5,10,10,-1.598908,0.000000,0.000000,0.000000,0.225948
542,False,0.691457,-0.666774,0.131586,0.0,0.0,0.0,1.0,0.050000,0.195073,...,-0.368801,-0.170843,3,5,2,-0.890169,0.000000,0.609183,-0.609183,0.458067


In [153]:
#Predict with 5 class
five_class_result = models_5class[2].predict(test_binary_feat)

In [156]:
result_series = pd.Series(five_class_result)

In [157]:
testdata['relation'] = result_series
five_class_predcted = testdata[['word','pos','def1','def2','relation']]

In [159]:
five_class_predcted.to_csv('doublelayer.tsv',sep='\t', index = False, header = False)

In [160]:
five_class_predcted


Unnamed: 0,word,pos,def1,def2,relation
0,unusual,adjective,not commonly encountered,not usual; uncommon; rare;,none
1,unusual,adjective,not usual or common or ordinary,not usual; uncommon; rare;,exact
2,unusual,adjective,being definitely out of the ordinary and unexp...,not usual; uncommon; rare;,none
3,tramper,noun,someone who walks with a heavy noisy gait or w...,one who tramps; a stroller; a vagrant or vagab...,none
4,tramper,noun,a foot traveler; someone who goes on an extend...,one who tramps; a stroller; a vagrant or vagab...,none
...,...,...,...,...,...
539,register,verb,record in a public office or in a court of law,"to correspond in relative position; , when t...",none
540,register,verb,record in a public office or in a court of law,to enroll; to enter in a list.,none
541,register,verb,record in a public office or in a court of law,to enter the name of the owner of (a share of ...,none
542,cottonwood,noun,American basswood of the Allegheny region,"an american tree of the genus or poplar, havi...",none
