## Semi-supervised approach for Named-Entity Recognition

### Imports

In [101]:
import pandas as pd
import os
import nltk
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import f1_score, classification_report
from bs4 import BeautifulSoup
from nltk.corpus import names,gazetteers 
from sklearn_crfsuite import CRF
from sklearn_crfsuite.metrics import flat_f1_score, flat_classification_report
import re

### Importing the tagged training data consisting of GTP generated sentences and amazon reviews

In [102]:
data= pd.read_csv('Heyday_trainingdata.csv')

In [103]:
data

Unnamed: 0,Sentence_Index,Tokens,Tags
0,0,"""Hello,",O
1,0,I,O
2,0,need,O
3,0,a,O
4,0,standup,I-ATTRIBUTE
...,...,...,...
3731,337,inches),O
3732,337,of,O
3733,337,even,O
3734,337,tinier,I-ATTRIBUTE


### Data Wrangling

In [104]:
data['Sentence']= data[['Sentence_Index','Tokens','Tags']].groupby(['Sentence_Index'])['Tokens'].transform(lambda x: ' '.join(x))

In [105]:
data['Tags']= data[['Sentence_Index','Tokens','Tags']].groupby(['Sentence_Index'])['Tags'].transform(lambda x: ','.join(x))

In [106]:
data = data[['Sentence','Tags']]

In [107]:
data = data.drop_duplicates().reset_index(drop=True)

In [108]:
data

Unnamed: 0,Sentence,Tags
0,"""Hello, I need a standup shower door only, no ...","O,O,O,O,I-ATTRIBUTE,I-PRODUCT,I-PRODUCT,O,O,I-..."
1,I'm looking for a double sink kitchen close to...,"O,O,O,O,I-ATTRIBUTE,I-PRODUCT,I-PRODUCT,O,O,I-..."
2,"""I'm looking for bypass shower doors in chrome...","O,O,O,O,I-PRODUCT,I-PRODUCT,O,I-MATERIAL,O,I-M..."
3,"""I am looking at the 34""""x34"""" shower kit with...","O,O,O,O,O,I-SIZE,I-PRODUCT,I-PRODUCT,O,I-ATTRI..."
4,"""Can the 2"""" backsplash, on the 60"""" single si...","O,O,I-SIZE,I-ATTRIBUTE,O,O,I-SIZE,I-PRODUCT,I-..."
...,...,...
261,Great for kids learning to read,"O,O,O,O,O,O"
262,They weren't kidding about this being a little...,"O,O,O,O,O,O,O,O,O"
263,"Students in my office exact comment: ""These ar...","O,O,O,O,I-PRODUCT,I-PRODUCT,O,O,O"
264,love it,"O,O"


### Loading the training and validation set

In [109]:
train_dataset= data
test_dataset = pd.read_csv('un-negated_clean_data.csv')
test_dataset = test_dataset[["Tokens","Tags","sentence"]]
test_dataset = test_dataset.replace('B-','I-', regex=True)
test_dataset['Sentence']= test_dataset[['sentence','Tokens','Tags']].groupby(['sentence'])['Tokens'].transform(lambda x: ' '.join(x))
test_dataset['Tags']= test_dataset[['sentence','Tokens','Tags']].groupby(['sentence'])['Tags'].transform(lambda x: ','.join(x))
test_dataset = test_dataset[['Sentence','Tags']]
test_dataset = test_dataset.drop_duplicates().reset_index(drop=True)

In [110]:
test_dataset

Unnamed: 0,Sentence,Tags
0,I am looking for a black gloss 33 inch firecla...,"O,O,O,O,O,I-COLOUR,I-TEXTURE,I-SIZE,I-SIZE,I-P..."
1,Looking for pre workout Pump addict instead of...,"O,O,I-PRODUCT,I-PRODUCT,I-PRODUCT,I-PRODUCT,O,..."
2,i need a 48 inch glass sliding goof and a show...,"O,O,O,I-SIZE,I-SIZE,I-MATERIAL,I-PRODUCT,I-PRO..."
3,"Hello, do any of your free standing tubs have ...","O,O,O,O,O,I-ATTRIBUTE,I-ATTRIBUTE,I-PRODUCT,O,..."
4,I'm looking for a 24 inch white mirror that is...,"O,O,O,O,I-SIZE,I-SIZE,I-COLOUR,I-PRODUCT,O,O,I..."
...,...,...
162,What rectangular shower units are available?,"O,I-SHAPE,I-PRODUCT,I-PRODUCT,O,O"
163,I am looking for a 35 inch bathroom sink count...,"O,O,O,O,O,I-SIZE,I-SIZE,I-PRODUCT,I-PRODUCT,I-..."
164,I'm looking f or a Black Matte bathtub double ...,"O,O,O,O,O,I-COLOUR,I-TEXTURE,I-PRODUCT,I-PRODU..."
165,Need a 27 inch frameless shower door,"O,O,I-SIZE,I-SIZE,I-ATTRIBUTE,I-PRODUCT,I-PRODUCT"


In [111]:
def strsplit_tags(tags):
    sent_list = tags.split(",")
    return sent_list

In [112]:
def strsplit_sentence(sentence):
    sent_list = sentence.split(" ")
    return sent_list

In [113]:
train_sents = []

for index, row in train_dataset.iterrows():
    train_sents.append((strsplit_sentence(row['Sentence']),strsplit_tags(row['Tags'])))

In [114]:
dev_sents = []

for index, row in test_dataset.iterrows():
    dev_sents.append((strsplit_sentence(row['Sentence']),strsplit_tags(row['Tags'])))

### Creating features using knowledge bases

In [115]:
colors = pd.read_csv("colors.csv")

In [116]:
colors

Unnamed: 0,air_force_blue_raf,Air Force Blue (Raf),#5d8aa8,93,138,168
0,air_force_blue_usaf,Air Force Blue (Usaf),#00308f,0,48,143
1,air_superiority_blue,Air Superiority Blue,#72a0c1,114,160,193
2,alabama_crimson,Alabama Crimson,#a32638,163,38,56
3,alice_blue,Alice Blue,#f0f8ff,240,248,255
4,alizarin_crimson,Alizarin Crimson,#e32636,227,38,54
...,...,...,...,...,...,...
859,yellow_orange,Yellow Orange,#ffae42,255,174,66
860,yellow_process,Yellow (Process),#ffef00,255,239,0
861,yellow_ryb,Yellow (Ryb),#fefe33,254,254,51
862,zaffre,Zaffre,#0014a8,0,20,168


In [117]:
color_list= []
for index, row in colors.iterrows():
    color_list.append(row['Air Force Blue (Raf)'])

In [118]:
products = pd.read_csv('products.csv',sep='\t')

In [119]:
product_list =[]
for index, row  in products.iterrows():
    for i in row[0].split():
        if i[-1]=='s':
            product_list.append(i[:-1])
        else:
            product_list.append(i)

In [120]:
product_list = set(product_list)
product_list = list(product_list)

### Feature Engineering

In [121]:

import nltk


def get_pos(word):
    tag = nltk.pos_tag([word])
    return tag[0][1]

def is_number(string):
    return any(char.isdigit() for char in string)

def word2features(sentence, idx):
    word_features = {}
    word_features['word_lowercase'] = sentence[idx].lower()

    ## Features looking at the neighbouring words:
    
    if idx > 0:
        word_features["pre_word"] = sentence[idx -1].lower()
    else:
        word_features["pre_word"] = ""
    if idx < len(sentence) - 1:
        word_features["next_word"] = sentence[idx +1].lower()
    else:
        word_features["next_word"] = ""
        
    if idx > 1:
        word_features["pre2_word"] = sentence[idx -2].lower()
    else:
        word_features["pre2_word"] = ""
        
    if idx < len(sentence) - 2:
        word_features["next2_word"] = sentence[idx +2].lower()
    else:
        word_features["next2_word"] = ""
    ## Features loking at the word endings
    
    if len(sentence[idx])> 2:
        word_features["last2char"] = sentence[idx][-2:]
    else:
        word_features["last2char"] = sentence[idx]
    
    if len(sentence[idx])> 3:
        word_features["last3char"] = sentence[idx][-3:]
    else:
        word_features["last3char"] = sentence[idx]
        
    ## Features considering the shape of the word
    
    if sentence[idx].isupper():
        word_features["upper"] = True
    else:
        word_features["upper"] = False  
        
    if sentence[idx].islower():
        word_features["lower"] = True
    else:
        word_features["lower"] = False 
    
    word_features["length"] = len(sentence[idx])
    word_features["position"] = idx
    
    
    ## Extra Features:
    
    ## Is Number
    word_features["number"] = is_number(sentence[idx])
    
#     if is_number(sentence[idx]) == True:
#         word_features["num_length"] = len(sentence[idx])
#     else:
#         word_features["num_length"] = 0
    
    ##Is_Noun
    
    if get_pos(sentence[idx])== "NN":
        word_features["is_noun"] = True
    else:
        word_features["is_noun"] = False
    

#     color Feature:
    
    if sentence[idx].lower() in color_list:
        word_features["color"] = True
    else:
        word_features["color"] = False
        
#     Product Feature:
    
    if sentence[idx].lower() in product_list:
        word_features["product"] = True
    else:
        word_features["product"] = False



        
    for i in range(len(sentence)):
        if i == idx and i != 0:
            word_features['first_word_not_in_title_case'] = sentence[idx].istitle()
        elif i == idx and i == 0:
            if sentence[idx].istitle():
                word_features['first_word_not_in_title_case'] = False
    
        
    return word_features
    
    
def sentence2features(sentence):
    return [word2features(sentence, idx) for idx in range(len(sentence))]

In [122]:
def prepare_ner_feature_dicts(sents):
    '''ner_files is a list of Ontonotes files with NER annotations. Returns feature dictionaries and 
    IOB tags for each token in the entire dataset'''
    all_dicts = []
    all_tags = []
    # your code here
    for tokens, tags in sents:
        all_dicts.append(sentence2features(tokens))
        all_tags.append(tags)

    return all_dicts, all_tags

In [123]:
train_dicts, train_tags = prepare_ner_feature_dicts(train_sents)
dev_dicts, dev_tags = prepare_ner_feature_dicts(dev_sents)

### CRF Model

In [124]:
import sklearn_crfsuite

crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=350,
    all_possible_transitions=True,
    verbose = True
)
crf.fit(train_dicts, train_tags)
try:
    call_produces_an_error()
except:
    pass

loading training data to CRFsuite: 100%|██████████| 266/266 [00:00<00:00, 3405.15it/s]



Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 1
0....1....2....3....4....5....6....7....8....9....10
Number of features: 7199
Seconds required: 0.021

L-BFGS optimization
c1: 0.100000
c2: 0.100000
num_memories: 6
max_iterations: 350
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

Iter 1   time=0.01  loss=4575.83  active=7172  feature_norm=0.50
Iter 2   time=0.00  loss=4094.20  active=6892  feature_norm=0.42
Iter 3   time=0.00  loss=3570.06  active=6871  feature_norm=0.37
Iter 4   time=0.00  loss=3436.77  active=7034  feature_norm=0.43
Iter 5   time=0.00  loss=3197.22  active=7078  feature_norm=0.58
Iter 6   time=0.00  loss=3048.60  active=6932  feature_norm=0.91
Iter 7   time=0.00  loss=2854.62  active=7049  feature_norm=0.94
Iter 8   time=0.00  loss=2744.74  active=7033  feature_norm=1.07
Iter 9   time=0.00  loss=2356.51  active=7009  feature_norm=1.88
Iter 10  time=0

### Results on the validation set

In [125]:
def flatten(l):
    result = []
    for sub in l:
        result.extend(sub)
    return result

y_pred = crf.predict(dev_dicts)
print(f1_score(flatten(dev_tags), flatten(y_pred), average='macro'))
print(f1_score(flatten(dev_tags), flatten(y_pred), average='micro'))
print(classification_report(flatten(dev_tags), flatten(y_pred)))

0.6785211515324803
0.9174468085106382
              precision    recall  f1-score   support

 I-ATTRIBUTE       0.84      0.78      0.81       246
    I-COLOUR       0.92      0.62      0.74        56
  I-MATERIAL       1.00      0.57      0.72        37
     I-PRICE       1.00      0.20      0.33        15
   I-PRODUCT       0.92      0.87      0.89       341
     I-SHAPE       1.00      0.36      0.53        11
      I-SIZE       0.96      0.96      0.96       184
   I-TEXTURE       1.00      0.08      0.15        12
           O       0.92      0.98      0.95      1448

    accuracy                           0.92      2350
   macro avg       0.95      0.60      0.68      2350
weighted avg       0.92      0.92      0.91      2350



### Tagging the Validation set

In [126]:
val_set= pd.read_csv('un-negated_clean_data.csv')

In [127]:
val_set

Unnamed: 0,Tokens,Tags,is_negative,sentence
0,I,O,False,0
1,am,O,False,0
2,looking,O,False,0
3,for,O,False,0
4,a,O,False,0
...,...,...,...,...
2345,30,B-SIZE,False,166
2346,inch,I-SIZE,False,166
2347,white,B-COLOUR,False,166
2348,desk,B-PRODUCT,False,166


In [128]:
token_list = val_set.Tokens.values.tolist()
sentence_list = val_set.sentence.values.tolist()


In [130]:
import csv
rows = zip(sentence_list,token_list,flatten(y_pred))
with open('Heyday_validationdata.csv', "w") as f:
    writer = csv.writer(f)
    writer.writerow(('Sentence_Index','Tokens','Predicted_NER_Tags'))
    for row in rows:
        writer.writerow(row)