## Generating the tagged training set using semi-supervised approach

### Imports

In [198]:
import pandas as pd
import os
import nltk
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import f1_score, classification_report
from bs4 import BeautifulSoup
from nltk.corpus import names,gazetteers 
from sklearn_crfsuite import CRF
from sklearn_crfsuite.metrics import flat_f1_score, flat_classification_report
import re

### Importing the validation set

In [199]:
data = pd.read_csv("./un-negated_clean_data.csv")

In [200]:
data

Unnamed: 0,Tokens,Tags,is_negative,sentence
0,I,O,False,0
1,am,O,False,0
2,looking,O,False,0
3,for,O,False,0
4,a,O,False,0
...,...,...,...,...
2345,30,B-SIZE,False,166
2346,inch,I-SIZE,False,166
2347,white,B-COLOUR,False,166
2348,desk,B-PRODUCT,False,166


### Data wrangling

In [201]:
data = data[["Tokens","Tags","sentence"]]

In [202]:
data

Unnamed: 0,Tokens,Tags,sentence
0,I,O,0
1,am,O,0
2,looking,O,0
3,for,O,0
4,a,O,0
...,...,...,...
2345,30,B-SIZE,166
2346,inch,I-SIZE,166
2347,white,B-COLOUR,166
2348,desk,B-PRODUCT,166


#### Converting the B- Tokens to I- Tokens

In [203]:
data = data.replace('B-','I-', regex=True)

In [204]:
data

Unnamed: 0,Tokens,Tags,sentence
0,I,O,0
1,am,O,0
2,looking,O,0
3,for,O,0
4,a,O,0
...,...,...,...
2345,30,I-SIZE,166
2346,inch,I-SIZE,166
2347,white,I-COLOUR,166
2348,desk,I-PRODUCT,166


In [205]:
data['Sentence']= data[['sentence','Tokens','Tags']].groupby(['sentence'])['Tokens'].transform(lambda x: ' '.join(x))

In [206]:
data['Tags']= data[['sentence','Tokens','Tags']].groupby(['sentence'])['Tags'].transform(lambda x: ','.join(x))

In [207]:
data = data[['Sentence','Tags']]

In [208]:
data = data.drop_duplicates().reset_index(drop=True)

In [209]:
data

Unnamed: 0,Sentence,Tags
0,I am looking for a black gloss 33 inch firecla...,"O,O,O,O,O,I-COLOUR,I-TEXTURE,I-SIZE,I-SIZE,I-P..."
1,Looking for pre workout Pump addict instead of...,"O,O,I-PRODUCT,I-PRODUCT,I-PRODUCT,I-PRODUCT,O,..."
2,i need a 48 inch glass sliding goof and a show...,"O,O,O,I-SIZE,I-SIZE,I-MATERIAL,I-PRODUCT,I-PRO..."
3,"Hello, do any of your free standing tubs have ...","O,O,O,O,O,I-ATTRIBUTE,I-ATTRIBUTE,I-PRODUCT,O,..."
4,I'm looking for a 24 inch white mirror that is...,"O,O,O,O,I-SIZE,I-SIZE,I-COLOUR,I-PRODUCT,O,O,I..."
...,...,...
162,What rectangular shower units are available?,"O,I-SHAPE,I-PRODUCT,I-PRODUCT,O,O"
163,I am looking for a 35 inch bathroom sink count...,"O,O,O,O,O,I-SIZE,I-SIZE,I-PRODUCT,I-PRODUCT,I-..."
164,I'm looking f or a Black Matte bathtub double ...,"O,O,O,O,O,I-COLOUR,I-TEXTURE,I-PRODUCT,I-PRODU..."
165,Need a 27 inch frameless shower door,"O,O,I-SIZE,I-SIZE,I-ATTRIBUTE,I-PRODUCT,I-PRODUCT"


### Splitting the data for a bit of validation

In [210]:
train_size = 0.95
train_dataset = data.sample(frac=train_size,random_state=200)
test_dataset = data.drop(train_dataset.index).reset_index(drop=True)

In [211]:
def strsplit_tags(tags):
    sent_list = tags.split(",")
    return sent_list

In [212]:
def strsplit_sentence(sentence):
    sent_list = sentence.split(" ")
    return sent_list

In [213]:
train_sents = []

for index, row in train_dataset.iterrows():
    train_sents.append((strsplit_sentence(row['Sentence']),strsplit_tags(row['Tags'])))


In [214]:
dev_sents = []

for index, row in test_dataset.iterrows():
    dev_sents.append((strsplit_sentence(row['Sentence']),strsplit_tags(row['Tags'])))

## Importing and wrangling knowledge bases to create features

In [215]:
colors = pd.read_csv("colors.csv")

In [216]:
colors

Unnamed: 0,air_force_blue_raf,Air Force Blue (Raf),#5d8aa8,93,138,168
0,air_force_blue_usaf,Air Force Blue (Usaf),#00308f,0,48,143
1,air_superiority_blue,Air Superiority Blue,#72a0c1,114,160,193
2,alabama_crimson,Alabama Crimson,#a32638,163,38,56
3,alice_blue,Alice Blue,#f0f8ff,240,248,255
4,alizarin_crimson,Alizarin Crimson,#e32636,227,38,54
...,...,...,...,...,...,...
859,yellow_orange,Yellow Orange,#ffae42,255,174,66
860,yellow_process,Yellow (Process),#ffef00,255,239,0
861,yellow_ryb,Yellow (Ryb),#fefe33,254,254,51
862,zaffre,Zaffre,#0014a8,0,20,168


In [217]:
color_list= []
for index, row in colors.iterrows():
    color_list.append(row['Air Force Blue (Raf)'])

In [218]:
products = pd.read_csv('products.csv',sep='\t')

In [219]:
product_list =[]
for index, row  in products.iterrows():
    for i in row[0].split():
        if i[-1]=='s':
            product_list.append(i[:-1])
        else:
            product_list.append(i)
            
        

In [220]:
product_list = set(product_list)

In [221]:
product_list = list(product_list)

### Feature Creation

In [222]:

import nltk


def get_pos(word):
    tag = nltk.pos_tag([word])
    return tag[0][1]

def is_number(string):
    return any(char.isdigit() for char in string)

def word2features(sentence, idx):
    word_features = {}
    word_features['word_lowercase'] = sentence[idx].lower()

    ## Features looking at the neighbouring words:
    
    if idx > 0:
        word_features["pre_word"] = sentence[idx -1].lower()
    else:
        word_features["pre_word"] = ""
    if idx < len(sentence) - 1:
        word_features["next_word"] = sentence[idx +1].lower()
    else:
        word_features["next_word"] = ""
        
    if idx > 1:
        word_features["pre2_word"] = sentence[idx -2].lower()
    else:
        word_features["pre2_word"] = ""
        
    if idx < len(sentence) - 2:
        word_features["next2_word"] = sentence[idx +2].lower()
    else:
        word_features["next2_word"] = ""
    ## Features loking at the word endings
    
    if len(sentence[idx])> 2:
        word_features["last2char"] = sentence[idx][-2:]
    else:
        word_features["last2char"] = sentence[idx]
    
    if len(sentence[idx])> 3:
        word_features["last3char"] = sentence[idx][-3:]
    else:
        word_features["last3char"] = sentence[idx]
        
    ## Features considering the shape of the word
    
    if sentence[idx].isupper():
        word_features["upper"] = True
    else:
        word_features["upper"] = False  
        
    if sentence[idx].islower():
        word_features["lower"] = True
    else:
        word_features["lower"] = False 
    
    word_features["length"] = len(sentence[idx])
    word_features["position"] = idx
    
    
    ## Extra Features:
    
    ## Is Number
    word_features["number"] = is_number(sentence[idx])
    
#     if is_number(sentence[idx]) == True:
#         word_features["num_length"] = len(sentence[idx])
#     else:
#         word_features["num_length"] = 0
    
    ##Is_Noun
    
    if get_pos(sentence[idx])== "NN":
        word_features["is_noun"] = True
    else:
        word_features["is_noun"] = False
    

#     color Feature:
    
    if sentence[idx].lower() in color_list:
        word_features["color"] = True
    else:
        word_features["color"] = False
        
#     Product Feature:
    
    if sentence[idx].lower() in product_list:
        word_features["product"] = True
    else:
        word_features["product"] = False



        
    for i in range(len(sentence)):
        if i == idx and i != 0:
            word_features['first_word_not_in_title_case'] = sentence[idx].istitle()
        elif i == idx and i == 0:
            if sentence[idx].istitle():
                word_features['first_word_not_in_title_case'] = False
    
        
    return word_features
    
    
def sentence2features(sentence):
    return [word2features(sentence, idx) for idx in range(len(sentence))]

In [223]:
def prepare_ner_feature_dicts(sents):
    '''ner_files is a list of Ontonotes files with NER annotations. Returns feature dictionaries and 
    IOB tags for each token in the entire dataset'''
    all_dicts = []
    all_tags = []
    # your code here
    for tokens, tags in sents:
        all_dicts.append(sentence2features(tokens))
        all_tags.append(tags)

    return all_dicts, all_tags

In [224]:
train_dicts, train_tags = prepare_ner_feature_dicts(train_sents)
dev_dicts, dev_tags = prepare_ner_feature_dicts(dev_sents)

### Training the CRF model

In [225]:
import sklearn_crfsuite

crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=350,
    all_possible_transitions=True,
    verbose = True
)
crf.fit(train_dicts, train_tags)
try:
    call_produces_an_error()
except:
    pass

loading training data to CRFsuite: 100%|██████████| 159/159 [00:00<00:00, 3302.37it/s]



Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 1
0....1....2....3....4....5....6....7....8....9....10
Number of features: 5617
Seconds required: 0.015

L-BFGS optimization
c1: 0.100000
c2: 0.100000
num_memories: 6
max_iterations: 350
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

Iter 1   time=0.01  loss=4783.66  active=5575  feature_norm=0.50
Iter 2   time=0.00  loss=3769.24  active=5394  feature_norm=0.36
Iter 3   time=0.00  loss=3447.56  active=5395  feature_norm=0.30
Iter 4   time=0.00  loss=3328.91  active=5525  feature_norm=0.34
Iter 5   time=0.00  loss=3278.86  active=5537  feature_norm=0.35
Iter 6   time=0.00  loss=3218.45  active=5521  feature_norm=0.40
Iter 7   time=0.00  loss=3148.81  active=5522  feature_norm=0.47
Iter 8   time=0.00  loss=3035.48  active=5462  feature_norm=0.76
Iter 9   time=0.00  loss=2915.12  active=5512  feature_norm=0.85
Iter 10  time=0

In [226]:
def flatten(l):
    result = []
    for sub in l:
        result.extend(sub)
    return result

y_pred = crf.predict(dev_dicts)
print(f1_score(flatten(dev_tags), flatten(y_pred), average='macro'))
print(f1_score(flatten(dev_tags), flatten(y_pred), average='micro'))
print(classification_report(flatten(dev_tags), flatten(y_pred)))

0.5378450817475208
0.79375
              precision    recall  f1-score   support

 I-ATTRIBUTE       0.13      0.40      0.20         5
    I-COLOUR       1.00      0.11      0.20         9
  I-MATERIAL       1.00      1.00      1.00         1
     I-PRICE       0.00      0.00      0.00         1
   I-PRODUCT       0.87      0.50      0.63        26
      I-SIZE       0.71      1.00      0.83         5
           O       0.87      0.93      0.90       113

    accuracy                           0.79       160
   macro avg       0.65      0.56      0.54       160
weighted avg       0.84      0.79      0.79       160



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### Tagging and Generating the training set

In [227]:
from re import search, sub
    
def sentence2iob(sentence):
    global count
    '''Input sentence is a string from the Ontonotes corpus, with xml tags indicating named entities
    output is a list of tokens and a list of NER IOB-tags corresponding to those tokens'''
    
    tokens=[]
    tags=[]

    soup = BeautifulSoup(sentence, "lxml")
    if soup.find('doc'):
        return tokens, tags
    
    if not soup.find('enamex'):
        words = soup.getText().split()
        for word in words:
            tokens.append(word)
            tags.append('O')

    else:
        for content in soup.find(soup.enamex.parent.name).contents:
            if content.name == 'enamex':
                tag = content['type']
                entity = content.getText().split()

                for i in range(len(entity)):
                    if i == 0:
                        tags.append("B-" + tag)
                        tokens.append(entity[i])
                    else:
                        tags.append("I-" + tag)
                        tokens.append(entity[i])
            
            else:
                for word in content.split():
                    tokens.append(word)
                    tags.append('O')

    return tokens, tags

In [228]:
test_sents = []
with open('Hey_train.txt', encoding="utf-8") as f:
    for sentence in f:
        curr_tokens, curr_tags = sentence2iob(sentence)
        assert "" not in curr_tokens # if you have empty strings, you've done something wrong
        test_sents.append((curr_tokens, curr_tags))
            
test_dicts, test_tags = prepare_ner_feature_dicts(test_sents)
test_pred = crf.predict(test_dicts)
test_pred = flatten(test_pred)

In [229]:
words =[]
sentence_index =[]
sent_count =0
for i in test_sents:
    for j in i[0]:
        words.append(j)
        sentence_index.append(sent_count)
    sent_count += 1
        

In [230]:
import csv
rows = zip(sentence_index,words, test_pred)
with open('Heyday_trainingdata.csv', "w") as f:
    writer = csv.writer(f)
    writer.writerow(('Sentence_Index','Tokens','Tags'))
    for row in rows:
        writer.writerow(row)