In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction import DictVectorizer
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
import gensim 
import numpy as np
import pandas as pd
import csv
import sys
import os
from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.svm import SVC
os.chdir(r'C:\Users\anaverageone\htlt_env\Machine_Learning')
print(os.getcwd())

C:\Users\anaverageone\htlt_env\Machine_Learning


In [17]:
word_embedding_model = gensim.models.KeyedVectors.load_word2vec_format('models/GoogleNews-vectors-negative300.bin.gz', binary=True)  

In [8]:
train_file= 'data/conll2003.train.conll'
test_file = 'data/conll2003.test.conll'
with open(train_file) as f:
    train_cap = f.read() + '\n'
with open(test_file) as f:
    test_cap = f.read() + '\n'
train_text, test_text = open("data/conll_train_cap.txt", "w"), open("data/conll_test_cap.txt", "w")
train_text.write(train_cap)
test_text.write(test_cap)
train_text.close()
test_text.close()

In [10]:
trainfile = 'data/conll_train_cap.txt'
testfile = 'data/conll_test_cap.txt'

def extract_features_token_only_and_labels(conllfile):
    '''Function that extracts features and gold label from preprocessed conll (here: tokens only).
    
    :param conllfile: path to the (preprocessed) conll file
    :type conllfile: string
    
    
    :return features: a list of dictionaries, with key-value pair providing the value for the feature `token' for individual instances
    :return labels: a list of gold labels of individual instances
    '''
    
    features = []
    labels = []
    conllinput = open(conllfile, 'r')
    
    #delimiter indicates we are working with a tab separated value (default is comma)
    #quotechar has as default value '"', which is used to indicate the borders of a cell containing longer pieces of text
    #in this file, we have only one token as text, but this token can be '"', which then messes up the format. We set quotechar to a character that does not occur in our file
    csvreader = csv.reader(conllinput, delimiter='\t',quotechar='|')
    for row in csvreader:
        #I preprocessed the file so that all rows with instances should contain 6 values, the others are empty lines indicating the beginning of a sentence
        if len(row) == 4:
            #structuring feature value pairs as key-value pairs in a dictionary
            #the first column in the conll file represents tokens
            feature_value = {'Token': row[0]}
            features.append(feature_value)
            #The last column provides the gold label (= the correct answer). 
            labels.append(row[-1])
    
    return features, labels


def create_vectorizer_and_classifier(features, labels):
    '''
    Function that takes feature-value pairs and gold labels as input and trains a logistic regression classifier
    
    :param features: feature-value pairs
    :param labels: gold labels
    :type features: a list of dictionaries
    :type labels: a list of strings
    
    :return lr_classifier: a trained LogisticRegression classifier
    :return vec: a DictVectorizer to which the feature values are fitted. 
    '''
    
    vec = DictVectorizer()
    #fit creates a mapping between observed feature values and dimensions in a one-hot vector, transform represents the current values as a vector 
    tokens_vectorized = vec.fit_transform(features)
    lr_classifier = LogisticRegression(solver='saga')
    lr_classifier.fit(tokens_vectorized, labels)
    
    return lr_classifier, vec


In [12]:
#extract features and labels:
feature_values, labels = extract_features_token_only_and_labels(trainfile) 
#create vectorizer and trained classifier:
lr_classifier, vectorizer = create_vectorizer_and_classifier(feature_values, labels)

In [13]:
def get_predicted_and_gold_labels_token_only(testfile, vectorizer, classifier):
    '''
    Function that extracts features and runs classifier on a test file returning predicted and gold labels
    
    :param testfile: path to the (preprocessed) test file
    :param vectorizer: vectorizer in which the mapping between feature values and dimensions is stored
    :param classifier: the trained classifier
    :type testfile: string
    :type vectorizer: DictVectorizer
    :type classifier: LogisticRegression()
    :return predictions: list of output labels provided by the classifier on the test file
    :return goldlabels: list of gold labels as included in the test file
    '''
     
    #we use the same function as above (guarantees features have the same name and form)
    sparse_feature_reps, goldlabels = extract_features_token_only_and_labels(testfile)
    #we need to use the same fitting as before, so now we only transform the current features according to this mapping (using only transform)
    test_features_vectorized = vectorizer.transform(sparse_feature_reps)
    predictions = classifier.predict(test_features_vectorized)
    
    return predictions, goldlabels

In [14]:
def print_confusion_matrix(predictions, goldlabels):
    '''
    Function that prints out a confusion matrix
    
    :param predictions: predicted labels
    :param goldlabels: gold standard labels
    :type predictions, goldlabels: list of strings
    '''
    #based on example from https://datatofish.com/confusion-matrix-python/ 
    data = {'Gold':    goldlabels, 'Predicted': predictions    }
    df = pd.DataFrame(data, columns=['Gold','Predicted'])

    confusion_matrix = pd.crosstab(df['Gold'], df['Predicted'], rownames=['Gold'], colnames=['Predicted'])
    print (confusion_matrix)

In [15]:
def print_precision_recall_fscore(predictions, goldlabels):
    '''
    Function that prints out precision, recall and f-score
    
    :param predictions: predicted output by classifier
    :param goldlabels: original gold labels
    :type predictions, goldlabels: list of strings
    '''
    
    precision = metrics.precision_score(y_true=goldlabels,
                        y_pred=predictions,
                        average='macro')

    recall = metrics.recall_score(y_true=goldlabels,
                     y_pred=predictions,
                     average='macro')


    fscore = metrics.f1_score(y_true=goldlabels,
                 y_pred=predictions,
                 average='macro')

    print('P:', precision, 'R:', recall, 'F1:', fscore)
    
#vectorizer and lr_classifier are the vectorizer and classifiers created in the previous cell.
#it is important that the same vectorizer is used for both training and testing: they should use the same mapping from values to dimensions


**Using word embeddings**

In [19]:
# word_embedding_model = gensim.models.KeyedVectors.load_word2vec_format('../models/GoogleNews-vectors-negative300.bin.gz', binary=True)  

def extract_embeddings_as_features_and_gold(conllfile,word_embedding_model):
    '''
    Function that extracts features and gold labels using word embeddings
    
    :param conllfile: path to conll file
    :param word_embedding_model: a pretrained word embedding model
    :type conllfile: string
    :type word_embedding_model: gensim.models.keyedvectors.Word2VecKeyedVectors
    
    :return features: list of vector representation of tokens
    :return labels: list of gold labels
    '''
    labels = []
    features = []
    conllinput = open(conllfile, 'r')
    csvreader = csv.reader(conllinput, delimiter='\t',quotechar='|')
    for row in csvreader:
        if len(row) == 4:
            if row[0] in word_embedding_model:
                vector = word_embedding_model[row[0]]
            else:
                vector = [0]*300
            features.append(vector)
            labels.append(row[-1])
    return features, labels

def create_classifier(features, labels):
    '''
    Function that creates classifier from features represented as vectors and gold labels
    
    :param features: list of vector representations of tokens
    :param labels: list of gold labels
    :type features: list of vectors
    :type labels: list of strings
    
    :returns trained logistic regression classifier
    '''
    
    
    lr_classifier = LogisticRegression(solver='saga')
    lr_classifier.fit(features, labels)
    
    return lr_classifier

    
def label_data_using_word_embeddings(testfile, word_embedding_model, classifier):
    '''
    Function that extracts word embeddings as features and gold labels from test data and runs a classifier
    
    :param testfile: path to test file
    :param word_embedding_model: distributional semantic model
    :param classifier: trained classifier
    :type testfile: string
    :type word_embedding_model: gensim.models.keyedvectors.Word2VecKeyedVectors
    :type classifier: LogisticRegression
    
    :return predictions: list of predicted labels
    :return labels: list of gold labels
    '''
    
    dense_feature_representations, labels = extract_embeddings_as_features_and_gold(testfile,word_embedding_model)
    predictions = classifier.predict(dense_feature_representations)
    
    return predictions, labels


In [21]:
# I printing announcements of where the code is at (since some of these steps take a while)
print('Extracting dense features...')
dense_feature_representations, labels = extract_embeddings_as_features_and_gold(trainfile,word_embedding_model)
print('Training classifier....')
classifier = create_classifier(dense_feature_representations, labels)
print('Running evaluation...')
predictions, gold = label_data_using_word_embeddings(testfile, word_embedding_model, classifier)
# print_confusion_matrix(predictions, goldlabels)
# print_precision_recall_fscore(predicted, gold)

Extracting dense features...
Training classifier....
Running evaluation...


In [22]:
report = classification_report(gold, predictions, zero_division=0)
print (report)

              precision    recall  f1-score   support

       B-LOC       0.77      0.82      0.79      1668
      B-MISC       0.71      0.69      0.70       702
       B-ORG       0.69      0.66      0.67      1661
       B-PER       0.74      0.67      0.70      1617
       I-LOC       0.54      0.48      0.51       257
      I-MISC       0.55      0.54      0.54       216
       I-ORG       0.51      0.39      0.44       835
       I-PER       0.57      0.49      0.53      1156
           O       0.98      0.99      0.98     38323

    accuracy                           0.93     46435
   macro avg       0.67      0.63      0.65     46435
weighted avg       0.92      0.93      0.93     46435



**Including preceding token**

In [23]:
def extract_embeddings_of_current_and_preceding_as_features_and_gold(conllfile,word_embedding_model):
    '''
    Function that extracts features and gold labels using word embeddings for current and preceding token
    
    :param conllfile: path to conll file
    :param word_embedding_model: a pretrained word embedding model
    :type conllfile: string
    :type word_embedding_model: gensim.models.keyedvectors.Word2VecKeyedVectors
    
    :return features: list of vector representation of tokens
    :return labels: list of gold labels
    '''
    labels = []
    features = []
    
    conllinput = open(conllfile, 'r')
    csvreader = csv.reader(conllinput, delimiter='\t',quotechar='|')
    for row in csvreader:
        if len(row) == 4:
            if row[0] in word_embedding_model:
                vector1 = word_embedding_model[row[0]]
            else:
                vector1 = [0]*300
            if row[1] in word_embedding_model:
                vector2 = word_embedding_model[row[1]]
            else:
                vector2 = [0]*300
            features.append(np.concatenate((vector1,vector2)))
            labels.append(row[-1])
    return features, labels
    
    
def label_data_using_word_embeddings_current_and_preceding(testfile, word_embedding_model, classifier):
    '''
    Function that extracts word embeddings as features (of current and preceding token) and gold labels from test data and runs a trained classifier
    
    :param testfile: path to test file
    :param word_embedding_model: distributional semantic model
    :param classifier: trained classifier
    :type testfile: string
    :type word_embedding_model: gensim.models.keyedvectors.Word2VecKeyedVectors
    :type classifier: LogisticRegression
    
    :return predictions: list of predicted labels
    :return labels: list of gold labels
    '''
    
    features, labels = extract_embeddings_of_current_and_preceding_as_features_and_gold(testfile,word_embedding_model)
    predictions = classifier.predict(features)
    
    return predictions, labels


In [24]:
print('Extracting dense features...')
features, labels = extract_embeddings_of_current_and_preceding_as_features_and_gold(trainfile,word_embedding_model)
print('Training classifier...')
#we can use the same function as for just the tokens itself
classifier = create_classifier(features, labels)
print('Running evaluation...')
predicted, gold = label_data_using_word_embeddings_current_and_preceding(testfile, word_embedding_model, classifier)
# print_confusion_matrix(predictions, goldlabels)
# print_precision_recall_fscore(predicted, gold)

Extracting dense features...
Training classifier...
Running evaluation...


In [25]:
report = classification_report(gold, predictions, zero_division=0)
print (report)

              precision    recall  f1-score   support

       B-LOC       0.77      0.82      0.79      1668
      B-MISC       0.71      0.69      0.70       702
       B-ORG       0.69      0.66      0.67      1661
       B-PER       0.74      0.67      0.70      1617
       I-LOC       0.54      0.48      0.51       257
      I-MISC       0.55      0.54      0.54       216
       I-ORG       0.51      0.39      0.44       835
       I-PER       0.57      0.49      0.53      1156
           O       0.98      0.99      0.98     38323

    accuracy                           0.93     46435
   macro avg       0.67      0.63      0.65     46435
weighted avg       0.92      0.93      0.93     46435



**A mixed system**

In [30]:
# selected_features = ['pos','chunk']
feature_to_index = {'Token': 0, 'pos': 1, 'chunk': 2}

def extract_word_embedding(token, word_embedding_model):
    '''
    Function that returns the word embedding for a given token out of a distributional semantic model and a 300-dimension vector of 0s otherwise
    
    :param token: the token
    :param word_embedding_model: the distributional semantic model
    :type token: string
    :type word_embedding_model: gensim.models.keyedvectors.Word2VecKeyedVectors
    
    :returns a vector representation of the token
    '''
    if token in word_embedding_model:
        vector = word_embedding_model[token]
    else:
        vector = [0]*300
    return vector

def extract_feature_values(row, selected_features):
    '''
    Function that extracts feature value pairs from row
    
    :param row: row from conll file
    :param selected_features: list of selected features
    :type row: string
    :type selected_features: list of strings
    
    :returns: dictionary of feature value pairs
    '''
    feature_values = {}
    for feature_name in selected_features:
        r_index = feature_to_index.get(feature_name)
        feature_values[feature_name] = row[r_index]
        
    return feature_values
    
    
def create_vectorizer_traditional_features(feature_values):
    '''
    Function that creates vectorizer for set of feature values
    
    :param feature_values: list of dictionaries containing feature-value pairs
    :type feature_values: list of dictionairies (key and values are strings)
    
    :returns: vectorizer with feature values fitted
    '''
    vectorizer = DictVectorizer()
    vectorizer.fit(feature_values)
    
    return vectorizer
def combine_sparse_and_dense_features(dense_vectors, sparse_features):
    '''
    Function that takes sparse and dense feature representations and appends their vector representation
    
    :param dense_vectors: list of dense vector representations
    :param sparse_features: list of sparse vector representations
    :type dense_vector: list of arrays
    :type sparse_features: list of lists
    
    :returns: list of arrays in which sparse and dense vectors are concatenated
    '''
    
    combined_vectors = []
    sparse_vectors = np.array(sparse_features.toarray())
    
    for index, vector in enumerate(sparse_vectors):
        combined_vector = np.concatenate((vector,dense_vectors[index]))
        combined_vectors.append(combined_vector)
    return combined_vectors
    

def extract_traditional_features_and_embeddings_plus_gold_labels(conllfile, word_embedding_model, vectorizer=None):
    '''
    Function that extracts traditional features as well as embeddings and gold labels using word embeddings for current and preceding token
    
    :param conllfile: path to conll file
    :param word_embedding_model: a pretrained word embedding model
    :type conllfile: string
    :type word_embedding_model: gensim.models.keyedvectors.Word2VecKeyedVectors
    
    :return features: list of vector representation of tokens
    :return labels: list of gold labels
    '''
    labels = []
    dense_vectors = []
    traditional_features = []
    
    conllinput = open(conllfile, 'r')
    csvreader = csv.reader(conllinput, delimiter='\t',quotechar='|')
    for row in csvreader:
        if len(row) == 4:
            token_vector = extract_word_embedding(row[0], word_embedding_model)
            pt_vector = extract_word_embedding(row[1], word_embedding_model)
            dense_vectors.append(np.concatenate((token_vector,pt_vector)))
            #mixing very sparse representations (for one-hot tokens) and dense representations is a bad idea
            #we thus only use other features with limited values
            other_features = extract_feature_values(row, ['pos','chunk'])
            traditional_features.append(other_features)
            #adding gold label to labels
            labels.append(row[-1])
            
    #create vector representation of traditional features
    if vectorizer is None:
        #creates vectorizer that provides mapping (only if not created earlier)
        vectorizer = create_vectorizer_traditional_features(traditional_features)
    sparse_features = vectorizer.transform(traditional_features)
    combined_vectors = combine_sparse_and_dense_features(dense_vectors, sparse_features)
    
    return combined_vectors, vectorizer, labels

def label_data_with_combined_features(testfile, classifier, vectorizer, word_embedding_model):
    '''
    Function that labels data with model using both sparse and dense features
    '''
    feature_vectors, vectorizer, goldlabels = extract_traditional_features_and_embeddings_plus_gold_labels(testfile, word_embedding_model, vectorizer)
    predictions = classifier.predict(feature_vectors)
    
    return predictions, goldlabels

In [31]:
print('Extracting Features...')
feature_vectors, vectorizer, gold_labels = extract_traditional_features_and_embeddings_plus_gold_labels(trainfile, word_embedding_model)

Extracting Features...


In [32]:
print('Training classifier....')
lr_classifier = create_classifier(feature_vectors, gold_labels)
print('Running the evaluation...')
predictions, goldlabels = label_data_with_combined_features(testfile, lr_classifier, vectorizer, word_embedding_model)
# print_confusion_matrix(predictions, goldlabels)
# print_precision_recall_fscore(predictions, goldlabels)

Training classifier....




Running the evaluation...


In [33]:
report = classification_report(goldlabels, predictions, zero_division=0)
print (report)

              precision    recall  f1-score   support

       B-LOC       0.77      0.81      0.79      1668
      B-MISC       0.71      0.69      0.70       702
       B-ORG       0.70      0.65      0.67      1661
       B-PER       0.86      0.78      0.82      1617
       I-LOC       0.51      0.47      0.49       257
      I-MISC       0.56      0.58      0.57       216
       I-ORG       0.57      0.48      0.52       835
       I-PER       0.71      0.81      0.76      1156
           O       0.98      0.99      0.99     38323

    accuracy                           0.94     46435
   macro avg       0.71      0.70      0.70     46435
weighted avg       0.94      0.94      0.94     46435

