In [15]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction import DictVectorizer
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
import gensim 
import numpy as np
import pandas as pd
import csv
import sys
import os
from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.svm import SVC
os.chdir(r'C:\Users\anaverageone\htlt_env\Machine_Learning')

In [17]:
train_file= 'data/conll2003.train.conll'
dev_file = 'data/conll2003.dev.conll'
test_file = 'data/conll2003.test.conll'

In [8]:
word_embedding_model = gensim.models.KeyedVectors.load_word2vec_format('models/GoogleNews-vectors-negative300.bin.gz', binary=True)  

**Using word embeddings**

In [48]:
def extract_embeddings_as_features_and_gold(conllfile,word_embedding_model):
    '''
    Function that extracts features and gold labels using word embeddings
    
    :param conllfile: path to conll file
    :param word_embedding_model: a pretrained word embedding model
    :type conllfile: string
    :type word_embedding_model: gensim.models.keyedvectors.Word2VecKeyedVectors
    
    :return features: list of vector representation of tokens
    :return labels: list of gold labels
    '''
    labels = []
    features = []
    
    conllinput = open(conllfile, 'r')
    csvreader = csv.reader(conllinput, delimiter='\t',quotechar='|')
    for row in csvreader:
        if len(row) == 4:
            if row[0] in word_embedding_model:
                vector = word_embedding_model[row[0]]
            else:
                vector = [0]*300
            features.append(vector)
            labels.append(row[-1])
    return features, labels

def create_classifier(features, labels):
    '''
    Function that creates classifier from features represented as vectors and gold labels
    
    :param features: list of vector representations of tokens
    :param labels: list of gold labels
    :type features: list of vectors
    :type labels: list of strings
    
    :returns trained logistic regression classifier
    '''
    
    lr_classifier = LogisticRegression(max_iter=10000)
    # lr_classifier = LogisticRegression(solver='saga', max_iter=10000)
    lr_classifier.fit(features, labels)
    
    return lr_classifier

    
def label_data_using_word_embeddings(testfile, word_embedding_model, classifier):
    '''
    Function that extracts word embeddings as features and gold labels from test data and runs a classifier
    
    :param testfile: path to test file
    :param word_embedding_model: distributional semantic model
    :param classifier: trained classifier
    :type testfile: string
    :type word_embedding_model: gensim.models.keyedvectors.Word2VecKeyedVectors
    :type classifier: LogisticRegression
    
    :return predictions: list of predicted labels
    :return labels: list of gold labels
    '''
    
    dense_feature_representations, labels = extract_embeddings_as_features_and_gold(testfile,word_embedding_model)
    predictions = classifier.predict(dense_feature_representations)
    
    return predictions, labels

In [None]:
# printing announcements of where the code is at (since some of these steps take a while)
print('Extracting dense features...')
dense_feature_representations, labels = extract_embeddings_as_features_and_gold(train_file,word_embedding_model)
print('Training classifier....')
classifier = create_classifier(dense_feature_representations, labels)
print('Classifying data...')
predicted, gold = label_data_using_word_embeddings(dev_file, word_embedding_model, classifier)
print('Running evaluation...')
print('CONFUSION MATRIX')
cf_matrix = confusion_matrix(gold, predicted)
print(cf_matrix)
print('CLASSIFICATION REPORT')
report = classification_report(gold,predicted,digits = 8)
print(report)

Extracting dense features...
Training classifier....
Classifying data...
Running evaluation...
CONFUSION MATRIX
[[ 1516    17   128    12    26     0    23    23    92]
 [   51   650    33    21     4    16    14     2   131]
 [  122    36   885    31     8     4    77    46   132]
 [   29    10    20  1308     2     1     5   351   116]
 [   30     0     7     7   140     4    36    10    23]
 [    3    38    12     3    10   152     9    12   107]
 [   46    14    96    26    39    14   311    19   186]
 [   19     5    29   262     1     3    15   755   218]
 [    9    41    74     5     5    17    41     5 42562]]
CLASSIFICATION REPORT
              precision    recall  f1-score   support

       B-LOC  0.83068493 0.82525857 0.82796286      1837
      B-MISC  0.80147965 0.70498915 0.75014426       922
       B-ORG  0.68925234 0.65995526 0.67428571      1341
       B-PER  0.78089552 0.71009772 0.74381575      1842
       I-LOC  0.59574468 0.54474708 0.56910569       257
      I-MISC

**Including preceding token**

In [19]:
def extract_embeddings_of_current_and_preceding_as_features_and_gold(conllfile,word_embedding_model):
    '''
    Function that extracts features and gold labels using word embeddings for current and preceding token
    
    :param conllfile: path to conll file
    :param word_embedding_model: a pretrained word embedding model
    :type conllfile: string
    :type word_embedding_model: gensim.models.keyedvectors.Word2VecKeyedVectors
    
    :return features: list of vector representation of tokens
    :return labels: list of gold labels
    '''
    labels = []
    features = []
    
    conllinput = open(conllfile, 'r')
    csvreader = csv.reader(conllinput, delimiter='\t',quotechar='|')
    for row in csvreader:
        if len(row) == 4:
            if row[0] in word_embedding_model:
                vector1 = word_embedding_model[row[0]]
            else:
                vector1 = [0]*300
            if row[1] in word_embedding_model:
                vector2 = word_embedding_model[row[1]]
            else:
                vector2 = [0]*300
            features.append(np.concatenate((vector1,vector2)))
            labels.append(  row[-1])
    return features, labels

def label_data_using_word_embeddings_current_and_preceding(testfile, word_embedding_model, classifier):
    '''
    Function that extracts word embeddings as features (of current and preceding token) and gold labels from test data and runs a trained classifier
    
    :param testfile: path to test file
    :param word_embedding_model: distributional semantic model
    :param classifier: trained classifier
    :type testfile: string
    :type word_embedding_model: gensim.models.keyedvectors.Word2VecKeyedVectors
    :type classifier: LogisticRegression
    
    :return predictions: list of predicted labels
    :return labels: list of gold labels
    '''
    
    features, labels = extract_embeddings_of_current_and_preceding_as_features_and_gold(testfile,word_embedding_model)
    predictions = classifier.predict(features)
    
    return predictions, labels

In [19]:
print('Extracting dense features...')
features, labels = extract_embeddings_of_current_and_preceding_as_features_and_gold(train_file,word_embedding_model)
print('Training classifier...')
#we can use the same function as for just the tokens itself
classifier = create_classifier(features, labels)
print('Classifying data...')
predicted, gold = label_data_using_word_embeddings_current_and_preceding(dev_file, word_embedding_model, classifier)
print('Running evaluation...')
print('CONFUSION MATRIX')
cf_matrix = confusion_matrix(gold, predicted)
print(cf_matrix)
print('CLASSIFICATION REPORT')
report = classification_report(gold,predicted,digits = 8)
print(report)

Extracting dense features...
Training classifier...
Classifying data...
Running evaluation...
CONFUSION MATRIX
[[ 1516    21   128    11    27     0    31    26    77]
 [   41   668    29    16     4    16    18     9   121]
 [  117    36   892    35    14     9    84    47   107]
 [   30     8    22  1353     2     1     5   315   106]
 [   31     0     7     8   144     2    35     8    22]
 [    5    39    13     3    10   156    10    12    98]
 [   49    15   102    25    45    14   307    19   175]
 [   18     4    30   253     1     3    12   786   200]
 [    7    37    56     5     4    17    28     8 42597]]
CLASSIFICATION REPORT
              precision    recall  f1-score   support

       B-LOC  0.83572216 0.82525857 0.83045741      1837
      B-MISC  0.80676329 0.72451193 0.76342857       922
       B-ORG  0.69741986 0.66517524 0.68091603      1341
       B-PER  0.79169105 0.73452769 0.76203886      1842
       I-LOC  0.57370518 0.56031128 0.56692913       257
      I-MISC 

## A mixed system

The code below combines traditional features with word embeddings. Note that we only include features with a limited range of possible values. Combining one-hot token representations (using highly sparse dimensions) with dense representations is generally not a good idea.

In [54]:
def extract_word_embedding(token, word_embedding_model):
    '''
    Function that returns the word embedding for a given token out of a distributional semantic model and a 300-dimension vector of 0s otherwise
    
    :param token: the token
    :param word_embedding_model: the distributional semantic model
    :type token: string
    :type word_embedding_model: gensim.models.keyedvectors.Word2VecKeyedVectors
    
    :returns a vector representation of the token
    '''
    if token in word_embedding_model:
        vector = word_embedding_model[token]
    else:
        vector = [0]*300
    return vector


def extract_feature_values(row, selected_features):
    '''
    Function that extracts feature value pairs from row
    
    :param row: row from conll file
    :param selected_features: list of selected features
    :type row: string
    :type selected_features: list of strings
    
    :returns: dictionary of feature value pairs
    '''
    feature_values = {}
    for feature_name in selected_features:
        r_index = feature_to_index.get(feature_name)
        feature_values[feature_name] = row[r_index]
        
    return feature_values
    
    
def create_vectorizer_traditional_features(feature_values):
    '''
    Function that creates vectorizer for set of feature values
    
    :param feature_values: list of dictionaries containing feature-value pairs
    :type feature_values: list of dictionairies (key and values are strings)
    
    :returns: vectorizer with feature values fitted
    '''
    vectorizer = DictVectorizer()
    vectorizer.fit(feature_values)
    
    return vectorizer
        
    
def combine_sparse_and_dense_features(dense_vectors, sparse_features):
    '''
    Function that takes sparse and dense feature representations and appends their vector representation
    
    :param dense_vectors: list of dense vector representations
    :param sparse_features: list of sparse vector representations
    :type dense_vector: list of arrays
    :type sparse_features: list of lists
    
    :returns: list of arrays in which sparse and dense vectors are concatenated
    '''
    
    combined_vectors = []
    sparse_vectors = np.array(sparse_features.toarray())
    sparse_vectors.reshape(-1, 1)
    
    for index, vector in enumerate(sparse_vectors):
        combined_vector = np.concatenate((vector,dense_vectors[index]))
        combined_vectors.append(combined_vector)
    return combined_vectors
    

def extract_traditional_features_and_embeddings_plus_gold_labels(conllfile, word_embedding_model, vectorizer=None):
    '''
    Function that extracts traditional features as well as embeddings and gold labels using word embeddings for current and preceding token
    
    :param conllfile: path to conll file
    :param word_embedding_model: a pretrained word embedding model
    :type conllfile: string
    :type word_embedding_model: gensim.models.keyedvectors.Word2VecKeyedVectors
    
    :return features: list of vector representation of tokens
    :return labels: list of gold labels
    '''
    labels = []
    dense_vectors = []
    traditional_features = []
    
    conllinput = open(conllfile, 'r')
    csvreader = csv.reader(conllinput, delimiter='\t',quotechar='|')
    for row in csvreader:
        if len(row) == 4:
            token_vector = extract_word_embedding(row[0], word_embedding_model)
            pt_vector = extract_word_embedding(row[1], word_embedding_model)
            dense_vectors.append(np.concatenate((token_vector,pt_vector)))
            #mixing very sparse representations (for one-hot tokens) and dense representations is a bad idea
            #we thus only use other features with limited values
            other_features = extract_feature_values(row, ['pos','chunk'])
            traditional_features.append(other_features)
            #adding gold label to labels
            labels.append(row[-1])
            
    #create vector representation of traditional features
    if vectorizer is None:
        #creates vectorizer that provides mapping (only if not created earlier)
        vectorizer = create_vectorizer_traditional_features(traditional_features)
    sparse_features = vectorizer.transform(traditional_features)
    combined_vectors = combine_sparse_and_dense_features(dense_vectors, sparse_features)
    
    return combined_vectors, vectorizer, labels

def label_data_with_combined_features(testfile, classifier, vectorizer, word_embedding_model):
    '''
    Function that labels data with model using both sparse and dense features
    '''
    feature_vectors, vectorizer, goldlabels = extract_traditional_features_and_embeddings_plus_gold_labels(testfile, word_embedding_model, vectorizer)
    predictions = classifier.predict(feature_vectors)
    
    return predictions, goldlabels
feature_to_index = {'token': 0, 'pos': 1, 'chunk': 2}

In [51]:
print('Extracting Features...')
feature_vectors, vectorizer, gold_labels = extract_traditional_features_and_embeddings_plus_gold_labels(train_file, word_embedding_model)

Extracting Features...


In [52]:
print(len(feature_vectors))
print(len(gold_labels))

203621
203621


In [53]:
print('Training classifier....')
lr_classifier = create_classifier(feature_vectors, gold_labels)

Training classifier....


In [55]:
print('Classifying data...')
predicted, gold = label_data_using_word_embeddings_current_and_preceding(dev_file, word_embedding_model, lr_classifier)

Classifying data...


ValueError: Expected 2D array, got 1D array instead:
array=[].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

In [None]:
print('Running evaluation...')
print('CONFUSION MATRIX')
cf_matrix = confusion_matrix(gold, predicted)
print(cf_matrix)
print('CLASSIFICATION REPORT')
report = classification_report(gold,predicted,digits = 8)
print(report)