In [None]:
# dependencies for BERT model
import torch
from transformers import BertTokenizer, BertModel
import numpy as np
import pandas as pd
import os
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.corpus import wordnet
from nltk import pos_tag
from sklearn.feature_extraction.text import TfidfVectorizer

In [1]:
# transformer architecture code
def get_bert_embeddings(text, tokenizer=None, model=None, num_layers=4):
    """Return one contextual BERT vector per WordPiece token of ``text``.

    Each token is represented by the concatenation of its hidden states from
    the last ``num_layers`` layers, last layer first. With the default
    ``bert-base-uncased`` weights (hidden size 768) and ``num_layers=4``
    every vector therefore has 3,072 values.

    Args:
        text: Input string. It is tokenized as-is: this function does not
            add ``[CLS]``/``[SEP]`` markers itself.
        tokenizer: Optional pre-loaded tokenizer exposing ``tokenize`` and
            ``convert_tokens_to_ids``. When omitted, the pre-trained
            ``bert-base-uncased`` ``BertTokenizer`` is loaded.
        model: Optional pre-loaded ``BertModel``. When omitted, the
            pre-trained ``bert-base-uncased`` weights are loaded. Pass a
            model in to avoid reloading the weights on every call. The model
            is switched to evaluation mode as a side effect.
        num_layers: How many of the final layers to concatenate per token.

    Returns:
        A list with one 1-D ``torch.Tensor`` per token, in token order.
        An empty list is returned when ``text`` yields no tokens.

    Raises:
        ValueError: If ``num_layers`` is smaller than 1.
    """
    if num_layers < 1:
        raise ValueError("num_layers must be at least 1")

    # Load pre-trained model tokenizer (vocabulary) unless the caller supplied one.
    if tokenizer is None:
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    # Tokenize input; bail out before loading any weights if there is nothing to embed.
    tokenized_text = tokenizer.tokenize(text)
    if not tokenized_text:
        return []

    # Convert tokens to vocabulary indices.
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
    # Single-sentence input: every token belongs to segment 0 (sentence A).
    segments_ids = [0] * len(tokenized_text)

    # Convert inputs to PyTorch tensors with a leading batch dimension of 1.
    tokens_tensor = torch.tensor([indexed_tokens])
    segments_tensors = torch.tensor([segments_ids])

    # Load pre-trained model (weights) unless the caller supplied one.
    if model is None:
        model = BertModel.from_pretrained('bert-base-uncased')
    # Evaluation mode disables dropout so the embeddings are deterministic.
    model.eval()

    with torch.no_grad():
        # Segment ids must be passed by keyword: the second positional
        # argument of the transformers BertModel is `attention_mask`.
        # Per-layer hidden states are only returned when requested.
        outputs = model(
            tokens_tensor,
            token_type_ids=segments_tensors,
            output_hidden_states=True,
        )
    # Tuple with one [batch x tokens x hidden] tensor per layer.
    hidden_states = outputs.hidden_states

    # Stack the layers into a new leading dimension: [layers x batch x tokens x hidden].
    token_embeddings = torch.stack(hidden_states, dim=0)
    # Remove dimension 1, the "batches": [layers x tokens x hidden].
    token_embeddings = torch.squeeze(token_embeddings, dim=1)
    # Swap dimensions 0 and 1 so we can iterate per token: [tokens x layers x hidden].
    token_embeddings = token_embeddings.permute(1, 0, 2)

    token_vecs_cat = []
    for token in token_embeddings:
        # `token` is a [layers x hidden] tensor; append the vectors of the
        # last `num_layers` layers together, starting with the final layer.
        cat_vec = torch.cat(
            [token[-depth] for depth in range(1, num_layers + 1)], dim=0
        )
        token_vecs_cat.append(cat_vec)

    return token_vecs_cat

In [2]:
# Reference for the Git large-files issue:
# https://www.youtube.com/watch?v=TXSmxtU2tOk

In [3]:
from datetime import datetime
def transform_date_format(dates):
    """Re-format the recognised date strings in ``dates`` and drop the rest.

    Each entry is tried against the known input patterns in order; the first
    pattern that parses decides the output layout:

    * ``YYYY/MM/DD`` is emitted as ``YYYY/DD/MM``
    * ``MM-DD-YYYY`` is emitted as ``YYYYDDMM``

    Entries that match no pattern (including non-string entries) are
    silently skipped, so the result may be shorter than the input.

    Args:
        dates: Iterable of candidate date strings.

    Returns:
        list[str]: The converted dates, in input order.
    """
    # (input pattern, output pattern) pairs, tried top to bottom.
    # NOTE(review): the two output layouts differ (one keeps slashes, one
    # does not) -- confirm this is intended.
    # NOTE(review): a third attempt used "%Y%D%M%p" on the space-stripped
    # string. "%D" is not a valid strptime directive, so that attempt always
    # raised ValueError and never produced output; it is therefore omitted.
    # If strings such as "2016p19p12" should be accepted, add a pair like
    # ("%Yp%dp%m", "%Y%d%m") here -- confirm intent first.
    conversions = (
        ("%Y/%m/%d", "%Y/%d/%m"),
        ("%m-%d-%Y", "%Y%d%m"),
    )

    transformed_dates = []
    for date_str in dates:
        # strptime raises TypeError (not ValueError) for non-strings; treat
        # those as unparseable rather than aborting the whole batch.
        if not isinstance(date_str, str):
            continue
        for input_format, output_format in conversions:
            try:
                parsed = datetime.strptime(date_str, input_format)
            except ValueError:
                # Wrong layout or impossible date: try the next pattern.
                continue
            transformed_dates.append(parsed.strftime(output_format))
            break

    return transformed_dates

In [4]:
# Run the converter on a mix of supported and unsupported layouts and
# print each converted date on its own line.
dates = transform_date_format(
    [
        "2010/02/20",
        "2 016p 19p 12",
        "11-18-2012",
        "2018 12 24",
        "20130720",
    ]
)
print("\n".join(dates))

2010/20/02
20121811
