# Demonstration of data loading and model training with BERT vectors

In [5]:
import os
import json
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression

In [13]:
# Locations of the input data, expressed relative to this notebook's directory.
# The raw handout CSV files:
ORIGINAL_DATA_DIR = os.path.join(os.pardir, "handout", "data")
# The pre-computed BERT feature files (JSON lines):
BERT_FEATURE_DIR = os.path.join(os.pardir, "bert", "bert_output_data")

## Format training data

`X` will be a matrix with `N` rows for the `N` texts in the training data, and `M` columns for the `M` features generated by BERT.

`y` will be an array of `N` class labels for training.

In [14]:
# Load the labelled training set (a native-language label and a text per row).
train_csv_path = os.path.join(ORIGINAL_DATA_DIR, "lang_id_train.csv")
train_df = pd.read_csv(train_csv_path)

In [15]:
# Quick sanity check on the loaded frame: (number of rows, number of columns).
train_df.shape

(6000, 2)

In [16]:
# Preview only the first few rows; displaying the whole 6000-row frame adds
# nothing for the reader and bloats the saved notebook.
train_df.head(10)

Unnamed: 0,native_language,text
0,Japanese,"who, whom, whose I have often thinking that, I..."
1,Japanese,books I love reading. It's been one of my hobb...
2,Japanese,change job I want to change job. The company t...
3,Japanese,the origin of Japanese I like to learn about a...
4,Japanese,"Marunouchi Today, I am going go to Marunouchi,..."
5,Japanese,My pleasure My pleasure. I wrote diaries twice...
6,Japanese,Darts ( Hobby ) I am going to play darts game ...
7,Japanese,"difficult:( Hi, im sorry for renew many times:..."
8,Japanese,"long time no update lol Hi, I could not update..."
9,Japanese,"Drinking Party Yesterday, I went to Harajuku t..."


In [17]:
# Collect exactly one [CLS] vector per record of the BERT feature file, in file
# order, so that entry i can later be paired with row i of the training labels.
bert_vectors = []
train_features_path = os.path.join(BERT_FEATURE_DIR, "train.jsonlines")
# Read as UTF-8 explicitly rather than relying on the platform default encoding.
with open(train_features_path, "rt", encoding="utf-8") as infile:
    for line_number, line in enumerate(infile, start=1):
        if not line.strip():
            # Tolerate blank lines (e.g. a trailing newline at the end of the file).
            continue
        bert_data = json.loads(line)
        for token_entry in bert_data["features"]:
            # Only extract the [CLS] vector used for classification
            if token_entry["token"] == "[CLS]":
                # Use the first layer entry recorded for this token.
                # NOTE(review): this is intended to be the final layer of the
                # network, which assumes the feature extractor wrote that layer
                # first -- confirm against the extraction settings.
                bert_vectors.append(token_entry["layers"][0]["values"])
                break
        else:
            # Skipping the record silently would shift every later vector by one
            # position relative to its label, so fail loudly instead.
            raise ValueError(
                "No [CLS] token found in record on line {} of {}".format(
                    line_number, train_features_path
                )
            )

In [18]:
# Number of [CLS] vectors collected; should equal the number of training texts.
len(bert_vectors)

6000

In [19]:
# Class labels: the native language recorded for each training text.
y = train_df.loc[:, "native_language"].values
# Feature matrix: stack the collected [CLS] vectors, one row per text.
X = np.asarray(bert_vectors)

array([[ 0.029188, -0.017075,  0.035836, ..., -0.024325,  0.17578 ,
        -0.17903 ],
       [ 0.344251,  0.088796,  0.09354 , ..., -0.080333,  0.213616,
         0.465394],
       [-0.029146,  0.431235,  0.454688, ..., -0.512232,  0.317562,
         0.262735],
       ...,
       [ 0.228317,  0.186384,  0.583934, ..., -0.417311,  1.083994,
        -0.083887],
       [ 0.517324,  0.113897, -0.217372, ..., -0.476591,  0.479121,
         0.007945],
       [ 0.343263, -0.31379 ,  0.298612, ..., -0.16599 ,  0.169354,
         0.292019]])

## Train logistic regression model

In [20]:
# L2-regularised logistic regression; C is the inverse regularisation strength.
lr_model = LogisticRegression(C=1.0, penalty="l2")
# Train on the BERT features. The fit call is left as the cell's last
# expression so the fitted estimator's parameters are displayed.
lr_model.fit(X, y)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [None]:
def get_clf