# Demonstration of data loading and model training with BERT vectors

In [1]:
import os
import json
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression

In [2]:
# Directory holding the original CSV data, located relative to the notebook.
ORIGINAL_DATA_DIR = os.path.join(os.pardir, "handout", "data")
# Directory holding the BERT feature files (*.jsonlines) read further below.
BERT_FEATURE_DIR = "bert_output_data"

## Format training data

`X` will be a matrix with `N` rows for the `N` texts in the training data, and `M` columns for the `M` features generated by BERT.

`y` will be an array of `N` class labels for training.

In [3]:
# Read the training table; its "native_language" column supplies the labels.
train_csv_path = os.path.join(ORIGINAL_DATA_DIR, "lang_id_train.csv")
train_df = pd.read_csv(train_csv_path)

In [4]:
# Sanity check: (number of rows, number of columns) of the training table.
train_df.shape

(6000, 2)

In [5]:
# Collect one [CLS] vector per line of the BERT feature file, in file order.
bert_vectors = []
train_features_path = os.path.join(BERT_FEATURE_DIR, "train.jsonlines")
# Explicit encoding so that reading does not depend on the platform locale.
with open(train_features_path, "rt", encoding="utf-8") as infile:
    for line_number, line in enumerate(infile, start=1):
        bert_data = json.loads(line)
        for t in bert_data["features"]:
            # Only extract the [CLS] vector used for classification
            if t["token"] == "[CLS]":
                # Take the first exported layer. NOTE(review): presumably this
                # is the final layer of the network; confirm against the layer
                # order used when the features were extracted.
                bert_vectors.append(t["layers"][0]["values"])
                break
        else:
            # Skipping the record silently would shift every later feature row
            # relative to the labels in train_df, so fail loudly instead.
            raise ValueError(
                "No [CLS] token found on line {} of {}".format(
                    line_number, train_features_path
                )
            )

In [6]:
# Sanity check: number of extracted vectors; it should equal the number of
# rows in train_df so that features and labels line up.
len(bert_vectors)

6000

In [7]:
# Feature matrix: one row per extracted [CLS] vector.
X = np.asarray(bert_vectors)
# Class labels taken from the training table.
# NOTE(review): assumes train.jsonlines preserves the CSV row order - confirm.
label_column = train_df["native_language"]
y = label_column.values

## Train logistic regression model

In [8]:
# L2-regularised logistic regression with regularisation strength C=1.0.
# NOTE(review): solver and multi_class are left at the library defaults (the
# recorded output shows them as 'warn'); consider pinning them explicitly so
# results do not vary across scikit-learn versions.
lr_params = {"penalty": "l2", "C": 1.0}
lr_model = LogisticRegression(**lr_params)
# Kept as the final expression so the notebook still displays the estimator.
lr_model.fit(X, y)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)