In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
!pip install -q transformers

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m26.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m29.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m87.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m67.7 MB/s[0m eta [36m0:00:00[0m
[?25h

In [4]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import classification_report
from transformers import BertTokenizer, TFBertModel
import tensorflow as tf
import keras


**Load Data**

In [5]:
data = pd.read_csv("/content/drive/MyDrive/winemag-data_first150k.csv")
data

Unnamed: 0.1,Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,variety,winery
0,0,US,This tremendous 100% varietal wine hails from ...,Martha's Vineyard,96,235.0,California,Napa Valley,Napa,Cabernet Sauvignon,Heitz
1,1,Spain,"Ripe aromas of fig, blackberry and cassis are ...",Carodorum Selección Especial Reserva,96,110.0,Northern Spain,Toro,,Tinta de Toro,Bodega Carmen Rodríguez
2,2,US,Mac Watson honors the memory of a wine once ma...,Special Selected Late Harvest,96,90.0,California,Knights Valley,Sonoma,Sauvignon Blanc,Macauley
3,3,US,"This spent 20 months in 30% new French oak, an...",Reserve,96,65.0,Oregon,Willamette Valley,Willamette Valley,Pinot Noir,Ponzi
4,4,France,"This is the top wine from La Bégude, named aft...",La Brûlade,95,66.0,Provence,Bandol,,Provence red blend,Domaine de la Bégude
...,...,...,...,...,...,...,...,...,...,...,...
150925,150925,Italy,Many people feel Fiano represents southern Ita...,,91,20.0,Southern Italy,Fiano di Avellino,,White Blend,Feudi di San Gregorio
150926,150926,France,"Offers an intriguing nose with ginger, lime an...",Cuvée Prestige,91,27.0,Champagne,Champagne,,Champagne Blend,H.Germain
150927,150927,Italy,This classic example comes from a cru vineyard...,Terre di Dora,91,20.0,Southern Italy,Fiano di Avellino,,White Blend,Terredora
150928,150928,France,"A perfect salmon shade, with scents of peaches...",Grand Brut Rosé,90,52.0,Champagne,Champagne,,Champagne Blend,Gosset


**Data Cleaning and  Imputing of null values**

In [6]:
#drop the index column
data=data.drop(columns=['Unnamed: 0'])
data["price"].describe()

count    137235.000000
mean         33.131482
std          36.322536
min           4.000000
25%          16.000000
50%          24.000000
75%          40.000000
max        2300.000000
Name: price, dtype: float64

In [7]:
#Convert text features into sentences
# data['country'] = 'The country of this wine is ' + data['country'] + '. '
# data['designation'] = 'The designation of this wine is ' + data['designation'] + '. '
# data['province'] = 'The province of this wine is ' + data['province'] + '. '
# data['region_1'] = 'The region of this wine is ' + data['region_1'] + '. '
# data['region_2'] = 'The secondary region of this wine is ' + data['region_2'] + '. '
# data['variety'] = 'The variety of this wine is ' + data['variety'] + '. '
# data['winery'] = 'The winery from this wine is ' + data['winery'] + '. '
# data['points'] = 'The rating given to this wine is ' + data["points"].astype("string") + ". "

In [8]:
data['description'].isna().sum()

0

In [9]:
# drop null and duplicate values
data.dropna(subset=['price', "points"], inplace=True)
data.drop_duplicates(subset=['description'], inplace=True)
data.nunique()

country           46
description    89108
designation    28345
points            21
price            357
province         446
region_1        1197
region_2          18
variety          619
winery         13852
dtype: int64

In [10]:
#replace nan values for text features with empty string
data[['country', 'designation', 'region_1', 'region_2', 'province', 'variety', 'winery']] = data[['country','designation', 'region_1', 'region_2', 'province', 'variety', 'winery']].fillna("")
data.isnull().sum()

country        0
description    0
designation    0
points         0
price          0
province       0
region_1       0
region_2       0
variety        0
winery         0
dtype: int64

In [11]:
bins = pd.qcut(data["price"], q=4)
print(bins)

0         (40.0, 2300.0]
1         (40.0, 2300.0]
2         (40.0, 2300.0]
3         (40.0, 2300.0]
4         (40.0, 2300.0]
               ...      
149634     (3.999, 16.0]
149635     (3.999, 16.0]
149637     (3.999, 16.0]
149638     (3.999, 16.0]
149639     (3.999, 16.0]
Name: price, Length: 89108, dtype: category
Categories (4, interval[float64, right]): [(3.999, 16.0] < (16.0, 25.0] < (25.0, 40.0] <
                                           (40.0, 2300.0]]


In [12]:
categories = bins.cat.categories.tolist()
target_names = [str(i) for i in categories]

In [13]:
bins.cat.codes

0         3
1         3
2         3
3         3
4         3
         ..
149634    0
149635    0
149637    0
149638    0
149639    0
Length: 89108, dtype: int8

In [14]:
data["price_categorical"] = bins.cat.codes
data

Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,variety,winery,price_categorical
0,US,This tremendous 100% varietal wine hails from ...,Martha's Vineyard,96,235.0,California,Napa Valley,Napa,Cabernet Sauvignon,Heitz,3
1,Spain,"Ripe aromas of fig, blackberry and cassis are ...",Carodorum Selección Especial Reserva,96,110.0,Northern Spain,Toro,,Tinta de Toro,Bodega Carmen Rodríguez,3
2,US,Mac Watson honors the memory of a wine once ma...,Special Selected Late Harvest,96,90.0,California,Knights Valley,Sonoma,Sauvignon Blanc,Macauley,3
3,US,"This spent 20 months in 30% new French oak, an...",Reserve,96,65.0,Oregon,Willamette Valley,Willamette Valley,Pinot Noir,Ponzi,3
4,France,"This is the top wine from La Bégude, named aft...",La Brûlade,95,66.0,Provence,Bandol,,Provence red blend,Domaine de la Bégude,3
...,...,...,...,...,...,...,...,...,...,...,...
149634,France,Atypically light in body and reticent on the n...,,84,15.0,Alsace,Alsace,,Gewürztraminer,W. Gisselbrecht,0
149635,US,A Syrah-Grenache blend that's dry and rustical...,Bungalow Red,84,15.0,California,Santa Barbara County,Central Coast,Syrah-Grenache,Casa Barranca,0
149637,US,"Outside of the vineyard, wines like this are w...",,84,6.0,California,California,California Other,Merlot,Delicato,0
149638,Argentina,"Heavy and basic, with melon and pineapple arom...",,84,9.0,Mendoza Province,Uco Valley,,Sauvignon Blanc,Finca El Portillo,0


In [15]:
data['description'] = data['description'].astype(str)
data['description_noDigits'] = data['description'].str.replace(r'\d', '', regex=True)

In [16]:
# data["description_features"] = data["description_noDigits"] + " " + data["points"] + data['variety'] + data['country'] + data['province'] + data["region_1"]
# # + data['winery'] + data["region_2"] + data["designation"]
# data["description_features"].iloc[0]

**Country Selection**

In [17]:
data.groupby(['country']).count().sort_values(by='description_noDigits', ascending=False).head(5)
# data['country'] = data['country'].astype(str)
# data
us_data = data[data['country'] == "US"]
italy_data = data[data['country'] == "Italy"]
france_data = data[data['country'] == "France"]

us_data

Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,variety,winery,price_categorical,description_noDigits
0,US,This tremendous 100% varietal wine hails from ...,Martha's Vineyard,96,235.0,California,Napa Valley,Napa,Cabernet Sauvignon,Heitz,3,This tremendous % varietal wine hails from Oak...
2,US,Mac Watson honors the memory of a wine once ma...,Special Selected Late Harvest,96,90.0,California,Knights Valley,Sonoma,Sauvignon Blanc,Macauley,3,Mac Watson honors the memory of a wine once ma...
3,US,"This spent 20 months in 30% new French oak, an...",Reserve,96,65.0,Oregon,Willamette Valley,Willamette Valley,Pinot Noir,Ponzi,3,"This spent months in % new French oak, and in..."
8,US,This re-named vineyard was formerly bottled as...,Silice,95,65.0,Oregon,Chehalem Mountains,Willamette Valley,Pinot Noir,Bergström,3,This re-named vineyard was formerly bottled as...
9,US,The producer sources from two blocks of the vi...,Gap's Crown Vineyard,95,60.0,California,Sonoma Coast,Sonoma,Pinot Noir,Blue Farm,3,The producer sources from two blocks of the vi...
...,...,...,...,...,...,...,...,...,...,...,...,...
149629,US,"Half Merlot, with the rest Cabernet Sauvignon,...",Five-O,84,25.0,New York,North Fork of Long Island,Long Island,Red Blend,Martha Clara,1,"Half Merlot, with the rest Cabernet Sauvignon,..."
149630,US,Tastes kind of soft and thick in jammy cherry ...,Crimson Creek,84,27.0,California,Napa Valley,Napa,Merlot,Pine Ridge,2,Tastes kind of soft and thick in jammy cherry ...
149633,US,"Mushroom and tomato are coated with spicy oak,...",,84,40.0,New York,"The Hamptons, Long Island",Long Island,Cabernet Franc,Wölffer,2,"Mushroom and tomato are coated with spicy oak,..."
149635,US,A Syrah-Grenache blend that's dry and rustical...,Bungalow Red,84,15.0,California,Santa Barbara County,Central Coast,Syrah-Grenache,Casa Barranca,0,A Syrah-Grenache blend that's dry and rustical...


In [18]:
data_selection = 10000

sample_us_data = us_data.sample(data_selection)
sample_italy_data = italy_data.sample(data_selection)
sample_france_data = france_data.sample(data_selection)
# (train_texts, test_texts, train_labels, test_labels) = train_test_split(list(data["description_features"][:data_selection]), list(df["price_categorical"][:data_selection]), test_size=0.2, stratify=list(df["price_categorical"][:data_selection]), random_state=1)

(us_train_data, us_test_data) = train_test_split(sample_us_data, test_size=0.2, random_state=1)

(italy_train_data, italy_test_data) = train_test_split(sample_italy_data, test_size=0.2, random_state=1)

(france_train_data, france_test_data) = train_test_split(sample_france_data, test_size=0.2, random_state=1)
france_train_data


Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,variety,winery,price_categorical,description_noDigits
57211,France,A blend that includes three ancient Champagne ...,Quattuor Blanc de Blancs Brut,90,70.0,Champagne,Champagne,,White Blend,Drappier,3,A blend that includes three ancient Champagne ...
26454,France,"All red cherry and banana fruit, this is a sof...",Nouveau,84,11.0,Beaujolais,Beaujolais-Villages,,Gamay,Georges Duboeuf,0,"All red cherry and banana fruit, this is a sof..."
43913,France,A few years of age have added some interest to...,Cuvée Les Amours,85,17.0,Alsace,Alsace,,Pinot Blanc,Hugel,1,A few years of age have added some interest to...
59385,France,From a vineyard that is being converted to bio...,Les Cocainelles,88,23.0,Loire Valley,Anjou,,Cabernet Franc,Domaine des Grandes Vignes,1,From a vineyard that is being converted to bio...
8763,France,Soft notes of earth and gentle red fruit appea...,Collection,86,29.0,Alsace,Alsace,,Pinot Noir,Kuentz-Bas,2,Soft notes of earth and gentle red fruit appea...
...,...,...,...,...,...,...,...,...,...,...,...,...
83855,France,This is a lightweight wine with fruit that tas...,,83,14.0,Bordeaux,Bordeaux Supérieur,,Bordeaux-style Red Blend,Chateau le Grand Verdus,0,This is a lightweight wine with fruit that tas...
27356,France,This pleasantly perfumed wine has some acidity...,Les Charmes Premier Cru,89,141.0,Burgundy,Chambolle-Musigny,,Pinot Noir,Joseph Faiveley,3,This pleasantly perfumed wine has some acidity...
15544,France,"From grand cru vines in Avize, Oger, Cramant a...",Terroirs Blanc de Blancs Brut,90,50.0,Champagne,Champagne,,Chardonnay,Agrapart & Fils,3,"From grand cru vines in Avize, Oger, Cramant a..."
32476,France,"While the wood aging is prominent, it doesn't ...",Intense,90,13.0,Bordeaux,Bordeaux,,Bordeaux-style Red Blend,Château Lamothe-Vincent,0,"While the wood aging is prominent, it doesn't ..."


In [19]:
# (train_texts, valid_texts, train_labels, valid_labels) = train_test_split(train_texts, train_labels, test_size=0.25, stratify=train_labels, random_state=1)
# type(train_texts)

(us_train_data, us_valid_data) = train_test_split(us_train_data, test_size=0.25, random_state=1)
(italy_train_data, italy_valid_data) = train_test_split(italy_train_data, test_size=0.25, random_state=1)
(france_train_data, france_valid_data) = train_test_split(france_train_data, test_size=0.25, random_state=1)

In [20]:
model_checkpoint = 'bert-base-cased'
bert_tokenizer = BertTokenizer.from_pretrained(model_checkpoint)
bert_model = TFBertModel.from_pretrained(model_checkpoint)

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

In [21]:
us_percs = pd.qcut(us_train_data["description_noDigits"].str.len(), q=10)
print(us_percs)

italy_percs = pd.qcut(italy_train_data["description_noDigits"].str.len(), q=10)
print(italy_percs)

france_percs = pd.qcut(france_train_data["description_noDigits"].str.len(), q=10)
print(france_percs)

140505    (215.0, 234.0]
138201    (253.0, 276.0]
66466     (253.0, 276.0]
68626     (195.0, 215.0]
583       (253.0, 276.0]
               ...      
50045     (215.0, 234.0]
88994     (195.0, 215.0]
85068     (340.0, 629.0]
5630      (276.0, 302.0]
5441      (195.0, 215.0]
Name: description_noDigits, Length: 6000, dtype: category
Categories (10, interval[float64, right]): [(17.999, 142.0] < (142.0, 173.8] < (173.8, 195.0] <
                                            (195.0, 215.0] ... (253.0, 276.0] < (276.0, 302.0] <
                                            (302.0, 340.0] < (340.0, 629.0]]
56401      (282.0, 313.0]
22934      (313.0, 616.0]
20097      (208.0, 221.0]
112583     (193.8, 208.0]
30126     (71.999, 174.0]
               ...       
80081      (262.3, 282.0]
88264      (247.0, 262.3]
146281     (234.0, 247.0]
42244      (174.0, 193.8]
67692      (262.3, 282.0]
Name: description_noDigits, Length: 6000, dtype: category
Categories (10, interval[float64, right]): [(71.999, 

In [22]:
us_max_length = 300
italy_max_length = 285
france_max_length = 270

In [23]:
def create_bert_model(bert_model,
                      experiment,
                      num_classes=4,
                      num_train_layers=0,
                      hidden_size = 256,
                      dropout=0.3,
                      learning_rate=1e-5):

    bert_model.trainable = True

    if experiment == "us":
      input_ids = tf.keras.layers.Input(shape=(us_max_length,), dtype=tf.int64, name='input_ids_layer')
      token_type_ids = tf.keras.layers.Input(shape=(us_max_length,), dtype=tf.int64, name='token_type_ids_layer')
      attention_mask = tf.keras.layers.Input(shape=(us_max_length,), dtype=tf.int64, name='attention_mask_layer')
    elif experiment == "italy":
      input_ids = tf.keras.layers.Input(shape=(italy_max_length,), dtype=tf.int64, name='input_ids_layer')
      token_type_ids = tf.keras.layers.Input(shape=(italy_max_length,), dtype=tf.int64, name='token_type_ids_layer')
      attention_mask = tf.keras.layers.Input(shape=(italy_max_length,), dtype=tf.int64, name='attention_mask_layer')
    else:
      input_ids = tf.keras.layers.Input(shape=(france_max_length,), dtype=tf.int64, name='input_ids_layer')
      token_type_ids = tf.keras.layers.Input(shape=(france_max_length,), dtype=tf.int64, name='token_type_ids_layer')
      attention_mask = tf.keras.layers.Input(shape=(france_max_length,), dtype=tf.int64, name='attention_mask_layer')

    bert_inputs = {'input_ids': input_ids,
                   'token_type_ids': token_type_ids,
                   'attention_mask': attention_mask}

    bert_out = bert_model(bert_inputs)

    cls_token = bert_out[0][:, 0, :]
    dropout_layer = tf.keras.layers.Dropout(dropout)(cls_token)
    hidden_layer = tf.keras.layers.Dense(hidden_size, activation='swish', name='hidden_layer')(dropout_layer)
    hidden_layer_2 = tf.keras.layers.Dense(hidden_size/2, activation='swish', name='hidden_layer_2')(hidden_layer)

    classification = tf.keras.layers.Dense(num_classes, activation='softmax',name='classification_layer')(hidden_layer)

    classification_model = tf.keras.Model(inputs=[input_ids, token_type_ids, attention_mask], outputs=[classification])
    classification_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
      loss = 'sparse_categorical_crossentropy',
            metrics = 'accuracy')

    return classification_model

In [24]:
us_cls_bert_model = create_bert_model(bert_model, experiment="us")

us_nptrain_labels = np.asarray(us_train_data['price_categorical'])
us_npvalid_labels = np.asarray(us_valid_data['price_categorical'])
us_nptest_labels = np.asarray(us_test_data['price_categorical'])

us_train_encodings = bert_tokenizer(list(us_train_data['description_noDigits']), truncation=True, padding="max_length", max_length=us_max_length, return_tensors='tf')
us_valid_encodings = bert_tokenizer(list(us_valid_data['description_noDigits']), truncation=True, padding="max_length", max_length=us_max_length, return_tensors='tf')
us_test_encodings = bert_tokenizer(list(us_test_data['description_noDigits']), truncation=True, padding="max_length", max_length=us_max_length, return_tensors='tf')

us_cls_bert_model.fit(x=[us_train_encodings.input_ids, us_train_encodings.token_type_ids, us_train_encodings.attention_mask],
                                              y=us_nptrain_labels,
                                              validation_data=([us_valid_encodings.input_ids, us_valid_encodings.token_type_ids, us_valid_encodings.attention_mask],
                                              us_npvalid_labels),
                                              shuffle=True,
                                              batch_size=8,
                                              validation_batch_size=8,
                                              epochs=4)

us_predictions = us_cls_bert_model.predict([us_test_encodings.input_ids, us_test_encodings.token_type_ids, us_test_encodings.attention_mask])
us_predictions = tf.argmax(us_predictions, axis=-1)
print(classification_report(us_nptest_labels, us_predictions.numpy(), target_names=target_names))

us_score = us_cls_bert_model.evaluate([us_test_encodings.input_ids, us_test_encodings.token_type_ids, us_test_encodings.attention_mask],
                                              us_nptest_labels)

print('US test loss:', us_score[0])
print('US test accuracy:', us_score[1])

Epoch 1/4




Epoch 2/4
Epoch 3/4
Epoch 4/4
                precision    recall  f1-score   support

 (3.999, 16.0]       0.61      0.48      0.53       374
  (16.0, 25.0]       0.40      0.50      0.44       516
  (25.0, 40.0]       0.45      0.40      0.42       602
(40.0, 2300.0]       0.58      0.58      0.58       508

      accuracy                           0.49      2000
     macro avg       0.51      0.49      0.49      2000
  weighted avg       0.50      0.49      0.49      2000

US test loss: 1.292057991027832
US test accuracy: 0.48649999499320984


In [25]:
italy_cls_bert_model = create_bert_model(bert_model, experiment="italy")

italy_nptrain_labels = np.asarray(italy_train_data['price_categorical'])
italy_npvalid_labels = np.asarray(italy_valid_data['price_categorical'])
italy_nptest_labels = np.asarray(italy_test_data['price_categorical'])

italy_train_encodings = bert_tokenizer(list(italy_train_data['description_noDigits']), truncation=True, padding="max_length", max_length=italy_max_length, return_tensors='tf')
italy_valid_encodings = bert_tokenizer(list(italy_valid_data['description_noDigits']), truncation=True, padding="max_length", max_length=italy_max_length, return_tensors='tf')
italy_test_encodings = bert_tokenizer(list(italy_test_data['description_noDigits']), truncation=True, padding="max_length", max_length=italy_max_length, return_tensors='tf')

italy_cls_bert_model.fit(x=[italy_train_encodings.input_ids, italy_train_encodings.token_type_ids, italy_train_encodings.attention_mask],
                                              y=italy_nptrain_labels,
                                              validation_data=([italy_valid_encodings.input_ids, italy_valid_encodings.token_type_ids, italy_valid_encodings.attention_mask],
                                              italy_npvalid_labels),
                                              shuffle=True,
                                              batch_size=8,
                                              validation_batch_size=8,
                                              epochs=4)

italy_predictions = italy_cls_bert_model.predict([italy_test_encodings.input_ids, italy_test_encodings.token_type_ids, italy_test_encodings.attention_mask])
italy_predictions = tf.argmax(italy_predictions, axis=-1)
print(classification_report(italy_nptest_labels, italy_predictions.numpy(), target_names=target_names))

italy_score = italy_cls_bert_model.evaluate([italy_test_encodings.input_ids, italy_test_encodings.token_type_ids, italy_test_encodings.attention_mask],
                                              italy_nptest_labels)

print('Italy test loss:', italy_score[0])
print('Italy test accuracy:', italy_score[1])

Epoch 1/4




Epoch 2/4
Epoch 3/4
Epoch 4/4
                precision    recall  f1-score   support

 (3.999, 16.0]       0.61      0.45      0.52       431
  (16.0, 25.0]       0.44      0.49      0.47       550
  (25.0, 40.0]       0.35      0.24      0.28       414
(40.0, 2300.0]       0.62      0.80      0.70       605

      accuracy                           0.53      2000
     macro avg       0.51      0.50      0.49      2000
  weighted avg       0.51      0.53      0.51      2000

Italy test loss: 1.2896758317947388
Italy test accuracy: 0.5254999995231628


In [26]:
france_cls_bert_model = create_bert_model(bert_model, experiment="france")

france_nptrain_labels = np.asarray(france_train_data['price_categorical'])
france_npvalid_labels = np.asarray(france_valid_data['price_categorical'])
france_nptest_labels = np.asarray(france_test_data['price_categorical'])

france_train_encodings = bert_tokenizer(list(france_train_data['description_noDigits']), truncation=True, padding="max_length", max_length=france_max_length, return_tensors='tf')
france_valid_encodings = bert_tokenizer(list(france_valid_data['description_noDigits']), truncation=True, padding="max_length", max_length=france_max_length, return_tensors='tf')
france_test_encodings = bert_tokenizer(list(france_test_data['description_noDigits']), truncation=True, padding="max_length", max_length=france_max_length, return_tensors='tf')

france_cls_bert_model.fit(x=[france_train_encodings.input_ids, france_train_encodings.token_type_ids, france_train_encodings.attention_mask],
                                              y=france_nptrain_labels,
                                              validation_data=([france_valid_encodings.input_ids, france_valid_encodings.token_type_ids, france_valid_encodings.attention_mask],
                                              france_npvalid_labels),
                                              shuffle=True,
                                              batch_size=8,
                                              validation_batch_size=8,
                                              epochs=4)

france_predictions = france_cls_bert_model.predict([france_test_encodings.input_ids, france_test_encodings.token_type_ids, france_test_encodings.attention_mask])
france_predictions = tf.argmax(france_predictions, axis=-1)
print(classification_report(france_nptest_labels, france_predictions.numpy(), target_names=target_names))

france_score = france_cls_bert_model.evaluate([france_test_encodings.input_ids, france_test_encodings.token_type_ids, france_test_encodings.attention_mask],
                                              france_nptest_labels)

print('France test loss:', france_score[0])
print('France test accuracy:', france_score[1])

Epoch 1/4




Epoch 2/4
Epoch 3/4
Epoch 4/4
                precision    recall  f1-score   support

 (3.999, 16.0]       0.63      0.58      0.60       490
  (16.0, 25.0]       0.39      0.51      0.44       502
  (25.0, 40.0]       0.32      0.14      0.20       375
(40.0, 2300.0]       0.62      0.69      0.65       633

      accuracy                           0.52      2000
     macro avg       0.49      0.48      0.47      2000
  weighted avg       0.50      0.52      0.50      2000

France test loss: 1.2902625799179077
France test accuracy: 0.5164999961853027
