In [None]:
# Mount Google Drive at /content/drive; the dataset CSV is read from that mount later on.
# google.colab is only importable inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install -q transformers

In [None]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, TFBertModel
import tensorflow as tf
import keras

In [None]:
#Functions

def BERT_data_processing(raw_data):
  """Prepare a raw wine-review frame for the text models.

  Runs txt_2_sent and then null_force_drop on raw_data, replaces missing
  values in the text feature columns with "" and (re)builds the
  "description_features" column from the description plus those columns.

  Args:
    raw_data: DataFrame holding the columns read below ('description',
      'country', 'designation', 'region_1', 'region_2', 'province') plus
      whatever txt_2_sent and null_force_drop require.

  Returns:
    The processed DataFrame, as returned by null_force_drop.
  """
  text_features = ['country', 'designation', 'region_1', 'region_2', 'province']

  processed_data = txt_2_sent(raw_data)
  processed_data = null_force_drop(processed_data)

  #replace nan values for text features with empty string
  processed_data[text_features] = processed_data[text_features].fillna("")

  # The former bare `isnull().sum()` / `.iloc[0]` expressions were removed:
  # inside a function they display nothing, and `.iloc[0]` raised IndexError
  # whenever every row had been dropped.
  processed_data["description_features"] = (
      processed_data["description"] + " "
      + processed_data['country'] + processed_data["designation"]
      + processed_data["region_1"] + processed_data['region_2']
      + processed_data['province'])
  return processed_data

#Convert text features into sentences
def txt_2_sent(data_to_change):
  """Rewrite the categorical wine columns as short English sentences.

  Each of country, designation, province, region_1, region_2, variety,
  winery and points is replaced by a sentence such as
  "The country of this wine is US. ". Missing values in the five text
  feature columns become "", and a "description_features" column is built
  from the description followed by those five sentences.

  The frame is modified in place and also returned. Calling the function on
  a frame that has already been converted returns it unchanged.

  Args:
    data_to_change: DataFrame with the columns listed above plus
      'description'.

  Returns:
    The same DataFrame object, converted.
  """
  marker = "The country of this wine"
  # Idempotence guard. The check is done on values rather than on row label 0:
  # label 0 may have been dropped (KeyError) or may hold NaN (TypeError on `in`).
  already_converted = (
      data_to_change['country'].dropna().astype(str)
      .str.contains(marker, regex=False).any())
  if already_converted:
    return data_to_change

  data_to_change['country'] = 'The country of this wine is ' + data_to_change['country'] + '. '
  data_to_change['designation'] = 'The designation of this wine is ' + data_to_change['designation'] + '. '
  data_to_change['province'] = 'The province of this wine is ' + data_to_change['province'] + '. '
  data_to_change['region_1'] = 'The region of this wine is ' + data_to_change['region_1'] + '. '
  data_to_change['region_2'] = 'The secondary region of this wine is ' + data_to_change['region_2'] + '. '
  data_to_change['variety'] = 'The variety of this wine is ' + data_to_change['variety'] + '. '
  data_to_change['winery'] = 'The winery from this wine is ' + data_to_change['winery'] + '. '
  data_to_change['points'] = 'The rating given to this wine is ' + data_to_change["points"].astype("string") + ". "

  #replace nan values for text features with empty string
  # (a missing value stays missing through the concatenation above, so a
  # missing field contributes no sentence at all)
  text_features = ['country', 'designation', 'region_1', 'region_2', 'province']
  data_to_change[text_features] = data_to_change[text_features].fillna("")

  data_to_change["description_features"] = (
      data_to_change["description"] + " "
      + data_to_change['country'] + data_to_change["designation"]
      + data_to_change["region_1"] + data_to_change['region_2']
      + data_to_change['province'])

  # Show one example; skipped for an empty frame, where .iloc[0] would raise.
  if len(data_to_change) > 0:
    print(data_to_change["description_features"].iloc[0])

  return data_to_change


# drop null and duplicate values
def null_force_drop(data_to_change):
  """Drop unusable rows from the frame in place and return it.

  Rows lacking a price or a points value are removed first, then rows whose
  description repeats an earlier one. The per-column unique counts of the
  remaining rows are printed.
  """
  required_columns = ['price', "points"]
  duplicate_key = ['description']

  data_to_change.dropna(subset=required_columns, inplace=True)
  data_to_change.drop_duplicates(subset=duplicate_key, inplace=True)

  unique_counts = data_to_change.nunique()
  print(unique_counts)
  return data_to_change

**Load Data**

In [None]:
# Read the review CSV from the mounted Drive and discard the unnamed
# index column that was written into the file.
data = (
    pd.read_csv("/content/drive/MyDrive/winemag-data_first150k.csv")
    .drop(columns=['Unnamed: 0'])
)

In [None]:
# Make sure every description is a str, then build a digit-free variant of it.
data['description'] = data['description'].astype(str)
data['description_noDigits'] = data['description'].str.replace(r'\d', '', regex=True)

# Text/target frame for the BERT regressor. The same cleaning the tabular
# data receives is applied here without touching `data`: duplicate
# descriptions are removed and rows lacking a price or points value are
# dropped, because a NaN price would become a NaN training label. `.copy()`
# makes the result an independent frame so that later column assignments do
# not write into a slice of `data`.
BERT_regression_data = (
    data
    .drop_duplicates(subset=['description'])
    .dropna(subset=['price', "points"])
    [["description_noDigits", "price"]]
    .copy()
)

This tremendous 100% varietal wine hails from Oakville and was aged over three years in oak. Juicy red-cherry fruit and a compelling hint of caramel greet the palate, framed by elegant, fine tannins and a subtle minty tone in the background. Balanced and rewarding from start to finish, it has years ahead of it to develop further nuance. Enjoy 2022–2030. The country of this wine is US. The designation of this wine is Martha's Vineyard. The region of this wine is Napa Valley. The secondary region of this wine is Napa. The province of this wine is California. 


**Data Cleaning and Imputing of Null Values**

In [None]:
# Take an independent copy: a plain `regression_data = data` would only bind a
# second name to the same frame, so in-place cleaning of regression_data
# would silently modify `data` as well.
regression_data = data.copy()
regression_data["price"].describe()

count    137235.000000
mean         33.131482
std          36.322536
min           4.000000
25%          16.000000
50%          24.000000
75%          40.000000
max        2300.000000
Name: price, dtype: float64

In [None]:
# Clean regression_data in place: first keep only the first row for each
# distinct description, then discard rows without a price or points value.
duplicate_key = ['description']
required_columns = ['price', "points"]

regression_data.drop_duplicates(subset=duplicate_key, inplace=True)
regression_data.dropna(subset=required_columns, inplace=True)

# Number of distinct values left in each column.
print(regression_data.nunique())

country                    47
description             89108
designation             28346
points                     21
price                     357
province                  447
region_1                 1198
region_2                   19
variety                   619
winery                  13852
description_features    89108
dtype: int64


In [None]:
# Tabular dataset for the non-NLP models: everything except the free-text
# description, with the price target moved to the last column.
tabular_data = regression_data.drop(columns=['description'])
feature_columns = [name for name in tabular_data.columns if name != 'price']

# dropna() without a subset removes every row that has a missing value in
# any remaining column.
non_BERT_regression_data = tabular_data[feature_columns + ['price']].dropna()

In [None]:
# Preview only the first rows instead of rendering the whole frame.
regression_data.head()

Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,variety,winery,description_features
0,The country of this wine is US.,This tremendous 100% varietal wine hails from ...,The designation of this wine is Martha's Viney...,The rating given to this wine is 96.,235.0,The province of this wine is California.,The region of this wine is Napa Valley.,The secondary region of this wine is Napa.,The variety of this wine is Cabernet Sauvignon.,The winery from this wine is Heitz.,This tremendous 100% varietal wine hails from ...
1,The country of this wine is Spain.,"Ripe aromas of fig, blackberry and cassis are ...",The designation of this wine is Carodorum Sele...,The rating given to this wine is 96.,110.0,The province of this wine is Northern Spain.,The region of this wine is Toro.,,The variety of this wine is Tinta de Toro.,The winery from this wine is Bodega Carmen Rod...,"Ripe aromas of fig, blackberry and cassis are ..."
2,The country of this wine is US.,Mac Watson honors the memory of a wine once ma...,The designation of this wine is Special Select...,The rating given to this wine is 96.,90.0,The province of this wine is California.,The region of this wine is Knights Valley.,The secondary region of this wine is Sonoma.,The variety of this wine is Sauvignon Blanc.,The winery from this wine is Macauley.,Mac Watson honors the memory of a wine once ma...
3,The country of this wine is US.,"This spent 20 months in 30% new French oak, an...",The designation of this wine is Reserve.,The rating given to this wine is 96.,65.0,The province of this wine is Oregon.,The region of this wine is Willamette Valley.,The secondary region of this wine is Willamett...,The variety of this wine is Pinot Noir.,The winery from this wine is Ponzi.,"This spent 20 months in 30% new French oak, an..."
4,The country of this wine is France.,"This is the top wine from La Bégude, named aft...",The designation of this wine is La Brûlade.,The rating given to this wine is 95.,66.0,The province of this wine is Provence.,The region of this wine is Bandol.,,The variety of this wine is Provence red blend.,The winery from this wine is Domaine de la Bég...,"This is the top wine from La Bégude, named aft..."
...,...,...,...,...,...,...,...,...,...,...,...
149634,The country of this wine is France.,Atypically light in body and reticent on the n...,,The rating given to this wine is 84.,15.0,The province of this wine is Alsace.,The region of this wine is Alsace.,,The variety of this wine is Gewürztraminer.,The winery from this wine is W. Gisselbrecht.,Atypically light in body and reticent on the n...
149635,The country of this wine is US.,A Syrah-Grenache blend that's dry and rustical...,The designation of this wine is Bungalow Red.,The rating given to this wine is 84.,15.0,The province of this wine is California.,The region of this wine is Santa Barbara County.,The secondary region of this wine is Central C...,The variety of this wine is Syrah-Grenache.,The winery from this wine is Casa Barranca.,A Syrah-Grenache blend that's dry and rustical...
149637,The country of this wine is US.,"Outside of the vineyard, wines like this are w...",,The rating given to this wine is 84.,6.0,The province of this wine is California.,The region of this wine is California.,The secondary region of this wine is Californi...,The variety of this wine is Merlot.,The winery from this wine is Delicato.,"Outside of the vineyard, wines like this are w..."
149638,The country of this wine is Argentina.,"Heavy and basic, with melon and pineapple arom...",,The rating given to this wine is 84.,9.0,The province of this wine is Mendoza Province.,The region of this wine is Uco Valley.,,The variety of this wine is Sauvignon Blanc.,The winery from this wine is Finca El Portillo.,"Heavy and basic, with melon and pineapple arom..."


In [None]:
# Label-encode every feature column as integer codes starting at 1.
# 'price' is the regression target and must keep its numeric values, so it is
# skipped *before* encoding (previously it was factorized and only then did
# the loop stop, replacing the prices by arbitrary category codes).
for i in non_BERT_regression_data.columns:
  if i == 'price':
    continue
  non_BERT_regression_data[i] = pd.factorize(non_BERT_regression_data[i])[0] + 1
  print(i)

country
designation
points
province
region_1
region_2
variety
winery
description_features


In [None]:
# Min-max scale the price column of the tabular data.
column = "price"
price_min = non_BERT_regression_data[column].min()
price_max = non_BERT_regression_data[column].max()
non_BERT_regression_data[column] = (non_BERT_regression_data[column] - price_min) / (price_max - price_min)

**Feature Selection**

In [None]:
# Separate the tabular frame into the target (price) and the feature matrix
# (every other column, in its existing order).
non_BERT_price = non_BERT_regression_data["price"]
non_BERT_df = non_BERT_regression_data.drop(columns=['price'])

In [None]:
# Number of rows drawn for the experiments; the fixed random_state makes the
# draw repeatable across kernel restarts.
data_selection = 10000
sample_data = data.sample(data_selection, random_state=42)
def generate_training_data(input_data_frame, output_data_frame, random_state=None):
  """Split inputs and targets into train, validation and test parts.

  20% of the rows are held out as the test set; 25% of the remaining rows
  (i.e. 20% of the total) become the validation set, leaving 60% for training.

  Args:
    input_data_frame: model inputs, one entry per sample.
    output_data_frame: targets aligned with input_data_frame.
    random_state: optional seed forwarded to both train_test_split calls;
      the default None keeps the splits unseeded.

  Returns:
    (train_texts, valid_texts, test_texts,
     train_labels, valid_labels, test_labels) -- in exactly this order.
  """
  (train_texts, test_texts, train_labels, test_labels) = train_test_split(
      input_data_frame, output_data_frame, test_size=0.2, random_state=random_state)
  (train_texts, valid_texts, train_labels, valid_labels) = train_test_split(
      train_texts, train_labels, test_size=0.25, random_state=random_state)
  return (train_texts, valid_texts, test_texts, train_labels, valid_labels, test_labels)

In [None]:
# generate_training_data returns (train X, valid X, test X, train y, valid y, test y);
# the names must be unpacked in that order, otherwise features and labels get mixed up.
(train_texts, valid_texts, test_texts, train_labels, valid_labels, test_labels) = generate_training_data(non_BERT_df, non_BERT_price)

In [None]:
from sklearn import linear_model
# Baseline: ordinary least squares on the label-encoded tabular features.
regr = linear_model.LinearRegression() # Do not use fit_intercept = False if you have removed 1 column after dummy encoding
regr.fit(train_texts, train_labels)

# Report the validation error; the predictions were previously computed and discarded.
predicted = regr.predict(valid_texts)
print(f'Valid RMSE: {np.sqrt(np.mean(np.square(predicted - np.asarray(valid_labels))))}')

In [None]:
from sklearn.preprocessing import PolynomialFeatures
# Degree-3 polynomial expansion of the tabular features followed by a linear fit.
poly_reg = PolynomialFeatures(degree=3)
X_poly = poly_reg.fit_transform(train_texts)
# The transformer is fitted on the training data only; held-out data is only transformed.
X_test = poly_reg.transform(test_texts)
pol_reg = linear_model.LinearRegression()
pol_reg.fit(X_poly, train_labels)

predicted = pol_reg.predict(X_poly)
print(f'Train RMSE: {np.sqrt(np.mean(np.square(predicted-train_labels)))}')

predicted = pol_reg.predict(X_test)
print(f'Test RMSE: {np.sqrt(np.mean(np.square(predicted-test_labels)))}')


Train RMSE: 0.07342859308719588
Test RMSE: 0.07336311444522242


In [None]:
# Load the tokenizer and the TensorFlow BERT encoder from the same pretrained checkpoint.
model_checkpoint = 'bert-base-cased'
bert_tokenizer = BertTokenizer.from_pretrained(model_checkpoint)
bert_model = TFBertModel.from_pretrained(model_checkpoint)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

In [None]:
# Bucket the digit-free descriptions into ten equally populated bins by
# their length in characters.
description_lengths = BERT_regression_data["description_noDigits"].str.len()
percs = pd.qcut(description_lengths, q=10)
percs

0         (332.0, 829.0]
1         (297.0, 332.0]
2         (273.0, 297.0]
3         (332.0, 829.0]
4         (332.0, 829.0]
               ...      
149634    (204.0, 221.0]
149635    (204.0, 221.0]
149637    (273.0, 297.0]
149638    (185.0, 204.0]
149639    (254.0, 273.0]
Name: description, Length: 89108, dtype: category
Categories (10, interval[float64, right]): [(16.999, 158.0] < (158.0, 185.0] < (185.0, 204.0] <
                                            (204.0, 221.0] ... (254.0, 273.0] < (273.0, 297.0] <
                                            (297.0, 332.0] < (332.0, 829.0]]

In [None]:
# Length every description is padded/truncated to by the tokenizer, and the
# width of the model's input layers.
# NOTE(review): the decile table computed above measures characters
# (.str.len()), not tokens -- confirm 335 is the intended token budget.
max_length = 335

In [None]:
# Min-max scale the price target of the BERT dataset.
# assign() returns a new frame, so nothing is written into a frame that was
# created as a column selection of `data` (avoids SettingWithCopyWarning).
column = "price"
price_min = BERT_regression_data[column].min()
price_max = BERT_regression_data[column].max()
BERT_regression_data = BERT_regression_data.assign(
    **{column: (BERT_regression_data[column] - price_min) / (price_max - price_min)}
)

In [None]:
# Draw the working sample for BERT. Rows without a price are excluded first
# (a NaN label would turn the training loss into NaN), and the fixed
# random_state makes the draw repeatable.
sample_data = BERT_regression_data.dropna(subset=['price']).sample(data_selection, random_state=42)

In [None]:
# Split the sampled descriptions (inputs) and prices (targets); the names are
# unpacked in the same order in which generate_training_data returns them.
(train_texts, valid_texts, test_texts, train_labels, valid_labels, test_labels) = generate_training_data(sample_data['description_noDigits'], sample_data['price'])

In [None]:
def _encode_texts(texts):
  """Tokenize texts with bert_tokenizer, padded/truncated to max_length, as TF tensors."""
  return bert_tokenizer(list(texts), truncation=True, padding="max_length", max_length=max_length, return_tensors='tf')


# Same tokenizer settings for all three splits.
train_encodings = _encode_texts(train_texts)
valid_encodings = _encode_texts(valid_texts)
test_encodings = _encode_texts(test_texts)

In [None]:
def create_bert_model(bert_model,
                      num_classes=1,
                      num_train_layers=0,
                      hidden_size = 32,
                      dropout=0.3,
                      learning_rate=0.00005):
    """Build and compile a Keras regression model on top of a BERT encoder.

    The [CLS] position of the encoder's first output is fed through a ReLU
    hidden layer, dropout and a final Dense layer with `num_classes` units.
    The model is compiled with Adam, mean-squared-error loss and a
    RootMeanSquaredError metric. Input layers are sized by the module-level
    `max_length`.

    Args:
        bert_model: BERT encoder; it is set to trainable (all layers are
            fine-tuned).
        num_classes: number of output units (1 for a single regression target).
        num_train_layers: accepted for interface compatibility but currently
            unused -- the whole encoder is always trained.
        hidden_size: width of the hidden Dense layer.
        dropout: dropout rate applied after the hidden layer.
        learning_rate: Adam learning rate.

    Returns:
        The compiled tf.keras.Model taking
        [input_ids, token_type_ids, attention_mask].
    """
    # Fine-tune the full encoder.
    bert_model.trainable = True

    input_ids = tf.keras.layers.Input(shape=(max_length,), dtype=tf.int64, name='input_ids_layer')
    token_type_ids = tf.keras.layers.Input(shape=(max_length,), dtype=tf.int64, name='token_type_ids_layer')
    attention_mask = tf.keras.layers.Input(shape=(max_length,), dtype=tf.int64, name='attention_mask_layer')

    bert_inputs = {'input_ids': input_ids,
                   'token_type_ids': token_type_ids,
                   'attention_mask': attention_mask}

    bert_out = bert_model(bert_inputs)

    # First output, first sequence position: the [CLS] representation.
    cls_token = bert_out[0][:, 0, :]
    hidden = tf.keras.layers.Dense(hidden_size, activation='relu', name='hidden_layer')(cls_token)
    hidden = tf.keras.layers.Dropout(dropout)(hidden)

    # Linear output for regression. A ReLU here can get stuck: once the
    # pre-activation is negative the output is 0 and its gradient is 0, which
    # is likely when the targets are small values close to 0.
    classification = tf.keras.layers.Dense(num_classes, activation=None, name='classification_layer')(hidden)

    classification_model = tf.keras.Model(inputs=[input_ids, token_type_ids, attention_mask], outputs=[classification])

    classification_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
                                 loss='mean_squared_error',
                                 metrics=[tf.keras.metrics.RootMeanSquaredError()])

    return classification_model

In [None]:
# Build the BERT regressor with a 256-unit hidden layer; all other arguments keep their defaults.
cls_bert_model = create_bert_model(bert_model, hidden_size=256)

In [None]:
# pooler_bert_model.summary()

In [None]:
# keras.utils.plot_model(cls_bert_model, show_shapes=False, show_dtype=False, show_layer_names=True, dpi=90)

In [None]:
# Fine-tune on the training encodings. Keras holds out 20% of these samples
# for validation (validation_split); the separately tokenized validation set
# is not passed to fit().
train_inputs = [
    train_encodings.input_ids,
    train_encodings.token_type_ids,
    train_encodings.attention_mask,
]
train_targets = np.asarray(train_labels)

cls_bert_model.fit(
    x=train_inputs,
    y=train_targets,
    validation_split=0.2,
    shuffle=True,
    batch_size=32,
    validation_batch_size=32,
    epochs=10,
)

Epoch 1/10




Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7db42721b760>

In [None]:
def _rmse(predictions, labels):
    """Root-mean-squared error between two equally long sequences of values."""
    # Flatten both sides: predict() returns one column per output unit, and
    # subtracting an (N,) array from an (N, 1) array would broadcast to (N, N).
    predictions = np.asarray(predictions, dtype=float).reshape(-1)
    labels = np.asarray(labels, dtype=float).reshape(-1)
    return np.sqrt(np.mean(np.square(predictions - labels)))


# The model has a single regression output, so its predictions are used
# directly. Taking argmax over that one-unit axis (as done previously) always
# yields 0 and made the reported RMSE meaningless.
train_predictions = cls_bert_model.predict([train_encodings.input_ids, train_encodings.token_type_ids, train_encodings.attention_mask])
print(f'Train RMSE: {_rmse(train_predictions, train_labels)}')


valid_predictions = cls_bert_model.predict([valid_encodings.input_ids, valid_encodings.token_type_ids, valid_encodings.attention_mask])
print(f'Valid RMSE: {_rmse(valid_predictions, valid_labels)}')

In [None]:
# Evaluate the trained model on the held-out validation encodings.
# (The model in this notebook is cls_bert_model; no pooler_bert_model is defined.)
score = cls_bert_model.evaluate([valid_encodings.input_ids, valid_encodings.token_type_ids, valid_encodings.attention_mask],
                                np.asarray(valid_labels))

# The model is compiled with MSE loss and a RootMeanSquaredError metric, so
# score[0] is the loss (MSE) and score[1] is the RMSE -- not an accuracy.
print('Valid loss:', score[0])
print('Valid RMSE:', score[1])