# **Project Description: Customer Feedback Sentiment Predictor**
 
# **Data Description**:

- A sentiment analysis task on customer feedback
- The feedback covers different IT services, infrastructure, etc.

# **Dataset**:

- Contains two columns "review" & "label"
    - review : Customer Feedback about the Product and the Service
    - label : '1' for Negative and '0' for Positive

# **Objective**:

- To fine-tune the pre-trained DistilBERT sentiment model (`distilbert-base-uncased-finetuned-sst-2-english`) on this feedback data

# **Steps Applied**:

- Data Transformation
- Configuring Modeling Parameters
- Persisting Model & Tokenizer

In [2]:
!pip install datasets transformers huggingface_hub

import logging

import numpy as np
import pandas as pd

from datasets import Dataset
from datasets import load_dataset, load_metric
from transformers import AutoTokenizer
from transformers import DataCollatorWithPadding
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer

from sklearn.model_selection import train_test_split

Collecting datasets
  Downloading datasets-2.2.2-py3-none-any.whl (346 kB)
     |████████████████████████████████| 346 kB 2.2 MB/s            
Collecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collecting dill<0.3.5
  Downloading dill-0.3.4-py2.py3-none-any.whl (86 kB)
     |████████████████████████████████| 86 kB 9.0 MB/s             
[?25hCollecting xxhash
  Downloading xxhash-3.0.0-cp38-cp38-macosx_11_0_arm64.whl (30 kB)
Collecting multiprocess
  Downloading multiprocess-0.70.13-py38-none-any.whl (131 kB)
     |████████████████████████████████| 131 kB 3.6 MB/s            
Collecting fsspec[http]>=2021.05.0
  Downloading fsspec-2022.5.0-py3-none-any.whl (140 kB)
     |████████████████████████████████| 140 kB 6.0 MB/s            
Collecting pyarrow>=6.0.0
  Downloading pyarrow-8.0.0-cp38-cp38-macosx_11_0_arm64.whl (16.2 MB)
     |████████████████████████████████| 16.2 MB 3.2 MB/s            
Collecting multiprocess
  Downloading multiprocess-0.70.12.2-p

In [225]:
# Tokenizer belonging to the SST-2 fine-tuned DistilBERT checkpoint; the same
# checkpoint name is used further down when the model itself is loaded.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english")

loading configuration file https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english/resolve/main/config.json from cache at /Users/anchitsaxena/.cache/huggingface/transformers/4e60bb8efad3d4b7dc9969bf204947c185166a0a3cf37ddb6f481a876a3777b5.9f8326d0b7697c7fd57366cdde57032f46bc10e37ae81cb7eb564d66d23ec96b
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased-finetuned-sst-2-english",
  "activation": "gelu",
  "architectures": [
    "DistilBertForSequenceClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "finetuning_task": "sst-2",
  "hidden_dim": 3072,
  "id2label": {
    "0": "NEGATIVE",
    "1": "POSITIVE"
  },
  "initializer_range": 0.02,
  "label2id": {
    "NEGATIVE": 0,
    "POSITIVE": 1
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "output_past": true,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,


In [151]:
def read_analyze_data(datafile):
    """
    Load a CSV file, log its dimensions and per-column null counts,
    and hand back only the rows that contain no null values.

    datafile : path (or file-like object) of the source CSV
    returns  : DataFrame with every row containing a null removed
    """
    raw = pd.read_csv(datafile)
    nulls_per_column = raw.isnull().sum(axis=0)

    # Log the shape and null counts of the frame *before* rows are dropped.
    logging.info(str(raw.shape))
    logging.info(str(nulls_per_column))

    return raw.dropna()

In [152]:
# Load the feedback CSV (path is relative to the working directory); rows with
# null values are already dropped by read_analyze_data.
data = read_analyze_data('review_data.csv')

In [153]:
def feature_engg(data):
    """
    Separate the feedback text from its target and flip the label encoding.

    The dataset encodes 1 = Negative and 0 = Positive, whereas the DistilBERT
    SST-2 checkpoint uses 0 = NEGATIVE and 1 = POSITIVE, so the two label
    values are swapped before fine-tuning.

    data    : DataFrame holding a 'review' and a 'label' column
    returns : (X, y) where X is the 'review' column and y is a new Series
              with labels 0 and 1 exchanged; `data` itself is not modified
    """
    X = data['review']
    # Swap both values in one simultaneous mapping. Going through a temporary
    # sentinel value (1 -> 2, 0 -> 1, 2 -> 0) would also rewrite any label that
    # already equals the sentinel; other values are left untouched here.
    y = data['label'].replace({0: 1, 1: 0})
    return X, y

In [218]:
def split_data(X, y, test_ratio):
    """
    Partition features and labels into a train part and a test part.

    The split is stratified on `y` and uses a fixed random_state of 42.

    test_ratio : value forwarded as `test_size` to train_test_split
    returns    : X_train, X_test, y_train, y_test
    """
    train_X, test_X, train_y, test_y = train_test_split(
        X,
        y,
        test_size=test_ratio,
        random_state=42,
        stratify=y,
    )
    return train_X, test_X, train_y, test_y

In [220]:
def transform_to_dataset(X_train, X_test, y_train, y_test):
    """
    Wrap the train and test splits in `Dataset` objects for fine-tuning.

    Each split becomes a two-column frame in which the review text is stored
    under 'text' (instead of 'review') next to its 'label', and that frame is
    then converted with Dataset.from_pandas.

    returns : train_dataset, test_dataset
    """
    def _frame_for(texts, labels):
        # Start from an empty frame and attach the columns one at a time,
        # text first, then label.
        frame = pd.DataFrame()
        frame['text'] = texts
        frame['label'] = labels
        return frame

    train_frame = _frame_for(X_train, y_train)
    test_frame = _frame_for(X_test, y_test)
    return Dataset.from_pandas(train_frame), Dataset.from_pandas(test_frame)
    

In [222]:
def remove_unwanted_key(train_dataset, test_dataset):
    """
    Remove the '__index_level_0__' column from both datasets.

    A dataset that does not carry the column is returned unchanged, so the
    function can be run again on its own output (e.g. when a notebook cell is
    re-executed) without raising.

    train_dataset, test_dataset : objects exposing `column_names` and
                                  `remove_columns` (such as datasets.Dataset)
    returns                     : train_dataset, test_dataset without the column
    """
    index_column = '__index_level_0__'

    def _drop_index_column(dataset):
        # Only ask for removal when the column is present; removing a missing
        # column is an error.
        if index_column in dataset.column_names:
            return dataset.remove_columns(index_column)
        return dataset

    return _drop_index_column(train_dataset), _drop_index_column(test_dataset)
    

In [223]:
# Build the model inputs: flip the labels, hold out 30% of the rows as the
# test split, and convert both splits to Dataset objects.
X, y = feature_engg(data)
X_train, X_test, y_train, y_test = split_data(X, y, 0.3)
train_dataset, test_dataset = transform_to_dataset(X_train, X_test, y_train, y_test)
# Drop the index column that the pandas conversion leaves behind. This step was
# defined above but never invoked, so a fresh-kernel run skipped it.
train_dataset, test_dataset = remove_unwanted_key(train_dataset, test_dataset)

In [236]:
def preprocess_function(examples):
    """
    Tokenize the 'text' entries of `examples` with the module-level tokenizer,
    with truncation enabled, and return the tokenizer's output.
    """
    texts = examples['text']
    encoded = tokenizer(texts, truncation=True)
    return encoded
 
# Tokenize both splits; batched=True passes preprocess_function a batch of rows
# per call instead of a single row.
tokenized_train = train_dataset.map(preprocess_function, batched=True)
tokenized_test = test_dataset.map(preprocess_function, batched=True)

  0%|          | 0/3 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

### MODELING PARAMETERS INLINE FOR FINE-TUNING

In [229]:
# Padding collator built from the tokenizer. preprocess_function only truncates
# and does not pad, so padding is left to this collator.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [237]:
# Start from the SST-2 sentiment checkpoint (its config maps 0 -> NEGATIVE and
# 1 -> POSITIVE) with a two-label classification head.
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english", num_labels=2)

loading configuration file https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english/resolve/main/config.json from cache at /Users/anchitsaxena/.cache/huggingface/transformers/4e60bb8efad3d4b7dc9969bf204947c185166a0a3cf37ddb6f481a876a3777b5.9f8326d0b7697c7fd57366cdde57032f46bc10e37ae81cb7eb564d66d23ec96b
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased-finetuned-sst-2-english",
  "activation": "gelu",
  "architectures": [
    "DistilBertForSequenceClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "finetuning_task": "sst-2",
  "hidden_dim": 3072,
  "id2label": {
    "0": "NEGATIVE",
    "1": "POSITIVE"
  },
  "initializer_range": 0.02,
  "label2id": {
    "NEGATIVE": 0,
    "POSITIVE": 1
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "output_past": true,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,


In [238]:
def compute_metrics(eval_pred):
    """
    Compute accuracy and binary F1 for one evaluation pass.

    The scores are derived directly with numpy rather than by loading metric
    scripts through `load_metric` on every call, so evaluation needs no metric
    download and does not depend on that deprecated loader.

    eval_pred : pair (logits, labels); logits hold one score per class along
                the last axis, labels hold the reference class ids
    returns   : {"accuracy": float, "f1": float}, where F1 is computed for the
                positive class (label 1) and is 0.0 when that class appears in
                neither the predictions nor the references
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    labels = np.asarray(labels)

    accuracy = float(np.mean(predictions == labels))

    # Confusion counts for the positive class (label 1).
    true_pos = int(np.sum((predictions == 1) & (labels == 1)))
    false_pos = int(np.sum((predictions == 1) & (labels != 1)))
    false_neg = int(np.sum((predictions != 1) & (labels == 1)))

    # F1 = 2*TP / (2*TP + FP + FN); guard the empty-denominator case.
    denominator = 2 * true_pos + false_pos + false_neg
    f1 = 2 * true_pos / denominator if denominator else 0.0
    return {"accuracy": accuracy, "f1": f1}

### Modeling Configurations

In [240]:
# Directory name used as the Trainer output_dir (checkpoints are written below it).
repo_name = "finetuning-sentiment-model-2600-samples-feedback-data"
 
# Fine-tuning hyper-parameters: 2 epochs, batches of 16 per device for both
# training and evaluation, learning rate 2e-5 with weight decay 0.01.
# A checkpoint is saved at the end of every epoch and nothing is pushed to the Hub.
training_args = TrainingArguments(
   output_dir=repo_name,
   learning_rate=2e-5,
   per_device_train_batch_size=16,
   per_device_eval_batch_size=16,
   num_train_epochs=2,
   weight_decay=0.01,
   save_strategy="epoch",
   push_to_hub=False,
)
 
# Wire the model, the tokenized splits, the padding collator and the metric
# function together; the test split doubles as the evaluation set.
trainer = Trainer(
   model=model,
   args=training_args,
   train_dataset=tokenized_train,
   eval_dataset=tokenized_test,
   tokenizer=tokenizer,
   data_collator=data_collator,
   compute_metrics=compute_metrics,
)


PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [241]:
# Run fine-tuning with the arguments configured above; checkpoints land under
# the `repo_name` directory after each epoch (save_strategy="epoch").
trainer.train()

The following columns in the training set  don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text.
***** Running training *****
  Num examples = 2676
  Num Epochs = 2
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 336


Step,Training Loss


Saving model checkpoint to finetuning-sentiment-model-2600-samples-feedback-data/checkpoint-168
Configuration saved in finetuning-sentiment-model-2600-samples-feedback-data/checkpoint-168/config.json
Model weights saved in finetuning-sentiment-model-2600-samples-feedback-data/checkpoint-168/pytorch_model.bin
tokenizer config file saved in finetuning-sentiment-model-2600-samples-feedback-data/checkpoint-168/tokenizer_config.json
Special tokens file saved in finetuning-sentiment-model-2600-samples-feedback-data/checkpoint-168/special_tokens_map.json
Saving model checkpoint to finetuning-sentiment-model-2600-samples-feedback-data/checkpoint-336
Configuration saved in finetuning-sentiment-model-2600-samples-feedback-data/checkpoint-336/config.json
Model weights saved in finetuning-sentiment-model-2600-samples-feedback-data/checkpoint-336/pytorch_model.bin
tokenizer config file saved in finetuning-sentiment-model-2600-samples-feedback-data/checkpoint-336/tokenizer_config.json
Special tokens

TrainOutput(global_step=336, training_loss=0.09435498146783738, metrics={'train_runtime': 8751.1256, 'train_samples_per_second': 0.612, 'train_steps_per_second': 0.038, 'total_flos': 651187998860304.0, 'train_loss': 0.09435498146783738, 'epoch': 2.0})

In [242]:
# Score the fine-tuned model on the dataset passed as eval_dataset (the test
# split); compute_metrics adds accuracy and F1 to the reported values.
trainer.evaluate()

The following columns in the evaluation set  don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text.
***** Running Evaluation *****
  Num examples = 1148
  Batch size = 16


{'eval_loss': 0.08026587963104248,
 'eval_accuracy': 0.980836236933798,
 'eval_f1': 0.9902395740905058,
 'eval_runtime': 340.9164,
 'eval_samples_per_second': 3.367,
 'eval_steps_per_second': 0.211,
 'epoch': 2.0}

In [244]:
# Persist the tokenizer files to a local directory; the model is saved
# separately in the next cell.
tokenizer.save_pretrained('./hf_tokenizer_fine_tuned_v1_feedback')

tokenizer config file saved in ./hf_tokenizer_fine_tuned_v1_feedback/tokenizer_config.json
Special tokens file saved in ./hf_tokenizer_fine_tuned_v1_feedback/special_tokens_map.json


('./hf_tokenizer_fine_tuned_v1_feedback/tokenizer_config.json',
 './hf_tokenizer_fine_tuned_v1_feedback/special_tokens_map.json',
 './hf_tokenizer_fine_tuned_v1_feedback/vocab.txt',
 './hf_tokenizer_fine_tuned_v1_feedback/added_tokens.json',
 './hf_tokenizer_fine_tuned_v1_feedback/tokenizer.json')

In [245]:
# Persist the model's config and weights; `model` is the same object that was
# handed to the Trainer above.
model.save_pretrained('./hf_fine_tuned_v1_feedback/')

Configuration saved in ./hf_fine_tuned_v1_feedback/config.json
Model weights saved in ./hf_fine_tuned_v1_feedback/pytorch_model.bin
