<a href="https://colab.research.google.com/github/bassel-94/BERT/blob/main/code/functions_postprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Post-processing functions for CamemBERT

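* `get_numpies`: computes the true and predicted label matrices as NumPy arrays (dimensions are $n \times 18$)
* `get_predictions`: label predictions using a probability threshold
* `get_n_labels_predictions`: ROC AUC score and classification report per number of labels
* `undummify`: turns dummified label columns back into a single multi-label column
* `get_sentiment_prediction`: positive/negative sentiment prediction for each comment
* `get_random_split`: random train/validation split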


In [None]:
def get_predictions(df, predictor, prob = 0.5):
  '''
  function to add a column called Labels_predicted to the dataframe df
  df must have a "comments" column. The column is added in place, so the
  data frame passed by the caller is modified.
  
  inputs:
  *******
  df : data frame that contains column "comments"
  predictor : the predictor class of fast_bert. predictor.predict(comment)
  must return an iterable whose items hold the label at index 0 and its
  probability at index 1
  prob : probability threshold to display a label (default 0.5). A label is
  kept only when its probability is strictly greater than prob

  output:
  *******
  original data frame with an extra column called Labels_predicted. Each
  value is the kept labels of the row joined with ", " (empty string when
  no label passes the threshold)
  '''

  # build one joined label string per comment
  labels_per_row = []
  for comment in df["comments"]:

    # get predictions for the comment of the current row
    pred = predictor.predict(comment)

    # keep the labels whose probability is above the threshold
    kept_labels = [item[0] for item in pred if item[1] > prob]

    # join the kept labels and store them for this row
    labels_per_row.append(", ".join(kept_labels))

  # add predicted labels to a column in the original data frame
  df["Labels_predicted"] = labels_per_row

  return df

In [None]:
# function that aligns the predicted labels with the true labels and returns both as numpies.
def get_numpies(df_true, df_pred):
  '''
  function that takes two data frames as input and returns the true and the
  predicted labels as dummified numpy arrays with identical columns.
  The predicted dummies are re-indexed on the true label columns, so that
  labels that were never predicted become columns of zeros and the column
  order of y_pred always matches the one of y_true.

  inputs:
  *******
  df_true : dataframe of the true labels. Either already dummified (every
  column after the first one is a label) or with a "Labels" column of
  ", " joined strings and at most 3 columns, in which case it is dummified here.
  df_pred : data frame of predicted labels in the form of joined strings. The
  predicted labels should be in the last column of the dataframe.
  NOTE: rows are not re-ordered; both data frames are assumed to list the
  same reviews in the same order.

  output:
  *******
  y_true, y_pred the true and the predicted labels in the form of dummified 
  numpy arrays. their dimensions are length of the dataframe x number of true labels.
  '''
  
  # dummify the true labels when they are still joined strings: the frame counts
  # as not dummified when it has a "Labels" column and at most 3 columns
  if "Labels" in df_true.columns and len(df_true.columns) <= 3:
    df_true_dummies = df_true["Labels"].str.get_dummies(sep = ", ")
    df_true = df_true.join(df_true_dummies).drop("Labels", axis = 1)

  # every column after the first one is treated as a true label
  df_true_labels = df_true.iloc[:, 1:]
  y_true = df_true_labels.to_numpy()
  list_true = df_true_labels.columns.tolist()

  # dummify the predicted labels found in the last column of df_pred
  df_pred_dummies = df_pred[df_pred.columns[-1]].str.get_dummies(sep=', ')
  list_pred = df_pred_dummies.columns.tolist()

  # true labels that were never predicted, and predicted labels that are unknown
  left_over = [x for x in list_true if x not in list_pred]
  unexpected = [x for x in list_pred if x not in list_true]

  # predicted labels without a true column cannot be compared, so they are dropped
  if unexpected:
    print("The following predicted labels are not among the true labels and are ignored:", ", ".join(unexpected))

  # put the predicted dummies on exactly the true label columns, in the same
  # order, filling the labels that were never predicted with zeros
  y_pred = df_pred_dummies.reindex(columns=list_true, fill_value=0).to_numpy()

  # if the list if not empty (i.e. some labels are not detected)
  if left_over:

    # print the left over labels that have not been detected.
    print("The list is not empty. The following labels have not been detected:", ", ".join(left_over))

    # make sure predicted and true labels have the same dimensions
    print("Dimension of the true dummies:", y_true.shape)
    print("Dimension of the pred dummies after adjustment:", y_pred.shape)

  # if the list is empty (i.e. all labels have been detected)
  else :

    # print message saying that all labels have been detected
    print("No need for adjustment, all labels have been detected!")
      
  return y_true, y_pred

In [None]:
 # evaluate the predictions separately for each selected number of true labels
 def get_n_labels_predictions(df_true, n_labels_list, predictor, prob=0.4):
   '''
   function that predicts the labels of the reviews that have a given number
   of true labels and returns the roc auc score and the classification report
   for each selected number of labels.
   It relies on get_predictions and get_numpies, and on roc_auc_score and
   classification_report being available in the calling scope (presumably
   the ones of sklearn.metrics - to be confirmed).

   inputs:
   *******
   df_true : dummified data frame of the true labels. It must contain a
   "comments" column. All columns after the first one are summed per row to
   count the labels (the first column is presumably "comments" - to be confirmed)
   n_labels_list : iterable of numbers of labels to evaluate
   predictor : predictor passed on to get_predictions
   prob : probability threshold passed on to get_predictions (default 0.4)

   outputs:
   ********
   list that contains one [roc auc score, classification report] entry per
   element of n_labels_list, in the same order

   '''
   # add column called num_labels to get sum of number of labels per row
   num_labels = (df_true.iloc[:, 1:]
                 .sum(axis=1)
                 .to_frame()
                 .rename(columns={0: "num_labels"}))
   df_true_sum = num_labels.join(df_true)
   
   # loop over the list of selected number of labels
   results = []
   for i in n_labels_list:
     
     # get new data frame that contains n labels (num_labels column is dropped again)
     df_true_sum_n = df_true_sum[df_true_sum["num_labels"]==i].iloc[:, 1:]

     # get the names of all label columns (every column after the first one)
     labels = df_true_sum_n.iloc[:, 1:].columns.tolist()

     # get predictions for the n labeled reviews. get_predictions adds a column
     # to the frame it receives, so it is given an explicit copy of the slice
     df_comments_n = df_true_sum_n[["comments"]].copy()
     df_pred_sum_n = get_predictions(df_comments_n, predictor=predictor, prob=prob)

     # get numpy arrays for the predicted and true labels
     print("for reviews that have", i, "labels:")
     y_true_sum_n, y_pred_sum_n = get_numpies(df_true = df_true_sum_n, df_pred = df_pred_sum_n)

     # print roc score
     roc_n = roc_auc_score(y_true_sum_n, y_pred_sum_n, average = "micro")
     print("ROC_AUC score :", roc_n, "\n")

     # get classification report of all labels
     cl_report = classification_report(y_true_sum_n, y_pred_sum_n, target_names = labels)

     # save all results in a list for later use
     results.append([roc_n, cl_report])
   
   return results

In [None]:
def get_count(df, display_plot = False):
  '''
  function to count the occurrences of each label of a dummified data
  frame and, optionally, to display the counts as a bar plot.

  inputs:
  *******
  df : data frame whose first column is skipped and whose remaining columns
  are summed column by column
  display_plot : whether to draw the bar plot of the counts (default False)

  output:
  *******
  data frame with the columns "Label" and "Count", sorted by decreasing
  count and with a fresh index
  '''

  # sum every column except the first one
  label_totals = df.iloc[:, 1:].sum(axis=0)

  # turn the totals into a two column data frame sorted by decreasing count
  df_stats = label_totals.to_frame().reset_index()
  df_stats = df_stats.rename(columns={"index": "Label", 0: "Count"})
  df_stats = df_stats.sort_values("Count", ascending = False)
  df_stats = df_stats.reset_index(drop=True)

  # nothing to draw, return the counts directly
  if not display_plot:
    return df_stats

  # plot packages are only imported when a plot is requested
  import matplotlib.pyplot as plt
  import seaborn as sns

  # horizontal bar plot, one bar per label
  plt.figure(figsize=(12, 7))
  ax = sns.barplot(x="Count", y="Label", data=df_stats, color = "steelblue")

  # write the rounded count slightly to the right of the end of each bar
  x_offset = df_stats.Count.max()/70
  for bar in ax.patches:
    bar_width = bar.get_width()
    plt.text(x_offset + bar_width, bar.get_y()+0.55*bar.get_height(), round(bar_width), ha='center', va='center')

  # axis names, title and tick sizes
  plt.xlabel("Count")
  plt.ylabel("Labels")
  plt.title("Bar plot of label count. Total num of reviews is " +str(len(df)))
  plt.xticks(fontsize=12)
  plt.yticks(fontsize=12)
  plt.tight_layout()
  plt.show()

  return df_stats

In [None]:
# function to randomly split a data frame into a validation and a train set
def get_random_split(df, test_size, seed=42):
  '''
  function that samples a fraction of the rows of a data frame without
  replacement as validation set and keeps the remaining rows as train set
  
  inputs:
  *******
  df : dataframe to split
  test_size : the proportion of rows to put in the validation set
  seed : random seed for reproducibility (default 42)

  output:
  *******
  val_set, train_set : the sampled rows and the rows left over, in that order
  '''
  
  # sample the validation rows, then remove them by index to get the train rows
  val_set = df.sample(frac = test_size, replace=False, random_state=seed)
  train_set = df.drop(val_set.index)
  
  # report the number of rows and of label columns (all but the first) of each set
  for subset, set_name in ((train_set, "train"), (val_set, "validation")):
    print("there are", len(subset), "rows in the", set_name, "set with", len(subset.columns[1:]), "labels")
  
  return val_set, train_set

In [None]:
# function to reverse the dummified categorical variables in a dataframe 
def undummify(df, sep = ", "):
  '''
  function that transforms the dummified data frame back into a multilabel
  column with the same separator
  
  inputs :
  ********
  df : data frame with a "comments" column. Every other column is considered
  to be a dummified (0/1) categorical variable.
  sep : The separator to use when concatenating the Labels
  
  output:
  *******
  df_melted : dataframe with the "comments" column and a "Labels" column that
  holds the names of the dummy columns set in each row, joined with sep
  (empty string when no dummy is set). The seperator of the categorical
  variables is by default ', '
  '''
  
  # get data frame that contains only the dummies
  df_dummies = df.loc[:, df.columns != 'comments']
  
  # each 0/1 entry repeats "<label><sep>" zero or one time, so the dot product
  # concatenates the labels of a row, each one followed by the separator
  labels = df_dummies.dot(df_dummies.columns + sep)

  # remove exactly one trailing separator. str.rstrip(sep) is not used because
  # it strips any trailing characters contained in sep and can eat into a label
  if sep:
    labels = labels.str[:-len(sep)]

  # name the melted column
  df_labels = labels.to_frame().rename(columns={0: "Labels"})
  
  # join the comments column to the newly created Labels data frame
  df_melted = df.loc[:, df.columns == 'comments'].join(df_labels)
  
  return df_melted

In [None]:
# function to add the predicted sentiment and its score to a data frame
def get_sentiment_prediction(df, model, tokenizer):
  '''
  function that runs a 'sentiment-analysis' pipeline on every comment of a
  data frame and stores the predicted label and score in two new columns.
  The columns are added in place to the data frame passed by the caller.
  
  inputs:
  *******
  df: a data frame with a column called "comments"
  model: the camembert model used to make the prediction
  tokenizer: the tokenizer used for the specified model
  
  output:
  *******
  df: the same dataframe as the input with the extra columns
  "predicted_sentiment" and "predicted_probability"
  '''
  
  # build the pipeline once for the given model and tokenizer
  nlp = pipeline('sentiment-analysis', model=model, tokenizer=tokenizer)
  
  # the first element returned for a comment is a dictionary that holds
  # the keys "label" and "score"
  results = [nlp(row["comments"])[0] for _, row in df.iterrows()]
  
  # split the dictionaries into one column per key
  df["predicted_sentiment"] = [result["label"] for result in results]
  df["predicted_probability"] = [result["score"] for result in results]
  
  return df

In [None]:
def get_shiny_predictions(df, predictor, prob = 0.5):
  '''
  function to add a column called Labels_predicted to the dataframe df.
  It includes the probability of each label in each row of the dataframe.
  df must have a "comments" column! The column is added in place, so the
  data frame passed by the caller is modified.
  
  inputs:
  *******
  df : data frame that contains column "comments"
  predictor : the predictor class of fast_bert. predictor.predict(comment)
  must return an iterable whose items hold the probability at index 1
  prob : probability threshold to display a label (default 0.5). An item is
  kept when its probability is greater than or equal to prob

  output:
  *******
  original data frame with an extra column called Labels_predicted. Each
  value is the list of the kept prediction items of the row, unchanged
  (empty list when no item reaches the threshold)
  '''

  # build one list of kept prediction items per comment
  kept_per_row = []
  for comment in df["comments"]:

    # get predictions for the comment of the current row
    pred = predictor.predict(comment)

    # keep the whole item (label and probability) when it reaches the threshold
    kept_per_row.append([item for item in pred if item[1] >= prob])

  # add predicted labels to a column in the original data frame
  df["Labels_predicted"] = kept_per_row

  return df