
## Installations



In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.23.1-py3-none-any.whl (5.3 MB)
[K     |████████████████████████████████| 5.3 MB 2.0 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 45.5 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.10.1-py3-none-any.whl (163 kB)
[K     |████████████████████████████████| 163 kB 45.3 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.10.1 tokenizers-0.13.1 transformers-4.23.1


## Imports

In [None]:
import pandas as pd
import numpy as np
import bs4 as bs
import torch
import requests
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

from transformers import BertTokenizer, BertModel

# Utility functions

In [None]:
import time
import datetime

def format_time(elapsed):
    """
    Render a duration given in seconds as an ``h:mm:ss`` string.

    The value is rounded to the nearest whole second first; the text itself
    is whatever ``str(datetime.timedelta)`` produces (e.g. ``'0:00:05'``).
    """
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)


## Get data from Task 2

In [None]:
# read scraped data
separator = ';' # change here, it depends on the file loaded

file_path = '/content/businessballs_data_disagg.csv' # change path here 
# `error_bad_lines=False` was deprecated in pandas 1.3 and removed in 2.0;
# `on_bad_lines='skip'` is its replacement and likewise drops malformed rows.
data = pd.read_csv(file_path, engine='python', encoding='utf-8',
                   on_bad_lines='skip', sep=separator)
data.head(10)

FileNotFoundError: ignored

# 1) Zero Shot classification
Performing zero shot classification on paragraphs to check the **relevancy** of their content with respect to the associated soft skill (from data scraped on task 1)

In [None]:
# TODO : work on scraped_data and generate a data frame with the same name for the next step

# Read the file
import pandas as pd
import torch

df_wiki = pd.read_csv('/content/scraped_data_disagg.csv') #For testing purposes 
paragraphs = df_wiki['paragraph'].values

In [None]:
# We'll use a subset to test out the computational time of model/API
paragraphs_subset = paragraphs[0:100]

In [None]:
from transformers import pipeline
import numpy as np
import heapq

# Zero shot labels
skill_labels = ['Teamwork', 'Problem-solving', 'Communication', 'Self-Awareness', 'Time management',
              'Critical thinking', 'Decision-making', 'Organizational', 'Stress management',
              'Adaptability', 'Conflict management', 'Leadership', 'Creativity',
              'Resourcefulness', 'Persuasion', 'Openness to criticism', 'Confidence',
              'Mutual respect', 'Empathy', 'Active listening', 'Constructive feedback', 'Collaboration',
              'Negotiation', 'Irrelevant']

In [None]:
# Testing using a loaded model from the Huggingface pipeline
classifier = pipeline ("zero-shot-classification", model = "facebook/bart-large-mnli")

In [None]:
# Label every paragraph with its highest-scoring zero-shot class and time the run
t0 = time.time()

for row, text in enumerate(paragraphs):
  prediction = classifier(text, skill_labels)
  ranked_scores = prediction['scores']
  # Position of the best score; the label list is indexed the same way.
  best = ranked_scores.index(max(ranked_scores))

  df_wiki.loc[row, 'Zero-Shot Label'] = prediction['labels'][best]

model_time = format_time(time.time() - t0)

print("")
print("  Performing ZSC using an instantiated model took: {:}".format(model_time))

In [None]:
# Testing using the API
import os

import requests

API_URL = "https://api-inference.huggingface.co/models/facebook/bart-large-mnli"
# Access tokens must never be committed to the notebook: read the token from
# the HF_API_TOKEN environment variable instead (set it before running this
# cell). The token that used to be hard-coded here should be revoked.
headers = {"Authorization": "Bearer " + os.environ["HF_API_TOKEN"]}

def query(payload):
    """POST ``payload`` as JSON to ``API_URL`` and return the decoded JSON reply."""
    # A timeout keeps the cell from hanging forever if the endpoint stalls.
    response = requests.post(API_URL, headers=headers, json=payload, timeout=120)
    return response.json()

output = query({
    "inputs": "Hi, I recently bought a device from your company but it is not working as advertised and I would like to get reimbursed!",
    "parameters": {"candidate_labels": ["refund", "legal", "faq"]},
})

In [None]:
# Sneak peek at the generated labels
print(df_wiki[['Soft Skill Name', 'Zero-Shot Label']])

In [None]:
df_wiki.to_csv('scraped_data_zeroshot.csv')

In [None]:
  # In case we want to perform multi-class zero-shot classification
  
  # result = classifier(par, skill_labels, multi_label=True)['scores']
  # index = heapq.nlargest(3, range(len(result)), key=result.__getitem__)
  # print("labels : ", heapq.nlargest(3, result))
  # for j in index:
  #   zs_label = zs_label +', '+ skill_labels[j]

  # df_wiki.loc[i,'Zero-Shot Label'] = zs_label

## 2) Vectorization of the Paragraphs

In [None]:
paragraphs = data['paragraph'].values
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert = BertModel.from_pretrained('bert-base-uncased', output_hidden_states = True)

In [None]:
# Longest paragraph, measured in tokens with `[CLS]` and `[SEP]` included
token_counts = [
    len(tokenizer.encode(text, add_special_tokens=True))
    for text in paragraphs
]
max_len = max(token_counts, default=0)

print('Max sentence length: ', max_len)

In [None]:
bert.eval()

paragraph_vectors = []
for par in paragraphs:
    # Each paragraph is encoded on its own (batch of one), so no padding is
    # needed. Padding every input to `max_len` made the average below include
    # the vectors of the padding tokens, and without truncation any paragraph
    # longer than the model's position limit made BERT fail.
    encoded_dict = tokenizer.encode_plus(
                        par,                      # Sentence to encode.
                        add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                        truncation = True,         # Cut inputs that are too long.
                        max_length = tokenizer.model_max_length,
                        return_attention_mask = True,   # Construct attn. masks.
                        return_tensors = 'pt',     # Return pytorch tensors.
                   )

    input_ids = encoded_dict['input_ids']
    # All ones here, since nothing is padded; passed for explicitness.
    attention_masks = encoded_dict['attention_mask']

    # Encode the sentence using BERT
    with torch.no_grad():
      outputs = bert(input_ids, attention_mask=attention_masks)
      # Tuple of per-layer hidden states (available because the model was
      # loaded with output_hidden_states=True).
      hidden_states = outputs.hidden_states

    # A simple approach for generating the whole paragraph embedding is to
    # average the second-to-last hidden layer over the tokens of the paragraph.
    token_vecs = hidden_states[-2][0]
    paragraph_embedding = torch.mean(token_vecs, dim=0)

    paragraph_vectors.append(paragraph_embedding.tolist())

In [None]:
data['Vector'] = paragraph_vectors
data.to_csv('businessballsdata_withVectors.csv')

In [None]:
data.head()

# 3) Clustering

### a) KMeans

In [None]:
skill_labels = ['Teamwork', 'Problem-solving', 'Communication', 'Self-Awareness', 'Time management',
              'Critical thinking', 'Decision-making', 'Organizational', 'Stress management',
              'Adaptability', 'Conflict management', 'Leadership', 'Creativity',
              'Resourcefulness', 'Persuasion', 'Openness to criticism', 'Confidence',
              'Mutual respect', 'Empathy', 'Active listening', 'Constructive feedback', 'Collaboration',
              'Negotiation', 'Irrelevant']

In [None]:
# Experimenting with different values for k (based on the number of soft skills we have)
Sum_of_squared_distances = []
K = range(2,len(skill_labels))
for k in K:
    # KMeans initialisation is random; a fixed seed makes the elbow curve
    # identical on every run.
    km = KMeans(n_clusters=k, max_iter=200, n_init=10, random_state=42)
    km = km.fit(paragraph_vectors)
    Sum_of_squared_distances.append(km.inertia_)

In [None]:
# Inertia against k; the bend of the curve suggests the number of clusters.
fig, ax = plt.subplots()
ax.plot(K, Sum_of_squared_distances, 'bx-')
ax.set_xlabel('k')
ax.set_ylabel('Sum_of_squared_distances')
ax.set_title('Elbow Method For Optimal k')
plt.show()

In [None]:
true_k = 6
# Fixed seed: without it the cluster IDs (and hence the saved 'Cluster ID'
# column and the plot colours) change every time the cell is run.
model = KMeans(n_clusters=true_k, init='k-means++', max_iter=200, n_init=10,
               random_state=42)
model.fit(paragraph_vectors)
labels=model.labels_

# Pair every paragraph with the cluster it was assigned to.
par_cl=pd.DataFrame(list(zip(paragraphs,labels)),columns=['paragraph','cluster'])
print(par_cl.sort_values(by=['cluster']))

In [None]:
data['Cluster ID'] = par_cl['cluster']
data.to_csv('wikidata_withClusters.csv')
data.head(10)

In [None]:
from sklearn.decomposition import PCA
import numpy as np

# Reduce the paragraph embeddings to two principal components for plotting.
pca = PCA(n_components=2)
scatter_plot_points = pca.fit_transform(np.array(paragraph_vectors))

# One colour per cluster ID; the palette holds six entries.
colors = ["r", "b", "c", "y", "g", "m"]
point_colors = [colors[cluster_id] for cluster_id in labels]

x_axis = scatter_plot_points[:, 0].tolist()
y_axis = scatter_plot_points[:, 1].tolist()

fig, ax = plt.subplots(figsize=(20,10))
ax.scatter(x_axis, y_axis, c=point_colors)

# Tag every point with the soft skill of its paragraph.
for position, skill_name in enumerate(data['Soft Skill Name']):
    ax.annotate(skill_name, (x_axis[position], y_axis[position]))

To illustrate this further, here are the clusters that can be seen in the plot above:

- Cluster 1 (Red) : (5 paragraphs referring to **collaboration**, 3 referring to **self awareness**, 1 referring to **creativity**, 1 referring to **communication**)
- Cluster 2 (Yellow) : (3 paragraphs referring to **creativity**, 2 referring to **communication**, 1 referring to **self awareness**)
- Cluster 3 (Cyan) : (4 paragraphs referring to **communication**, 1 referring to **creativity**)
- Cluster 4 (Purple) : 2 paragraphs referring to **self-awareness**
- Cluster 5 (Green) : 2 paragraphs referring to **self-awareness** 
(Notice that the paragraphs in clusters 4 and 5 all talk about self-awareness, but they have been clustered separately according to their more detailed content.)
- Cluster 6 (Blue) : (1 paragraph referring to **creativity**, 1 referring to **communication**)

# 3) Content summarization
Summarize each paragraph to facilitate assigning human-readable labels to them

**NOTE : the output of this task will ONLY be used for manual labeling, and will not be provided to further tasks.**

In [None]:
data = pd.read_csv('/content/scraped_data_clustering.csv') # comment this line out to reuse `data` from the previous step (all earlier steps must have been run first!)

In [None]:
paragraphs = list(data['paragraph'].values) # comment this line out to reuse the `paragraphs` defined in previous steps

In [None]:
# 'cluser' was a typo. NOTE(review): this assumes the CSV keeps the 'cluster'
# column name produced in the clustering step -- confirm against the file.
clusters = list(data['cluster'].unique())
clusters

In [None]:
import os

API_URL = "https://api-inference.huggingface.co/models/facebook/bart-large-cnn"
# Access tokens must never be committed to the notebook: read the token from
# the HF_API_TOKEN environment variable instead (set it before running this
# cell). The token that used to be hard-coded here should be revoked.
headers = {"Authorization": "Bearer " + os.environ["HF_API_TOKEN"]}

# Better to use the API
def query(payload):
    """POST ``payload`` as JSON to ``API_URL`` and return the decoded JSON reply."""
    # A timeout keeps the cell from hanging forever if the endpoint stalls.
    response = requests.post(API_URL, headers=headers, json=payload, timeout=120)
    return response.json()

In [None]:
def summarize_paragraphs(pars, summarize_fn=None):
  """Recursively condense a list of paragraphs into a single summary.

  A single paragraph is summarized directly. A longer list is split in two
  halves, each half is summarized recursively, and the two partial summaries
  (joined by a newline) are summarized once more.

  Parameters
  ----------
  pars : list of str
      Paragraphs to summarize.
  summarize_fn : callable, optional
      Maps one text to its summary. When omitted, the text is sent through
      ``query`` as ``{'inputs': text}`` and the ``'summary_text'`` field of
      the first result is used.

  Returns
  -------
  str
      The combined summary, or ``''`` when ``pars`` is empty.
  """
  if summarize_fn is None:
    def summarize_fn(text):
      # Same payload shape as the per-paragraph summarization loop.
      return query({'inputs': text})[0]['summary_text']

  n = len(pars)
  if n < 1:
    return ''
  if n == 1:
    return summarize_fn(pars[0])

  # Recurse on the halves of `pars` itself. The earlier version sliced the
  # notebook-level `paragraphs` here, so it ignored its own argument.
  half = n // 2
  summary_1 = summarize_paragraphs(pars[:half], summarize_fn)
  summary_2 = summarize_paragraphs(pars[half:], summarize_fn)
  return summarize_fn(summary_1 + '\n' + summary_2)


In [None]:
# test the function
summarize_paragraphs(paragraphs[0:40])

In [None]:
# Summarize paragraphs per cluster until reaching a size manageable enough for manual labeling
summaries = []

# The code currently summarizes every paragraph on its own.
# To be modified in order to group them by cluster ID.
# (The former `paragraphs[0:1]` slice produced a single summary, which cannot
# be assigned as a column of `data` in the next cell.)
for par in paragraphs:
  payload = {
      'inputs': par
  }  # named `payload` so the builtin `input` is not shadowed
  summaries.append(query(payload)[0]['summary_text'])

In [None]:
data['Paragraph summary'] = summaries

In [None]:
# The text column is named 'paragraph' (lower case) everywhere else in this notebook.
data[['paragraph', 'Paragraph summary']].head(10)

In [None]:
data.to_csv('wikidata_withSummaries')

# 4) Assign labels for clusters
Manually assign the human-readable labels to clusters (multiple paragraphs are assigned to a certain cluster, the objective is to label this cluster as a whole)

# 5) Zero-shot Classification
Perform another round of zero-shot classification on the manually annotated dataset from step n° 2 using the human-readable labels

In [None]:
!pip install transformers

In [None]:
from transformers import pipeline

In [None]:
classifier = pipeline("zero-shot-classification",
                      model="oigele/Fb_improved_zeroshot")


In [None]:
################### dummy code to check if multi-labels work##################
sequence_to_classify = "natural language processing"
fine_labels = []  # to be filled with the human-readable labels from step 4
# `candidate_labels` was never defined, so this cell raised a NameError.
# Fall back to the soft-skill labels while `fine_labels` is still empty.
candidate_labels = fine_labels or skill_labels
classifier(sequence_to_classify, candidate_labels, multi_label=True)

# 6) Content Aggregation
Aggregate (assemble) paragraphs from the same cluster 

In [None]:
########## The code below checks whether the algorithm works, using dummy data for now. #############
import pandas as pd

In [None]:
# Toy frame: every row pairs a list of labels with the content they tag.
columns = ["labels", "agg_sum_content"]
label_sets = [["1"], ["1", "2"], ["3", "4"], ["4"], ["1", "5", "2"], ["6"]]
contents = [
    "Some content for label 1",
    "Some content for label 1 and 2",
    "Some content for label 3 and 4",
    "Some content for label 4",
    "Some content for label 1, 2 and 5",
    "Some content for label 6",
]
rows = [[label_set, content] for label_set, content in zip(label_sets, contents)]
dummy_df = pd.DataFrame(rows, columns = columns)
dummy_df

In [None]:
def aggregate_content_based_on_labels(df):
  """Concatenate the content of all rows that share a label.

  Parameters
  ----------
  df : pandas.DataFrame
      Needs a ``labels`` column (an iterable of labels per row) and an
      ``agg_sum_content`` column (the text of that row).

  Returns
  -------
  pandas.DataFrame
      One row per label, indexed by label, with a single column
      ``agg_content`` holding that label's texts joined by ``'. '`` in row
      order.
  """
  contents_by_label = {}
  for row_labels, content in zip(df['labels'], df['agg_sum_content']):
    for label in row_labels:
      contents_by_label.setdefault(label, []).append(content)

  # Join once per label instead of re-joining after every append.
  joined = {label: ['. '.join(parts)]
            for label, parts in contents_by_label.items()}
  return pd.DataFrame.from_dict(joined, orient='index',
                                columns=['agg_content'])
new_df = aggregate_content_based_on_labels(dummy_df)
new_df

# 7) Summarization

In [None]:
from transformers import BartTokenizer, BartForConditionalGeneration


In [None]:
#summarizer = pipeline('summarization', model="facebook/bart-large-cnn")
model = BartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn")
tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn")

In [None]:
def abstractive_summarize_content(content):
    """Generate a short abstractive summary of ``content`` with the loaded BART model.

    The text is tokenized with the notebook-level ``tokenizer``, summarized by
    ``model.generate`` (2 beams, at most 30 generated tokens) and decoded back
    to a string.
    """
    # `max_length` only takes effect together with `truncation=True`; without
    # it, over-long inputs reach the model untruncated.
    inputs = tokenizer(content, max_length=1024, truncation=True, return_tensors="pt")
    summary_ids = model.generate(
        inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        num_beams=2,
        min_length=0,
        max_length=30,
    )
    return tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]

new_df['summary'] = new_df['agg_content'].apply(abstractive_summarize_content)  

In [None]:
type(new_df["agg_content"])

In [None]:
new_df

Summarization -  extractive model with frequency driven approach

In [None]:
#sentence_list = nltk.sent_tokenize(article_text)

import nltk
import pandas as pd
import heapq
import re
nltk.download('stopwords')
nltk.download('punkt')


max_words_per_sentance=15
max_sentances_in_summary=2


def extractive_summarize_content(article_text):
  """Build an extractive summary from the highest-scoring short sentences.

  Every non-stopword is weighted by its frequency in the text, normalized by
  the most frequent word. A sentence scores the sum of the weights of its
  words; sentences with ``max_words_per_sentance`` words or more are skipped.
  The ``max_sentances_in_summary`` best sentences are returned joined by a
  space, best first. Returns ``''`` when the text has no countable words.
  """
  sentence_list = nltk.sent_tokenize(article_text)

  # Lower-case before counting. Sentences are lower-cased when scored and the
  # stopword list is lower case, so case-sensitive counts meant capitalised
  # words never contributed and capitalised stopwords were not filtered.
  formatted_article_text = re.sub('[^a-zA-Z]', ' ', article_text).lower()
  formatted_article_text = re.sub(r'\s+', ' ', formatted_article_text)

  stopwords = set(nltk.corpus.stopwords.words('english'))
  word_frequencies = {}
  for word in nltk.word_tokenize(formatted_article_text):
    if word not in stopwords:
      word_frequencies[word] = word_frequencies.get(word, 0) + 1

  # Nothing to weigh (empty text or stopwords only): max() below would fail.
  if not word_frequencies:
    return ''

  maximum_frequency = max(word_frequencies.values())
  for word in word_frequencies:
    word_frequencies[word] = word_frequencies[word] / maximum_frequency

  sentence_scores = {}
  for sent in sentence_list:
    # The length test depends on the sentence only, so do it once per sentence.
    if len(sent.split(' ')) >= max_words_per_sentance:
      continue
    for word in nltk.word_tokenize(sent.lower()):
      if word in word_frequencies:
        sentence_scores[sent] = sentence_scores.get(sent, 0) + word_frequencies[word]

  summary_sentences = heapq.nlargest(max_sentances_in_summary, sentence_scores, key=sentence_scores.get)
  return ' '.join(summary_sentences)
  


The function below returns the actual summarization function selected by a string input; this dispatch could be improved.

In [None]:
def summary(type):
  """Pick a summarization function by name.

  Returns ``extractive_summarize_content`` when the substring "extractive"
  occurs in ``type``; any other value yields ``abstractive_summarize_content``.
  """
  wants_extractive = "extractive" in type
  return extractive_summarize_content if wants_extractive else abstractive_summarize_content




The below part is to run the extractive summarization function and display the output. Could be removed later

In [None]:
data = ["India is a great country. India's capital is New Delhi. It's prime minister is Narendra Modi. He is from Gujarat.","Virat Kohli is one of the most succesful cricketers of all time. He has scored most hundereds among current players. Despite a bad patch he is still scoring runs and broke his century drought by scoreing a 100 against Afghanistan.","I am a software engineer with a unicorn startup. Before that I was in Consultancy and Analytics. I didnt go for an MBA as I was not ready for a commitment. So I am upskilling and looking for other oppurtunities."]
  

df = pd.DataFrame(data, columns=['para'])

df['summary'] = df['para'].apply(summary("extractive")) 

df


# 8) Ranking of Summaries

# 9) Saving the data to a database