In [33]:
import pandas as pd
from transformers import GPT2Tokenizer, GPT2ForSequenceClassification
import torch


In [34]:
data  = pd.read_csv('../data/slack_data.csv')

In [35]:
data.columns

Index(['msg_id', 'text', 'cleaned_text', 'user_id', 'mentions', 'reactions',
       'replies', 'ts', 'channel_id'],
      dtype='object')

In [36]:
data.head()

Unnamed: 0,msg_id,text,cleaned_text,user_id,mentions,reactions,replies,ts,channel_id
0,16f68d4e-0ceb-448a-b660-d5ef2eb05305,*HOTSEAT ANNOUNCEMENT*,hotseat announc,U03V1AM5TFA,[],[],[],1662621000.0,C03T0APHX63
1,7c641275-2e52-4074-9894-744f049d5377,*<!here>* Good morning Community! We are very ...,good morn commun happi excit announc today hot...,U03V1AM5TFA,['U03U1GHT39V'],"[{'name': 'fire', 'users': ['U03U9FWPNCE'], 'c...",[],1662621000.0,C03T0APHX63
2,245ecc4d-2c1b-4bee-b280-a1fd5ab7fee3,*<!here> Community Building Session REMINDER!*...,commun build session remindertimerclock plea n...,U03V1AM5TFA,[],"[{'name': 'heart_eyes', 'users': ['U03UG4Q7V42...",[],1662638000.0,C03T0APHX63
3,fe80aff2-20f2-42ad-94a8-8b48ac63083f,Sweet music on Google meet now\n:point_right: ...,sweet music googl meet pointright meetgoogleco...,U03V1AM5TFA,[],[],[],1662638000.0,C03T0APHX63
4,2be29318-9c50-4b56-ae0b-ae8bcd4c92a3,Hellooo Helllo again my people the lovely com...,hellooo helllo peopl love commun guy ……it cb t...,U03V1AM5TFA,[],[],[],1662638000.0,C03T0APHX63


In [38]:
data.describe()

Unnamed: 0,ts
count,18944.0
mean,1664502000.0
std,2081142.0
min,1661014000.0
25%,1662553000.0
50%,1664540000.0
75%,1666002000.0
max,1674215000.0


In [51]:
import pandas as pd
from transformers import AlbertTokenizer, AlbertForSequenceClassification
import torch
from torch.nn.functional import softmax

# Load pre-trained ALBERT model and tokenizer
model_name = "albert-base-v2"
tokenizer = AlbertTokenizer.from_pretrained(model_name)
model = AlbertForSequenceClassification.from_pretrained(model_name)

subset_df = data.tail(10)

# Assuming your DataFrame is named 'df' and has the column 'cleaned_text'

# Define prompts for classification
prompts = {
    'Question Technical': "Identify if the following is a technical question related to a specific topic: ",
    'Question Non-Technical': "Identify if the following is a non-technical question: ",
    'Comment Technical': "Determine if the following is a technical comment discussing a specific topic: ",
    'Comment Non-Technical': "Determine if the following is a non-technical comment: ",
    'Answer': "Classify the following as an answer to a question: ",
    'Other': "Categorize the following as miscellaneous or not falling into the specified categories: "
}

# Tokenize and encode the cleaned_text with the prompts
tokenized_texts = [tokenizer.encode(prompts.get(label, "") + text, add_special_tokens=True) for label, text in zip(subset_df['text'], subset_df['text'])]

# Pad sequences to the same length
max_len = max(map(len, tokenized_texts))
padded_texts = [text + [0] * (max_len - len(text)) for text in tokenized_texts]

# Convert lists to tensors
input_ids = torch.tensor(padded_texts)

# Predict probabilities for each class
with torch.no_grad():
    outputs = model(input_ids)
    probs = torch.softmax(outputs.logits, dim=1).numpy()

# Define label mappings
labels_mapping = {
    0: 'Question Technical',
    1: 'Question Non-Technical',
    2: 'Comment Technical',
    3: 'Comment Non-Technical',
    4: 'Answer',
    5: 'Other'
}

# Assign labels based on probabilities
subset_df['predicted_label'] = [labels_mapping[label] for label in torch.argmax(outputs.logits, dim=1).numpy()]

print(subset_df[['text', 'predicted_label']])


Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


                                                    text  \
18934                                        ETL and ELT   
18935                                     Finally :grin:   
18936                The beginning of the End:joy: :joy:   
18937                                      12 of 12:100:   
18938  :timer_clock:*REMINDER*:timer_clock: *REMINDER...   
18939  <@U03TEPYRM2P> so, do we unsubmit  the assignm...   
18940  Please use the submission link for week 12 to ...   
18941                                     oh, ok thanks!   
18942  How can I get the trainees that opted-in a giv...   
18943  <https://developer.algorand.org/solutions/mint...   

              predicted_label  
18934  Question Non-Technical  
18935  Question Non-Technical  
18936  Question Non-Technical  
18937  Question Non-Technical  
18938  Question Non-Technical  
18939  Question Non-Technical  
18940  Question Non-Technical  
18941  Question Non-Technical  
18942  Question Non-Technical  
18943  Question Non

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subset_df['predicted_label'] = [labels_mapping[label] for label in torch.argmax(outputs.logits, dim=1).numpy()]


In [25]:
from transformers import RobertaTokenizer, RobertaForSequenceClassification
import torch
from torch.nn.functional import softmax

# Load pre-trained RoBERTa model and tokenizer
model_name = "roberta-base"
tokenizer = RobertaTokenizer.from_pretrained(model_name)
model = RobertaForSequenceClassification.from_pretrained(model_name)

# Assuming your DataFrame is named 'df' and has the column 'cleaned_text'

# Select only the first 10 rows from the DataFrame
subset_df = data.head(10)

# Define a prompt for classification (customize it based on your task)
prompt_template = "Classify the following text: {}"

# Tokenize and encode the cleaned_text with the prompt
tokenized_texts = [tokenizer.encode(prompt_template.format(text), add_special_tokens=True) for text in subset_df['text']]

# Pad sequences to the same length
max_len = max(map(len, tokenized_texts))
padded_texts = [text + [0] * (max_len - len(text)) for text in tokenized_texts]

# Convert lists to tensors
input_ids = torch.tensor(padded_texts)

# Predict probabilities for each class
with torch.no_grad():
    outputs = model(input_ids)
    probs = torch.softmax(outputs.logits, dim=1).numpy()

# Define label mappings
labels_mapping = {
    2: 'Comment Technical',
    4: 'Answer',
    0: 'Question Technical',
    1: 'Question Non-Technical',
    3: 'Comment Non-Technical',
    5: 'Other'
}

# Assign labels based on probabilities
subset_df['predicted_label'] = [labels_mapping[label] for label in torch.argmax(outputs.logits, dim=1).numpy()]

# Display the result
print(subset_df[['text', 'predicted_label']])


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.out_proj.bias', 'classifier.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


                                                text         predicted_label
0                             *HOTSEAT ANNOUNCEMENT*  Question Non-Technical
1  *<!here>* Good morning Community! We are very ...  Question Non-Technical
2  *<!here> Community Building Session REMINDER!*...  Question Non-Technical
3  Sweet music on Google meet now\n:point_right: ...  Question Non-Technical
4  Hellooo Helllo again my people the lovely  com...  Question Non-Technical
5                                                yes      Question Technical
6                                                yes      Question Technical
7                                                Yes      Question Technical
8                                                Yes      Question Technical
9                                                Yep      Question Technical


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subset_df['predicted_label'] = [labels_mapping[label] for label in torch.argmax(outputs.logits, dim=1).numpy()]


In [46]:
import pandas as pd
from transformers import AlbertTokenizer, AlbertForSequenceClassification
import torch
from torch.nn.functional import softmax

# Load pre-trained ALBERT model and tokenizer
model_name = "albert-base-v2"
tokenizer = AlbertTokenizer.from_pretrained(model_name)
model = AlbertForSequenceClassification.from_pretrained(model_name)

subset_df = data.tail(10)

# Assuming your DataFrame is named 'df' and has the column 'cleaned_text'

# Define a new prompt for classification
prompt_template = "Evaluate the text: {} Class labels: Question Technical (0), Question Non-Technical (1), Comment Technical (2), Comment Non-Technical (3), Answer (4), Other (5)"

# Tokenize and encode the cleaned_text with the new prompt
tokenized_texts = [tokenizer.encode(prompt_template.format(text), add_special_tokens=True) for text in subset_df['text']]

# Pad sequences to the same length
max_len = max(map(len, tokenized_texts))
padded_texts = [text + [0] * (max_len - len(text)) for text in tokenized_texts]

# Convert lists to tensors
input_ids = torch.tensor(padded_texts)

# Predict probabilities for each class
with torch.no_grad():
    outputs = model(input_ids)
    probs = torch.softmax(outputs.logits, dim=1).numpy()

# Define label mappings
labels_mapping = {
    0: 'Question Technical',
    1: 'Question Non-Technical',
    2: 'Comment Technical',
    3: 'Comment Non-Technical',
    4: 'Answer',
    5: 'Other'
}

# Assign labels based on probabilities
subset_df['predicted_label'] = [labels_mapping[label] for label in torch.argmax(outputs.logits, dim=1).numpy()]

print(subset_df[['text', 'predicted_label']])


Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


                                                    text     predicted_label
18934                                        ETL and ELT  Question Technical
18935                                     Finally :grin:  Question Technical
18936                The beginning of the End:joy: :joy:  Question Technical
18937                                      12 of 12:100:  Question Technical
18938  :timer_clock:*REMINDER*:timer_clock: *REMINDER...  Question Technical
18939  <@U03TEPYRM2P> so, do we unsubmit  the assignm...  Question Technical
18940  Please use the submission link for week 12 to ...  Question Technical
18941                                     oh, ok thanks!  Question Technical
18942  How can I get the trainees that opted-in a giv...  Question Technical
18943  <https://developer.algorand.org/solutions/mint...  Question Technical


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subset_df['predicted_label'] = [labels_mapping[label] for label in torch.argmax(outputs.logits, dim=1).numpy()]


In [31]:
import pandas as pd
from transformers import XLNetTokenizer, XLNetForSequenceClassification
import torch
from torch.nn.functional import softmax

# Load pre-trained XLNet model and tokenizer
model_name = "xlnet-base-cased"
tokenizer = XLNetTokenizer.from_pretrained(model_name)
model = XLNetForSequenceClassification.from_pretrained(model_name)

# Assuming your DataFrame is named 'df' and has the column 'cleaned_text'
subset_df = data.tail(10)

# Define a prompt for classification (customize it based on your task)
prompt_template = "Classify the following text: {} Class labels: Question (0), Answer (1), Comment (2), Other (3)"

# Tokenize and encode the cleaned_text with the prompt
tokenized_texts = [tokenizer.encode(prompt_template.format(text), add_special_tokens=True) for text in subset_df['cleaned_text']]

# Pad sequences to the same length
max_len = max(map(len, tokenized_texts))
padded_texts = [text + [0] * (max_len - len(text)) for text in tokenized_texts]

# Convert lists to tensors
input_ids = torch.tensor(padded_texts)

# Predict probabilities for each class
with torch.no_grad():
    outputs = model(input_ids)
    probs = torch.softmax(outputs.logits, dim=1).numpy()

# Define label mappings
labels_mapping = {
    0: 'Question',
    1: 'Answer',
    2: 'Comment',
    3: 'Other'
}

# Assign labels based on probabilities
subset_df['predicted_label'] = [labels_mapping[label] for label in torch.argmax(outputs.logits, dim=1).numpy()]

print(subset_df[['cleaned_text', 'predicted_label']])


Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['logits_proj.bias', 'logits_proj.weight', 'sequence_summary.summary.bias', 'sequence_summary.summary.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


                                            cleaned_text predicted_label
18934                                            etl elt        Question
18935                                         final grin        Question
18936                                   begin endjoy joy        Question
18937                                                NaN        Question
18938  timerclockremindertimerclock remind timerclock...          Answer
18939  unsubmit assign submit suppos submit new assig...        Question
18940                  plea use submiss link week submit        Question
18941                                        oh ok thank        Question
18942  get traine optedin given asset print statu ass...          Answer
18943                                                NaN        Question


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subset_df['predicted_label'] = [labels_mapping[label] for label in torch.argmax(outputs.logits, dim=1).numpy()]
