In [1]:
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoConfig
import numpy as np
from scipy.special import softmax
# Preprocess text (username and link placeholders)
def preprocess(text):
    """Replace user mentions and links in ``text`` with fixed placeholders.

    The text is split on single space characters only, so runs of spaces and
    any other whitespace are preserved exactly when the tokens are re-joined.

    Parameters
    ----------
    text : str
        Raw input text.

    Returns
    -------
    str
        ``text`` where every token starting with ``@`` (and longer than one
        character) becomes ``'@user'`` and every token starting with
        ``http`` becomes ``'http'``.
    """
    def _placeholder(token):
        # A lone '@' is left untouched; only real mentions are masked.
        if token.startswith('@') and len(token) > 1:
            token = '@user'
        # Checked after the mention rule, mirroring the two-step replacement.
        if token.startswith('http'):
            token = 'http'
        return token

    return " ".join(_placeholder(token) for token in text.split(" "))


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Identifier of the pretrained checkpoint shared by the tokenizer, the config
# and the model below. A plain string literal: there is nothing to interpolate,
# so no f-string prefix is needed.
MODEL = "cardiffnlp/twitter-roberta-base-sentiment-latest"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
# `config.id2label` is used further down to map class ids to label names.
config = AutoConfig.from_pretrained(MODEL)
# PT (PyTorch variant of the model class; the TF variant is imported but unused here)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)
#model.save_pretrained(MODEL)

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [4]:
import pandas as pd
# Load the three sample workbooks (low / med / good). Paths are relative to
# the notebook's working directory.
_sample_files = (
    'person_low_sample.xlsx',
    'person_med_sample.xlsx',
    'person_good_sample.xlsx',
)
df_low_sample, df_med_sample, df_good_sample = (
    pd.read_excel(path) for path in _sample_files
)


In [5]:
from statistics import mean 
# Text columns whose sentiment is scored for every row.
col_list = [
    'interface1p',
    'interface2p',
    'interface3p',
]
def sentiForCol(col_list, df=None):
    """Return the mean 'positive' sentiment score of each row of a sample frame.

    For every row, the text in each column of ``col_list`` is passed through
    ``preprocess``, the tokenizer and the model. The model scores are
    softmax-normalised and the score of the class whose ``config.id2label``
    name is ``'positive'`` is collected. The row's result is the mean of the
    collected scores.

    Parameters
    ----------
    col_list : list of str
        Names of the columns to score.
    df : pandas.DataFrame, optional
        Frame to score. When omitted, the module-level ``df_low_sample`` is
        used, so existing ``sentiForCol(col_list)`` calls behave as before.

    Returns
    -------
    list
        One mean positive score per row, in row order.

    Raises
    ------
    statistics.StatisticsError
        If no positive score was collected for a row (for example when
        ``col_list`` is empty).
    """
    if df is None:
        df = df_low_sample

    pos_mean_ls = []
    for row_pos in range(df.shape[0]):
        # Fetch the row once instead of once per column.
        row = df.iloc[row_pos]
        pos_ls = []
        for col in col_list:
            # Assumes every cell holds a string; a missing (NaN) cell would
            # fail inside preprocess -- TODO confirm the samples have no gaps.
            text = preprocess(row[col])
            encoded_input = tokenizer(text, return_tensors='pt')
            output = model(**encoded_input)
            # output[0][0]: presumably the logits of the single input text.
            scores = softmax(output[0][0].detach().numpy())
            # Read the 'positive' class score directly; ranking all classes
            # first is unnecessary when only one label is wanted.
            pos_ls.extend(
                scores[class_id]
                for class_id in range(scores.shape[0])
                if config.id2label[class_id] == 'positive'
            )
        pos_mean_ls.append(mean(pos_ls))
    return pos_mean_ls
# Score the low-experience sample once and reuse the result for display;
# calling the function twice would repeat the whole model inference pass.
low = sentiForCol(col_list)
print(low)

[0.3510501, 0.07537002, 0.5553407, 0.3023177, 0.4040435, 0.24080606, 0.33029565, 0.50409806, 0.46313107]


In [6]:
# Text columns whose sentiment is scored for every row.
col_list = [
    'interface1p',
    'interface2p',
    'interface3p',
]
def sentiForCol1(col_list, df=None):
    """Return the mean 'positive' sentiment score of each row of a sample frame.

    For every row, the text in each column of ``col_list`` is passed through
    ``preprocess``, the tokenizer and the model. The model scores are
    softmax-normalised and the score of the class whose ``config.id2label``
    name is ``'positive'`` is collected. The row's result is the mean of the
    collected scores.

    Note: this notebook defines ``sentiForCol1`` in more than one cell, each
    with a different default frame; the most recently executed definition is
    the one in effect. Pass ``df`` explicitly to avoid depending on cell order.

    Parameters
    ----------
    col_list : list of str
        Names of the columns to score.
    df : pandas.DataFrame, optional
        Frame to score. When omitted, the module-level ``df_med_sample`` is
        used, so existing ``sentiForCol1(col_list)`` calls behave as before.

    Returns
    -------
    list
        One mean positive score per row, in row order.

    Raises
    ------
    statistics.StatisticsError
        If no positive score was collected for a row (for example when
        ``col_list`` is empty).
    """
    if df is None:
        df = df_med_sample

    pos_mean_ls = []
    for row_pos in range(df.shape[0]):
        # Fetch the row once instead of once per column.
        row = df.iloc[row_pos]
        pos_ls = []
        for col in col_list:
            # Assumes every cell holds a string; a missing (NaN) cell would
            # fail inside preprocess -- TODO confirm the samples have no gaps.
            text = preprocess(row[col])
            encoded_input = tokenizer(text, return_tensors='pt')
            output = model(**encoded_input)
            # output[0][0]: presumably the logits of the single input text.
            scores = softmax(output[0][0].detach().numpy())
            # Read the 'positive' class score directly; ranking all classes
            # first is unnecessary when only one label is wanted.
            pos_ls.extend(
                scores[class_id]
                for class_id in range(scores.shape[0])
                if config.id2label[class_id] == 'positive'
            )
        pos_mean_ls.append(mean(pos_ls))
    return pos_mean_ls
# Score the medium-experience sample once and reuse the result for display;
# calling the function twice would repeat the whole model inference pass.
med = sentiForCol1(col_list)
print(med)

[0.34500545, 0.1851391, 0.5161521, 0.08121794, 0.023180358, 0.28468177, 0.23042737, 0.08036654, 0.032278948]


In [7]:
# Text columns whose sentiment is scored for every row.
col_list = [
    'interface1p',
    'interface2p',
    'interface3p',
]
def sentiForCol1(col_list, df=None):
    """Return the mean 'positive' sentiment score of each row of a sample frame.

    For every row, the text in each column of ``col_list`` is passed through
    ``preprocess``, the tokenizer and the model. The model scores are
    softmax-normalised and the score of the class whose ``config.id2label``
    name is ``'positive'`` is collected. The row's result is the mean of the
    collected scores.

    Note: this notebook defines ``sentiForCol1`` in more than one cell, each
    with a different default frame; the most recently executed definition is
    the one in effect. Pass ``df`` explicitly to avoid depending on cell order.

    Parameters
    ----------
    col_list : list of str
        Names of the columns to score.
    df : pandas.DataFrame, optional
        Frame to score. When omitted, the module-level ``df_good_sample`` is
        used, so existing ``sentiForCol1(col_list)`` calls behave as before.

    Returns
    -------
    list
        One mean positive score per row, in row order.

    Raises
    ------
    statistics.StatisticsError
        If no positive score was collected for a row (for example when
        ``col_list`` is empty).
    """
    if df is None:
        df = df_good_sample

    pos_mean_ls = []
    for row_pos in range(df.shape[0]):
        # Fetch the row once instead of once per column.
        row = df.iloc[row_pos]
        pos_ls = []
        for col in col_list:
            # Assumes every cell holds a string; a missing (NaN) cell would
            # fail inside preprocess -- TODO confirm the samples have no gaps.
            text = preprocess(row[col])
            encoded_input = tokenizer(text, return_tensors='pt')
            output = model(**encoded_input)
            # output[0][0]: presumably the logits of the single input text.
            scores = softmax(output[0][0].detach().numpy())
            # Read the 'positive' class score directly; ranking all classes
            # first is unnecessary when only one label is wanted.
            pos_ls.extend(
                scores[class_id]
                for class_id in range(scores.shape[0])
                if config.id2label[class_id] == 'positive'
            )
        pos_mean_ls.append(mean(pos_ls))
    return pos_mean_ls
# Score the good-experience sample once and reuse the result for display;
# calling the function twice would repeat the whole model inference pass.
good = sentiForCol1(col_list)
print(good)

[0.3709818, 0.6109981, 0.46147192, 0.7283459, 0.3730288, 0.45493847, 0.29190907, 0.30542618, 0.61456436]


In [8]:
def _experience_frame(level, scores):
    """Build a frame pairing a constant experience label with per-row scores."""
    # The scalar label is broadcast by pandas to the length of `scores`.
    return pd.DataFrame({'experience': level, 'interface_pos': scores})


# One frame per experience level, ready to be stacked for the group comparison.
df_low_interface_t_test = _experience_frame('low', low)
df_med_interface_t_test = _experience_frame('med', med)
df_good_interface_t_test = _experience_frame('good', good)


In [9]:
# Stack the per-level frames row-wise. Each frame keeps its own row labels,
# so the index restarts for every group in the combined frame.
_level_frames = [
    df_low_interface_t_test,
    df_med_interface_t_test,
    df_good_interface_t_test,
]
df_experience_combine_interface_t_test = pd.concat(_level_frames, axis=0)
df_experience_combine_interface_t_test

Unnamed: 0,experience,interface_pos
0,low,0.35105
1,low,0.07537
2,low,0.555341
3,low,0.302318
4,low,0.404043
5,low,0.240806
6,low,0.330296
7,low,0.504098
8,low,0.463131
0,med,0.345005


In [10]:
from scipy import stats
from itertools import combinations
_combined = df_experience_combine_interface_t_test

# Distinct values of the 'experience' column (the groups to compare).
# The variable name is kept as-is; it holds experience levels.
hobbies = _combined['experience'].unique()

# Slice the score column once per level instead of re-filtering inside the loop.
_scores_by_level = {
    level: _combined.loc[_combined['experience'] == level, 'interface_pos']
    for level in hobbies
}

# Perform pairwise t-tests
# NOTE(review): ttest_ind is called with its default options and no
# multiple-comparison correction is applied -- confirm both are intended.
pairwise_results = []
for hobby1, hobby2 in combinations(hobbies, 2):
    t_stat, p_value = stats.ttest_ind(
        _scores_by_level[hobby1],
        _scores_by_level[hobby2],
    )
    pairwise_results.append(((hobby1, hobby2), t_stat, p_value))

# Print results
for (hobby1, hobby2), t_stat, p_value in pairwise_results:
    print(f"{hobby1} vs. {hobby2}: T-statistic = {t_stat:.2f}, P-value = {p_value:.4f}")


low vs. med: T-statistic = 2.19, P-value = 0.0436
low vs. good: T-statistic = -1.56, P-value = 0.1395
med vs. good: T-statistic = -3.61, P-value = 0.0023
