In [6]:
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoConfig
import numpy as np
from scipy.special import softmax
# Preprocess text (username and link placeholders)
def preprocess(text):
    """Mask user mentions and links with fixed placeholder tokens.

    The text is split on single spaces. A token that starts with '@' and is
    longer than one character becomes '@user'; a token that starts with
    'http' becomes 'http'. The tokens are then re-joined with single spaces,
    so the original spacing is preserved.
    """
    def _mask(token):
        # Mention check first, link check second (same order as the inputs
        # are normally written; the two placeholders never overlap).
        if token.startswith('@') and len(token) > 1:
            token = '@user'
        if token.startswith('http'):
            token = 'http'
        return token

    return " ".join(_mask(token) for token in text.split(" "))


In [7]:
# Hub identifier of the sentiment checkpoint shared by tokenizer, config and model.
# (Plain string: the previous f-string prefix had no placeholders to format.)
MODEL = "cardiffnlp/twitter-roberta-base-sentiment-latest"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
# The config provides id2label, used later to pick out the 'positive' score.
config = AutoConfig.from_pretrained(MODEL)
# PyTorch model
model = AutoModelForSequenceClassification.from_pretrained(MODEL)
#model.save_pretrained(MODEL)

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [8]:
import pandas as pd
# Read the two buyer-profile workbooks into DataFrames (profile 1 first).
df_profile1_sample, df_profile2_sample = map(
    pd.read_excel,
    ('buyer_profile1_age.xlsx', 'buyer_profile2_age.xlsx'),
)


In [9]:
# Display the loaded profile-1 frame to inspect its columns.
# NOTE(review): the rendered output includes a Name column with respondent
# names — consider clearing or redacting it before sharing the notebook.
df_profile1_sample

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,UID,Name,interface1b,interface2b,interface3b,function1b,function2b,accomp1b,accomp2b
0,0,5,123,Sarah,"Yes, I like the overview. The pictures when gi...","yes, although it was not 100% clear that you a...","easy to understand, clear communication","yes, but not via Telegram/Facebook Messenger a...",Yes,"yes, definitely",yes
1,8,55,104,Nadia,"Yes, it was okay. I did not like the fact that...",Yes very useful,Not at all,Yes,"Yes, it was pretty clear","Yes, I really liked the idea of getting recomm...","Yes,however, some items did not have a very de..."
2,3,29,100,Kasra,yes it was a good overview,no,yes,no it was too much spam in the bot,yes,yes,yes
3,4,32,98,Taranpreet Kaur,"it was ok, maybe different/several angles woul...","yes, it was easy to use, but it only worked on...",it was easy to understand,"yes, didn't have any problem with it","yes, it was useful","yes, it definitely was",yes it was useful
4,2,20,152,Zuzanna,"yeah, it's like a basket in the shop","yeah, pretty easy","pretty easy, would be harder if the items look...","Yes, but once you get more requests for feedba...","it is useful, but I don't use telegram for fri...",Usually I just ask one friend for advice,"Not only the picture matters, I like to know m..."
5,1,13,87,Manuel,"Yes, gives a quick overview","Yes, it was intuitive",Very easy to understand,"Yes, after a few tries, it was easy to use","Yes, helps when ranking the items","Yes, otherwise they would not be able to recei...","Yes, cloth images are indeed helpful"
6,11,74,136,Joan,Yes,"Yes, but it would have been good to have the i...",It was easy to understand,"Yes, it would be easy if they have Telegram",Yes. A recommendation without the item informa...,"Could be useful, yes. But sharing the link to ...",Yes


In [10]:
from statistics import mean 
# Text columns whose answers are scored for positive sentiment.
col_list = ['interface1b', 'interface2b', 'interface3b']


def sentiForCol(col_list, df=None):
    """Return the mean 'positive' probability per row over the given columns.

    Each answer in ``col_list`` is passed through ``preprocess``, tokenized,
    and scored by the module-level ``model``; the softmax probability of the
    ``'positive'`` label is collected and averaged per row.

    Parameters
    ----------
    col_list : sequence of str
        Names of the text columns to score.
    df : pandas.DataFrame, optional
        Frame holding the answers, one respondent per row. Defaults to the
        module-level ``df_profile1_sample`` so existing calls are unchanged.

    Returns
    -------
    list
        One mean positive probability per row of ``df``, in row order. A row
        in which every listed column is missing yields ``nan``.

    Raises
    ------
    ValueError
        If ``config.id2label`` contains no ``'positive'`` label.
    """
    import torch  # local import: only needed to switch off autograd below

    if df is None:
        df = df_profile1_sample

    # Resolve the index of the 'positive' class once instead of re-scanning a
    # sorted ranking of the scores for every single text.
    positive_ids = [idx for idx, label in config.id2label.items() if label == 'positive']
    if not positive_ids:
        raise ValueError("model config has no 'positive' label")
    positive_id = positive_ids[0]

    pos_mean_ls = []
    for _, answers in df[list(col_list)].iterrows():
        pos_ls = []
        for text in answers:
            # Empty spreadsheet cells arrive as NaN/None; skip them rather
            # than crash inside preprocess().
            if pd.isna(text):
                continue
            encoded_input = tokenizer(preprocess(str(text)), return_tensors='pt')
            # Inference only: no gradient graph is needed.
            with torch.no_grad():
                output = model(**encoded_input)
            scores = softmax(output.logits[0].numpy())
            pos_ls.append(scores[positive_id])
        pos_mean_ls.append(mean(pos_ls) if pos_ls else float('nan'))
    return pos_mean_ls
# Score once and reuse the result: calling sentiForCol twice would run the
# whole model inference pass a second time just to print the same numbers.
profile1 = sentiForCol(col_list)
print(profile1)

[0.54084915, 0.4337866, 0.6148639, 0.6704399, 0.37847078, 0.5336134, 0.39984038]


In [11]:
# Text columns whose answers are scored for positive sentiment.
col_list = ['interface1b', 'interface2b', 'interface3b']


def sentiForCol1(col_list, df=None):
    """Return the mean 'positive' probability per row over the given columns.

    Same procedure as ``sentiForCol``, but reading ``df_profile2_sample`` by
    default: each answer in ``col_list`` is passed through ``preprocess``,
    tokenized, and scored by the module-level ``model``; the softmax
    probability of the ``'positive'`` label is collected and averaged per row.

    Parameters
    ----------
    col_list : sequence of str
        Names of the text columns to score.
    df : pandas.DataFrame, optional
        Frame holding the answers, one respondent per row. Defaults to the
        module-level ``df_profile2_sample`` so existing calls are unchanged.

    Returns
    -------
    list
        One mean positive probability per row of ``df``, in row order. A row
        in which every listed column is missing yields ``nan``.

    Raises
    ------
    ValueError
        If ``config.id2label`` contains no ``'positive'`` label.
    """
    import torch  # local import: only needed to switch off autograd below

    if df is None:
        df = df_profile2_sample

    # Resolve the index of the 'positive' class once instead of re-scanning a
    # sorted ranking of the scores for every single text.
    positive_ids = [idx for idx, label in config.id2label.items() if label == 'positive']
    if not positive_ids:
        raise ValueError("model config has no 'positive' label")
    positive_id = positive_ids[0]

    pos_mean_ls = []
    for _, answers in df[list(col_list)].iterrows():
        pos_ls = []
        for text in answers:
            # Empty spreadsheet cells arrive as NaN/None; skip them rather
            # than crash inside preprocess().
            if pd.isna(text):
                continue
            encoded_input = tokenizer(preprocess(str(text)), return_tensors='pt')
            # Inference only: no gradient graph is needed.
            with torch.no_grad():
                output = model(**encoded_input)
            scores = softmax(output.logits[0].numpy())
            pos_ls.append(scores[positive_id])
        pos_mean_ls.append(mean(pos_ls) if pos_ls else float('nan'))
    return pos_mean_ls
# Score once and reuse the result: calling sentiForCol1 twice would run the
# whole model inference pass a second time just to print the same numbers.
profile2 = sentiForCol1(col_list)
print(profile2)

[0.1603892, 0.3587564, 0.4918843, 0.11972574, 0.07981716, 0.32897124, 0.32894596]


In [12]:
# Long-format frames: one row per respondent, tagged with the profile label,
# so the two groups can be stacked and compared.
df_profile1_interface_t_test = pd.DataFrame({
    'profile': ['profile1'] * len(profile1),
    'interface_pos': profile1,
})
df_profile2_interface_t_test = pd.DataFrame({
    'profile': ['profile2'] * len(profile2),
    'interface_pos': profile2,
})



In [13]:
# Stack both profiles row-wise; each frame keeps its own 0-based index.
df_profiles_combine_interface_t_test = pd.concat(
    [df_profile1_interface_t_test, df_profile2_interface_t_test]
)
df_profiles_combine_interface_t_test

Unnamed: 0,profile,interface_pos
0,profile1,0.540849
1,profile1,0.433787
2,profile1,0.614864
3,profile1,0.67044
4,profile1,0.378471
5,profile1,0.533613
6,profile1,0.39984
0,profile2,0.160389
1,profile2,0.358756
2,profile2,0.491884


In [14]:
from scipy import stats
from itertools import combinations
# Distinct profile labels, in order of first appearance.
hobbies = df_profiles_combine_interface_t_test['profile'].unique()

# Positive-sentiment scores of each profile, sliced once up front.
scores_by_profile = {
    label: df_profiles_combine_interface_t_test.loc[
        df_profiles_combine_interface_t_test['profile'] == label, 'interface_pos'
    ]
    for label in hobbies
}

# Independent two-sample t-test for every unordered pair of profiles.
pairwise_results = []
for first, second in combinations(hobbies, 2):
    statistic, pvalue = stats.ttest_ind(scores_by_profile[first], scores_by_profile[second])
    pairwise_results.append(((first, second), statistic, pvalue))

# Report each comparison on its own line.
for (hobby1, hobby2), t_stat, p_value in pairwise_results:
    print(f"{hobby1} vs. {hobby2}: T-statistic = {t_stat:.2f}, P-value = {p_value:.4f}")


profile1 vs. profile2: T-statistic = 3.46, P-value = 0.0048
