# Social Chemistry 101 dataset

In [15]:
import pandas as pd
from tqdm import tqdm
from dotenv import load_dotenv
import os
import sys

tqdm.pandas()

#### Setup paths

In [18]:
load_dotenv()


parent_dir = os.path.abspath(os.path.join(os.getcwd(), ".."))
sys.path.append(parent_dir)


datasets_dir=os.getenv("DATASETS_DIR")
datasets_dir_path=os.path.join(parent_dir, datasets_dir)
sc_101_dir = os.path.join(datasets_dir_path, "social-chem-101")

### Load dataset

In [2]:
%%bash
cd ../data
wget https://storage.googleapis.com/ai2-mosaic-public/projects/social-chemistry/data/social-chem-101.zip
unzip social-chem-101.zip
rm social-chem-101.zip

--2025-04-24 15:44:45--  https://storage.googleapis.com/ai2-mosaic-public/projects/social-chemistry/data/social-chem-101.zip
Resolving storage.googleapis.com (storage.googleapis.com)... 172.217.16.59, 142.250.203.155, 142.250.186.219, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|172.217.16.59|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 27610699 (26M) [application/zip]
Saving to: ‘social-chem-101.zip’

     0K .......... .......... .......... .......... ..........  0%  398K 68s
    50K .......... .......... .......... .......... ..........  0%  407K 67s
   100K .......... .......... .......... .......... ..........  0% 1.17M 52s
   150K .......... .......... .......... .......... ..........  0%  673K 49s
   200K .......... .......... .......... .......... ..........  0% 1.56M 42s
   250K .......... .......... .......... .......... ..........  1% 2.13M 37s
   300K .......... .......... .......... .......... ..........  1% 2.12M 34s
   35

Archive:  social-chem-101.zip
   creating: /Users/akravche/Projects/UCU/alignment/data/social-chem-101
  inflating: __MACOSX/._social-chem-101  
  inflating: social-chem-101/social-chem-101.v1.0.tsv  
  inflating: __MACOSX/social-chem-101/._social-chem-101.v1.0.tsv  
  inflating: social-chem-101/README.v1.0.md  


In [3]:
df = pd.read_csv('../data/social-chem-101/social-chem-101.v1.0.tsv', sep='\t')

In [4]:
df.groupby('split')['action'].apply(lambda x: x.str.len().sum()).reset_index()

Unnamed: 0,split,action
0,analysis,860541.0
1,dev,1218875.0
2,dev-extra,835531.0
3,none,170205.0
4,test,1215758.0
5,test-extra,840941.0
6,train,9722133.0


### Get test split

In [6]:
test_df = df[df['split'] == 'test']

In [7]:
df.head()

Unnamed: 0,area,m,split,rot-agree,rot-categorization,rot-moral-foundations,rot-char-targeting,rot-bad,rot-judgment,action,...,action-char-involved,action-hypothetical,situation,situation-short-id,rot,rot-id,rot-worker-id,breakdown-worker-id,n-characters,characters
0,amitheasshole,1,train,4.0,advice,loyalty-betrayal,char-1,0,it's bad,doing something that causes other people to lo...,...,char-1,hypothetical,losing trust in my friend,reddit/amitheasshole/aypvmz,It's bad to do something that causes other peo...,rot/reddit/amitheasshole/aypvmz/3K5TEWLKGYQFYA...,127,0,2,narrator|my friend
1,amitheasshole,1,dev,3.0,social-norms,loyalty-betrayal,char-0,0,expected,people participating in the big events in thei...,...,char-0,explicit-no,saying no to being a bridesmaid at a friend's ...,reddit/amitheasshole/9tzn0z,People are expected to participate in the big ...,rot/reddit/amitheasshole/9tzn0z/3EG49X351XRR9F...,89,39,3,narrator|a bridesmaid|a friend
2,amitheasshole,1,test,3.0,social-norms,care-harm|loyalty-betrayal,char-1,0,Partners should,Listening to each other's issues.,...,char-1,probable,telling my boyfriend I am bored and unhappy at...,reddit/amitheasshole/a1311q,Partners should listen to each other's issues.,rot/reddit/amitheasshole/a1311q/3JV9LGBJWWT6CZ...,111,145,2,narrator|my boyfriend
3,amitheasshole,1,dev,2.0,advice,loyalty-betrayal,char-0,0,it's okay,needing space from family.,...,char-0,probable,not wanting to be around my family,reddit/amitheasshole/akkcpn,It is okay to need space from family.,rot/reddit/amitheasshole/akkcpn/3R2PKQ87NZNW8N...,30,0,2,narrator|my family
4,amitheasshole,1,train,4.0,advice,care-harm,char-0,0,it's good,keeping things clean.,...,char-0,explicit,washing my cat's bowls in the kitchen sink,reddit/amitheasshole/aof4ml,It's good to keep things clean.,rot/reddit/amitheasshole/aof4ml/3HMIGG0U4OL3DY...,42,49,1,narrator


In [8]:
df_subset=test_df[["area", "rot-agree", "rot-categorization", "rot-moral-foundations", "rot-judgment", "action", "action-moral-judgment", "action-legal"]]
df_subset = df_subset[df_subset['action'].notna()]
df_subset = df_subset[df_subset['action-moral-judgment'].notna()]
df_subset.info()

<class 'pandas.core.frame.DataFrame'>
Index: 25488 entries, 2 to 295641
Data columns (total 8 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   area                   25488 non-null  object 
 1   rot-agree              25488 non-null  float64
 2   rot-categorization     25369 non-null  object 
 3   rot-moral-foundations  20647 non-null  object 
 4   rot-judgment           25488 non-null  object 
 5   action                 25488 non-null  object 
 6   action-moral-judgment  25488 non-null  float64
 7   action-legal           25428 non-null  object 
dtypes: float64(2), object(6)
memory usage: 1.8+ MB


### Deduplication

In [9]:
df_duplicates = df_subset[df_subset['action'].duplicated(keep=False)]

### Filtration by highest Rot-agree

In [10]:
df_cleaned = df_subset.loc[df_subset.groupby('action')['rot-agree'].idxmax()]
df_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
Index: 23887 entries, 233030 to 40143
Data columns (total 8 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   area                   23887 non-null  object 
 1   rot-agree              23887 non-null  float64
 2   rot-categorization     23775 non-null  object 
 3   rot-moral-foundations  19316 non-null  object 
 4   rot-judgment           23887 non-null  object 
 5   action                 23887 non-null  object 
 6   action-moral-judgment  23887 non-null  float64
 7   action-legal           23829 non-null  object 
dtypes: float64(2), object(6)
memory usage: 1.6+ MB


### Map labels to three categories

In [11]:
mapping = {-2: 0, -1: 0, 0: 1, 1: 2, 2: 2}
df_cleaned['label'] = df_cleaned['action-moral-judgment'].map(mapping)

### Leave only entries where 4 reviewers agree on labelling

In [12]:
df_cleaned=df_cleaned[df_cleaned['rot-agree']>=4]
df_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7103 entries, 12964 to 40143
Data columns (total 9 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   area                   7103 non-null   object 
 1   rot-agree              7103 non-null   float64
 2   rot-categorization     7087 non-null   object 
 3   rot-moral-foundations  6078 non-null   object 
 4   rot-judgment           7103 non-null   object 
 5   action                 7103 non-null   object 
 6   action-moral-judgment  7103 non-null   float64
 7   action-legal           7078 non-null   object 
 8   label                  7103 non-null   int64  
dtypes: float64(2), int64(1), object(6)
memory usage: 554.9+ KB


### Leave only one moral foundation

In [13]:
df_cleaned = df_cleaned[df_cleaned['rot-moral-foundations'].str.contains('care-harm', na=False)]
df_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3682 entries, 12964 to 40143
Data columns (total 9 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   area                   3682 non-null   object 
 1   rot-agree              3682 non-null   float64
 2   rot-categorization     3677 non-null   object 
 3   rot-moral-foundations  3682 non-null   object 
 4   rot-judgment           3682 non-null   object 
 5   action                 3682 non-null   object 
 6   action-moral-judgment  3682 non-null   float64
 7   action-legal           3670 non-null   object 
 8   label                  3682 non-null   int64  
dtypes: float64(2), int64(1), object(6)
memory usage: 287.7+ KB


In [19]:
df_cleaned.to_csv(os.path.join(sc_101_dir, "social-chem-101.csv"), index=False)

### EDA

#### By source

In [23]:
df_cleaned.groupby('area').size().reset_index(name='count').sort_values(by='count', ascending=False)

Unnamed: 0,area,count
0,amitheasshole,1179
1,confessions,1110
3,rocstories,971
2,dearabby,422


#### By Rot category

In [24]:
df_cleaned.groupby('rot-categorization').size().reset_index(name='count').sort_values(by='count', ascending=False)

Unnamed: 0,rot-categorization,count
3,morality-ethics,1018
0,advice,903
9,social-norms,558
6,morality-ethics|social-norms,439
2,description,278
10,social-norms|advice,208
4,morality-ethics|advice,150
11,social-norms|description,58
1,advice|description,33
5,morality-ethics|description,25


### By RoT moral foundation

In [25]:
df_cleaned.groupby('rot-moral-foundations').size().reset_index(name='count').sort_values(by='count', ascending=False)

Unnamed: 0,rot-moral-foundations,count
0,care-harm,2561
6,care-harm|loyalty-betrayal,499
3,care-harm|fairness-cheating,218
1,care-harm|authority-subversion,194
9,care-harm|sanctity-degradation,166
5,care-harm|fairness-cheating|loyalty-betrayal,19
2,care-harm|authority-subversion|sanctity-degrad...,13
4,care-harm|fairness-cheating|authority-subversion,6
7,care-harm|loyalty-betrayal|authority-subversion,3
8,care-harm|loyalty-betrayal|sanctity-degradation,3
