# Sampling IntentCONANv2

#### Group by hatespeechTarget and csType, then filter groups with at least 2 samples

In [23]:
import pandas as pd

# Load datasets
train_df = pd.read_csv('data/iconanv2/train.csv')
test_df = pd.read_csv('data/iconanv2/test.csv')
val_df = pd.read_csv('data/iconanv2/val.csv')

# Combine datasets
df = pd.concat([train_df, test_df, val_df])

# Drop duplicates to ensure distinct hatespeech
#df = df.drop_duplicates(subset=['hatespeech'])   // Commenting this out since this is resulting in 100-102 rows only in sample, i.e., unable to produce 2 rows per hatespeechTarget-csType pair

# Filter by counterspeech length > 105
df = df[df['counterspeech'].apply(len) > 105]

# Filter columns of interest
df = df[['hatespeech', 'counterspeech', 'csType', 'hs_id', 'id', 'hatespeechTarget', 'powerDynamics', 'hatespeechOffensiveness']]

# Group by hatespeechTarget and csType, then filter groups with at least 2 samples
filtered_groups = df.groupby(['hatespeechTarget', 'csType']).filter(lambda x: len(x) >= 2)

# Sample 2 rows from each group
sample_df = filtered_groups.groupby(['hatespeechTarget', 'csType']).apply(lambda x: x.sample(2, replace=False)).reset_index(drop=True)

# Verify the condition for each hatespeechTarget and csType pair
sample_counts = sample_df.groupby(['hatespeechTarget', 'csType']).size().reset_index(name='count')
if all(sample_counts['count'] == 2):
    print("Sample meets the required conditions.")
else:
    print("Sample does not meet the required conditions.")

# Save the sample to a new CSV file
sample_df.to_csv('data/iconanv2/sample_iconanv2.csv', index=False)

print("Sample saved to 'sample_iconanv2.csv'")


Sample meets the required conditions.
Sample saved to 'sample_iconanv2.csv'


  sample_df = filtered_groups.groupby(['hatespeechTarget', 'csType']).apply(lambda x: x.sample(2, replace=False)).reset_index(drop=True)


#### Non-uniform sample of longer counterspeech instances

In [24]:
# Combine datasets
df = pd.concat([train_df, test_df, val_df])

# Drop duplicates to ensure distinct hatespeech
#df = df.drop_duplicates(subset=['hatespeech'])   // Commenting this out since this is resulting in 100-102 rows only in sample, i.e., unable to produce 2 rows per hatespeechTarget-csType pair

# Filter by counterspeech length > 200
df = df[df['counterspeech'].apply(len) > 200]

# Filter columns of interest
df = df[['hatespeech', 'counterspeech', 'csType', 'hs_id', 'id', 'hatespeechTarget', 'powerDynamics', 'hatespeechOffensiveness']]

# Group by hatespeechTarget and csType, then filter groups with at least 2 samples
filtered_groups = df.groupby(['hatespeechTarget', 'csType']).filter(lambda x: len(x) >= 1)

# Sample 2 rows from each group
sample_df = filtered_groups.groupby(['hatespeechTarget', 'csType']).apply(lambda x: x.sample(1, replace=False)).reset_index(drop=True)

# Verify the condition for each hatespeechTarget and csType pair
sample_counts = sample_df.groupby(['hatespeechTarget', 'csType']).size().reset_index(name='count')
if all(sample_counts['count'] == 1):
    print("Sample meets the required conditions.")
else:
    print("Sample does not meet the required conditions.")

# Save the sample to a new CSV file
sample_df.to_csv('data/iconanv2/sample_iconanv2_longCS.csv', index=False)

print("Sample saved to 'sample_iconanv2_longCS.csv'")


Sample meets the required conditions.
Sample saved to 'sample_iconanv2_longCS.csv'


  sample_df = filtered_groups.groupby(['hatespeechTarget', 'csType']).apply(lambda x: x.sample(1, replace=False)).reset_index(drop=True)


# Sample: Targets Equally Represented

In [3]:
import pandas as pd

# Load the datasets
train_df = pd.read_csv('data/intentconanv2/train.csv')
test_df = pd.read_csv('data/intentconanv2/test.csv')
val_df = pd.read_csv('data/intentconanv2/val.csv')

# Combine datasets
df = pd.concat([train_df, test_df, val_df])

# Remove HS-CS pairwise duplicates
df = df.drop_duplicates(subset=['hatespeech', 'counterspeech'])

# Save the combined dataset
df.to_csv('data/intentconanv2/combined.csv', index=False)

# Print the combined target distribution
target_distribution = df['hatespeechTarget'].value_counts()
print("Combined Target Distribution:")
print(target_distribution)

# Adjust hate speech targets as per the instructions
df['hatespeechTarget'] = df['hatespeechTarget'].replace({
    'men': 'other',
    'native_americans': 'other',
    'gypsies': 'other_people_of_color',
    'people_of_color': 'other_people_of_color',
    'asian_people': 'other_people_of_color'
})

# Print the new target distribution
target_distribution = df['hatespeechTarget'].value_counts()
print("New Target Distribution:")
print(target_distribution)

# Generate a random sample with equal target representation
min_count = target_distribution.min()  # Get the minimum count for equal sampling
equal_sample_df = df.groupby('hatespeechTarget').apply(lambda x: x.sample(min_count)).reset_index(drop=True)

# Save the equal target sample
equal_sample_df.to_csv('data/intentconanv2/equal-target-sample.csv', index=False)





Combined Target Distribution:
hatespeechTarget
muslims                            3652
immigrants                         2246
women                              2031
lgbtq+                             1796
jews                               1568
black_people                       1127
people_with_mental_disability       388
people_with_physical_disability     304
refugees                            280
other                               208
asian_people                        115
gypsies                              92
people_of_color                      80
men                                  40
native_americans                     16
Name: count, dtype: int64
New Target Distribution:
hatespeechTarget
muslims                            3652
immigrants                         2246
women                              2031
lgbtq+                             1796
jews                               1568
black_people                       1127
people_with_mental_disability       388
peopl

  equal_sample_df = df.groupby('hatespeechTarget').apply(lambda x: x.sample(min_count)).reset_index(drop=True)
