# Sampling iconan

#### Group by Target and cnType, then filter groups with at least 2 samples

In [3]:
import pandas as pd

# Load datasets
train_df = pd.read_csv('data/iconan/iconan_train.csv')
test_df = pd.read_csv('data/iconan/iconan_test.csv')
val_df = pd.read_csv('data/iconan/iconan_dev.csv')

# Combine datasets
df = pd.concat([train_df, test_df, val_df])

# Drop duplicates to ensure distinct hateSpeech
#df = df.drop_duplicates(subset=['hateSpeech'])   // Commenting this out since this is resulting in 100-102 rows only in sample, i.e., unable to produce 2 rows per Target-cnType pair

# Filter by counterSpeech length > 105
df = df[df['counterSpeech'].apply(len) > 105]

# Group by Target and cnType, then filter groups with at least 2 samples
filtered_groups = df.groupby(['Target', 'cnType']).filter(lambda x: len(x) >= 2)

# Sample 2 rows from each group
sample_df = filtered_groups.groupby(['Target', 'cnType']).apply(lambda x: x.sample(2, replace=False)).reset_index(drop=True)

# Verify the condition for each Target and cnType pair
sample_counts = sample_df.groupby(['Target', 'cnType']).size().reset_index(name='count')
if all(sample_counts['count'] == 2):
    print("Sample meets the required conditions.")
else:
    print("Sample does not meet the required conditions.")

# Save the sample to a new CSV file
sample_df.to_csv('data/iconan/sample_iconan.csv', index=False)

print("Sample saved to 'sample_iconan.csv'")


Sample meets the required conditions.
Sample saved to 'sample_iconan.csv'


  sample_df = filtered_groups.groupby(['Target', 'cnType']).apply(lambda x: x.sample(2, replace=False)).reset_index(drop=True)


#### Non-uniform sample of longer counterSpeech instances

In [4]:
# Combine datasets
df = pd.concat([train_df, test_df, val_df])

# Drop duplicates to ensure distinct hateSpeech
#df = df.drop_duplicates(subset=['hateSpeech'])   // Commenting this out since this is resulting in 100-102 rows only in sample, i.e., unable to produce 2 rows per Target-cnType pair

# Filter by counterSpeech length > 200
df = df[df['counterSpeech'].apply(len) > 200]

# Group by Target and cnType, then filter groups with at least 2 samples
filtered_groups = df.groupby(['Target', 'cnType']).filter(lambda x: len(x) >= 1)

# Sample 2 rows from each group
sample_df = filtered_groups.groupby(['Target', 'cnType']).apply(lambda x: x.sample(1, replace=False)).reset_index(drop=True)

# Verify the condition for each Target and cnType pair
sample_counts = sample_df.groupby(['Target', 'cnType']).size().reset_index(name='count')
if all(sample_counts['count'] == 1):
    print("Sample meets the required conditions.")
else:
    print("Sample does not meet the required conditions.")

# Save the sample to a new CSV file
sample_df.to_csv('data/iconan/sample_iconan_longCS.csv', index=False)

print("Sample saved to 'sample_iconan_longCS.csv'")


Sample meets the required conditions.
Sample saved to 'sample_iconan_longCS.csv'


  sample_df = filtered_groups.groupby(['Target', 'cnType']).apply(lambda x: x.sample(1, replace=False)).reset_index(drop=True)


# Sample: Targets Equally Represented

In [6]:
import pandas as pd

# Load the datasets
train_df = pd.read_csv('data/iconan/iconan_train.csv')
test_df = pd.read_csv('data/iconan/iconan_test.csv')
val_df = pd.read_csv('data/iconan/iconan_dev.csv')

# Combine datasets
df = pd.concat([train_df, test_df, val_df])

# Remove HS-CS pairwise duplicates
df = df.drop_duplicates(subset=['hateSpeech', 'counterSpeech'])

# Save the combined dataset
df.to_csv('data/iconan/combined.csv', index=False)

# Print the combined target distribution
target_distribution = df['Target'].value_counts()
print("Combined Target Distribution:")
print(target_distribution)

# Generate a random sample with equal target representation
min_count = target_distribution.min()  # Get the minimum count for equal sampling
equal_sample_df = df.groupby('Target').apply(lambda x: x.sample(min_count)).reset_index(drop=True)

# Randomize the DataFrame
equal_sample_df = equal_sample_df.sample(frac=1).reset_index(drop=True)

# Save the equal target sample
equal_sample_df.to_csv('data/iconan/equal-target-sample.csv', index=False)

Combined Target Distribution:
Target
MUSLIMS     5087
MIGRANTS     946
WOMEN        662
LGBT+        617
JEWS         594
POC          352
other        268
DISABLED     220
Name: count, dtype: int64


  equal_sample_df = df.groupby('Target').apply(lambda x: x.sample(min_count)).reset_index(drop=True)


# 40-per-Target Sample

In [9]:
import pandas as pd

df = pd.read_csv('data/iconan/combined.csv')

#Filter by counterspeech length > 150
df = df[df['counterSpeech'].apply(len) > 150]

target_distribution = df['Target'].value_counts()
print("Combined Target Distribution:")
print(target_distribution)

# Generate a random sample with equal (40) target representation
min_count = 40
equal_sample_df = df.groupby('Target').apply(lambda x: x.sample(min_count)).reset_index(drop=True)

# Randomize the DataFrame
equal_sample_df = equal_sample_df.sample(frac=1).reset_index(drop=True)

# Save the equal target (40) sample
equal_sample_df.to_csv('data/iconan/40-per-target-sample.csv', index=False)



Combined Target Distribution:
Target
MUSLIMS     1591
MIGRANTS     343
JEWS         251
LGBT+        248
WOMEN        234
POC          155
DISABLED     101
other         91
Name: count, dtype: int64


  equal_sample_df = df.groupby('Target').apply(lambda x: x.sample(min_count)).reset_index(drop=True)
