# Sampling Multitarget-CONAN

In [14]:
import pandas as pd
import os

# Build a TARGET-balanced sample of the dataset plus the complementary
# "remaining" split, shuffle both, and write them to CSV.

# Load the dataset
input_file = 'data/mtc/Multitarget-CONAN.csv'
df = pd.read_csv(input_file)

# Keep only rows whose COUNTER_NARRATIVE is longer than 30 characters
filtered_df = df[df['COUNTER_NARRATIVE'].str.len() > 30]

# Size of the smallest TARGET group after filtering; every group is
# down-sampled to this size so the sample is balanced.
min_count = filtered_df['TARGET'].value_counts().min()

# Draw min_count rows from each TARGET group.
# Each group is sampled explicitly and the pieces concatenated instead of
# calling groupby(...).apply(...): apply() on the grouping column is
# deprecated and newer pandas versions exclude that column from the result,
# which would drop TARGET from the sample. Every group is still sampled with
# random_state=1, in sorted group order, so the selected rows are unchanged.
sample_df = pd.concat(
    [
        group.sample(n=min_count, random_state=1)
        for _, group in filtered_df.groupby('TARGET')
    ],
    ignore_index=True,
)

# The remaining split is everything in the ORIGINAL (unfiltered) dataset whose
# INDEX was not sampled, so it also contains the short counter narratives.
# NOTE(review): assumes INDEX values identify rows uniquely - confirm.
sampled_indices = sample_df['INDEX'].values
remaining_df = df[~df['INDEX'].isin(sampled_indices)]

# Shuffle both splits (fixed seed for reproducibility)
sample_df = sample_df.sample(frac=1, random_state=1).reset_index(drop=True)
remaining_df = remaining_df.sample(frac=1, random_state=1).reset_index(drop=True)

# Output locations for the two splits
output_sample_file = 'data/mtc/sample_mtc.csv'
output_remaining_file = 'data/mtc/remaining_mtc.csv'

# Ensure the output directory exists before saving (both files share it)
os.makedirs(os.path.dirname(output_sample_file), exist_ok=True)

# Save DataFrames to CSV
sample_df.to_csv(output_sample_file, index=False)
remaining_df.to_csv(output_remaining_file, index=False)

# Output the shapes of the dataframes to verify sizes
print("Sampled DataFrame shape:", sample_df.shape)
print("Remaining DataFrame shape:", remaining_df.shape)


Sampled DataFrame shape: (1760, 5)
Remaining DataFrame shape: (3243, 5)


  sample_df = filtered_df.groupby('TARGET').apply(lambda x: x.sample(n=min_count, random_state=1)).reset_index(drop=True)


# Long-CS Sample

In [15]:
import pandas as pd

# Extract the "long counter-speech" subset: every row whose COUNTER_NARRATIVE
# exceeds 100 characters, shuffled with a fixed seed and written to CSV.

# Input and output locations
input_file = 'data/mtc/Multitarget-CONAN.csv'
output_sample_file = 'data/mtc/longCS_sample_mtc.csv'

# Read the full dataset
df = pd.read_csv(input_file)

# Character length of each counter narrative, then keep the long ones
narrative_lengths = df['COUNTER_NARRATIVE'].str.len()
long_rows = df.loc[narrative_lengths > 100]

# Shuffle the rows (seeded) and renumber them from zero
shuffled_rows = long_rows.sample(frac=1, random_state=1)
filtered_df = shuffled_rows.reset_index(drop=True)

# Write the subset to disk without the positional index column
filtered_df.to_csv(output_sample_file, index=False)

# Report the resulting size as a sanity check
print("Sampled DataFrame shape:", filtered_df.shape)



Sampled DataFrame shape: (3615, 5)
