# Import Libraries

In [1]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the Dataset

In [2]:
# Path of the CSV file to load (adjacent literals are joined into one string).
dataset_file_location = (
    '/content/drive/MyDrive/B.Tech. Final Year Project/'
    'Offensive Language Detection/Datasets/m_dataset_21_9/'
    'cleaned_merged_dataset_21_9_arnab.csv'
)

df = pd.read_csv(dataset_file_location)

# Non-null count of every column, followed by the distribution of the
# 'hate' column: first as absolute counts, then as fractions of the total.
print(df.count())
for heading, as_ratio in (
    ('\nHate Class Count', False),
    ('\nHate Class Count Ratio', True),
):
    print(heading)
    print(df['hate'].value_counts(normalize=as_ratio))

text    90345
hate    90345
dtype: int64

Hate Class Count
0    52028
1    38317
Name: hate, dtype: int64

Hate Class Count Ratio
0    0.575881
1    0.424119
Name: hate, dtype: float64


# Train-Test-Validation Split

In [3]:
# Both splits are stratified on the label and use the same fixed seed.
features, labels = df['text'], df['hate']

# First cut: hold out 20% of all rows as the test set.
X_train_val, X_test, y_train_val, y_test = train_test_split(
    features, labels,
    test_size=0.20, random_state=42, stratify=labels,
)

# Second cut: 12.5% of the remaining 80% (about 10% of all rows) becomes
# the validation set; the rest (about 70%) is the training set.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val,
    test_size=0.125, random_state=42, stratify=y_train_val,
)

# Combine the Text and Label of Each Split into One DataFrame

In [4]:
# Put each split's text and label series side by side (column-wise) so that
# every split becomes a single two-column DataFrame.
df_train, df_test, df_val = (
    pd.concat([split_text, split_label], axis=1)
    for split_text, split_label in (
        (X_train, y_train),
        (X_test, y_test),
        (X_val, y_val),
    )
)

# Save the Dataset in a Folder

In [5]:
# Output folder: a 'dataset_12_10' directory beside the source CSV. The
# trailing '/' is intentional so file names can be appended directly.
df_folder_name = dataset_file_location.rsplit('/', 1)[0] + '/dataset_12_10/'

# exist_ok=True creates the folder only when needed, which makes re-running
# this cell safe and avoids the race in a separate exists()-then-create check.
os.makedirs(df_folder_name, exist_ok=True)

# Write each split to '<split>.csv'; index=False leaves the row index out of
# the file so only the data columns are saved.
for split_name, split_df in (
    ('train', df_train),
    ('test', df_test),
    ('val', df_val),
):
    split_df.to_csv(df_folder_name + split_name + '.csv', index=False)

# Save the Details of the Split

In [6]:
# Number of digits in the total count; used as the field width so that all
# counts in the report are right-aligned in one column.
total_text_count = df['text'].count()
data_count_len = len(str(total_text_count))
# Denominator for the percentage shown next to each split.
total_data_count = df.count()['hate']

# Labels are pre-padded with spaces so the colons line up in the report.
split_counts = (
    ('     Train Data Count', X_train.count()),
    ('      Test Data Count', X_test.count()),
    ('Validation Data Count', X_val.count()),
)

report_lines = [f'     Total Data Count: {total_text_count:{data_count_len}d}']
report_lines += [
    f'{label}: {count:{data_count_len}d} ({count / total_data_count * 100:.2f}%)'
    for label, count in split_counts
]

# Every line, including the last one, ends with a newline.
df_details = ''.join(line + '\n' for line in report_lines)

In [7]:
# Store the split summary as a text file in the same folder as the split
# CSVs; mode 'w' replaces any existing file of that name.
details_path = df_folder_name + 'dataset_details.txt'
with open(details_path, 'w') as details_file:
    details_file.write(df_details)