# Libraries

In [7]:
import pandas as pd

# Loading the Dataset

In [8]:
# Read the raw dataset. Although the file name ends in .xls, it is parsed
# here as tab-separated text.
df = pd.read_csv(
    "TRNcod.xls",
    delimiter="\t",  # `delimiter` is the documented alias of `sep`
)
# Report how many rows were loaded.
print(f"DATAFRAME SIZE: {len(df)}")

DATAFRAME SIZE: 389196


# Separating the DataFrame into Classes

In [9]:
# Build one frame per class: keep the rows whose indicator column equals 1.
df_1_1 = df.loc[df["IND_BOM_1_1"].eq(1)]
df_1_2 = df.loc[df["IND_BOM_1_2"].eq(1)]

# Show the number of rows that landed in each class frame.
print(
    "DATAFRAME SIZE:",
    f"IND_BOM_1_1: {len(df_1_1)}",
    f"IND_BOM_1_2: {len(df_1_2)}",
    sep="\n",
)

DATAFRAME SIZE:
IND_BOM_1_1: 255098
IND_BOM_1_2: 134098


# Randomizing DataFrame

In [10]:
# Shuffle every class frame with a fixed seed and renumber its rows from 0,
# so that each row carries its own index label afterwards.
df_1_1, df_1_2 = (
    class_frame.sample(frac=1, random_state=1).reset_index(drop=True)
    for class_frame in (df_1_1, df_1_2)
)

# Splitting the Data into Train, Validation and Test Sets

## IND_BOM_1_1

In [11]:
# Split class IND_BOM_1_1 into train (half of the rows), validation (half of
# the remainder) and test (whatever is left).

# The drop-by-index steps below are only correct when every row has its own
# index label; fail loudly if that does not hold.
assert df_1_1.index.is_unique, "df_1_1 must have a unique index before splitting!"

# A fixed random_state makes the split reproducible on every
# Restart & Run All, like the seeded shuffles used elsewhere in this notebook.
df_1_1_train = df_1_1.sample(frac=0.5, random_state=1)
# NOTE: df_1_1 is overwritten with the rows that were not picked for training,
# so re-running this cell on its own would split the already reduced frame.
df_1_1 = df_1_1.drop(df_1_1_train.index)
df_1_1_val = df_1_1.sample(frac=0.5, random_state=1)
df_1_1_test = df_1_1.drop(df_1_1_val.index)

# Renumber each split from 0 so the old row labels do not leak into later steps.
df_1_1_train = df_1_1_train.reset_index(drop=True)
df_1_1_val = df_1_1_val.reset_index(drop=True)
df_1_1_test = df_1_1_test.reset_index(drop=True)

In [12]:
# Row counts of the three IND_BOM_1_1 splits, one per output line.
print(
    "DATAFRAME SIZE:",
    f"TRAIN DATA: {len(df_1_1_train)}",
    f"VALIDATION DATA: {len(df_1_1_val)}",
    f"TEST DATA: {len(df_1_1_test)}",
    sep="\n",
)

DATAFRAME SIZE:
TRAIN DATA: 127549
VALIDATION DATA: 63774
TEST DATA: 63775


## IND_BOM_1_2

In [13]:
# Split class IND_BOM_1_2 into train (half of the rows), validation (half of
# the remainder) and test (whatever is left).

# The drop-by-index steps below are only correct when every row has its own
# index label; fail loudly if that does not hold.
assert df_1_2.index.is_unique, "df_1_2 must have a unique index before splitting!"

# A fixed random_state makes the split reproducible on every
# Restart & Run All, like the seeded shuffles used elsewhere in this notebook.
df_1_2_train = df_1_2.sample(frac=0.5, random_state=1)
# NOTE: df_1_2 is overwritten with the rows that were not picked for training,
# so re-running this cell on its own would split the already reduced frame.
df_1_2 = df_1_2.drop(df_1_2_train.index)
df_1_2_val = df_1_2.sample(frac=0.5, random_state=1)
df_1_2_test = df_1_2.drop(df_1_2_val.index)

# Renumber each split from 0 so the old row labels do not leak into later steps.
df_1_2_train = df_1_2_train.reset_index(drop=True)
df_1_2_val = df_1_2_val.reset_index(drop=True)
df_1_2_test = df_1_2_test.reset_index(drop=True)


In [14]:
# Row counts of the three IND_BOM_1_2 splits, one per output line.
print(
    "DATAFRAME SIZE:",
    f"TRAIN DATA: {len(df_1_2_train)}",
    f"VALIDATION DATA: {len(df_1_2_val)}",
    f"TEST DATA: {len(df_1_2_test)}",
    sep="\n",
)

DATAFRAME SIZE:
TRAIN DATA: 67049
VALIDATION DATA: 33524
TEST DATA: 33525


# Oversampling the Minority Class by Appending Its First Rows

In [15]:
def _oversample_to(minority, target_len):
    """Grow `minority` to `target_len` rows by repeating its rows in order.

    Rows are appended starting again from the first row, as many times as
    needed, so the result always has exactly `target_len` rows even when the
    target is more than twice the current size.

    A frame that is empty or already has at least `target_len` rows is
    returned unchanged; it is never truncated.
    """
    current_len = len(minority)
    if current_len == 0 or current_len >= target_len:
        return minority
    copies_needed = -(-target_len // current_len)  # ceiling division
    return pd.concat([minority] * copies_needed, axis=0).iloc[:target_len]


# Bring the minority class (IND_BOM_1_2) up to the size of the majority class
# in the train and validation splits. The test split is left as it is.
df_1_2_train = _oversample_to(df_1_2_train, len(df_1_1_train))
df_1_2_val = _oversample_to(df_1_2_val, len(df_1_1_val))

In [16]:
# Row counts of the IND_BOM_1_2 splits after the oversampling step.
print(
    "DATAFRAME SIZE:",
    f"TRAIN DATA: {len(df_1_2_train)}",
    f"VALIDATION DATA: {len(df_1_2_val)}",
    f"TEST DATA: {len(df_1_2_test)}",
    sep="\n",
)

DATAFRAME SIZE:
TRAIN DATA: 127549
VALIDATION DATA: 63774
TEST DATA: 33525


# Merging Both Classes into a Single Dataset per Split

In [17]:
# Stack the two class frames row-wise (pd.concat's default axis) for each split.
train_data = pd.concat([df_1_1_train, df_1_2_train])
val_data = pd.concat([df_1_1_val, df_1_2_val])
test_data = pd.concat([df_1_1_test, df_1_2_test])

In [18]:
# Shuffle each merged split with a fixed seed so the two classes are
# interleaved, then renumber the rows from 0.
train_data, val_data, test_data = (
    merged.sample(frac=1, random_state=1).reset_index(drop=True)
    for merged in (train_data, val_data, test_data)
)

In [19]:
# Row counts of the merged train, validation and test sets.
print(
    "DATAFRAME SIZE:",
    f"TRAIN DATA: {len(train_data)}",
    f"VALIDATION DATA: {len(val_data)}",
    f"TEST DATA: {len(test_data)}",
    sep="\n",
)

DATAFRAME SIZE:
TRAIN DATA: 255098
VALIDATION DATA: 127548
TEST DATA: 97300


# Checking That the Steps Were Done Correctly

## Checking for Overlap Between the Train, Validation and Test Sets

In [20]:
# Distinct values of the "INDEX" column in each split; these sets are used to
# check that the splits do not share rows.
train_index = {*train_data["INDEX"]}
val_index = {*val_data["INDEX"]}
test_index = {*test_data["INDEX"]}

In [21]:
# Row counts of the IND_BOM_1_2 splits.
# NOTE(review): this prints the same frames as the printout right after the
# oversampling step; confirm whether another quantity was meant to be shown.
print(
    "DATAFRAME SIZE:",
    f"TRAIN DATA: {len(df_1_2_train)}",
    f"VALIDATION DATA: {len(df_1_2_val)}",
    f"TEST DATA: {len(df_1_2_test)}",
    sep="\n",
)

DATAFRAME SIZE:
TRAIN DATA: 127549
VALIDATION DATA: 63774
TEST DATA: 33525


In [22]:
# "INDEX" values shared by each pair of splits; every set must be empty,
# otherwise the same source row appears in more than one split.
train_val_intersection = train_index.intersection(val_index)
train_test_intersection = train_index.intersection(test_index)
val_test_intersection = val_index.intersection(test_index)

assert not train_val_intersection, f"Intersection {train_val_intersection} found between train and validation datasets!"
assert not train_test_intersection, f"Intersection {train_test_intersection} found between train and test datasets!"
# Check the validation/test pair itself; testing the train/test set again here
# would leave validation/test overlap undetected.
assert not val_test_intersection, f"Intersection {val_test_intersection} found between validation and test datasets!"

## Checking That Both Classes Have the Same Amount of Data

In [23]:
# In the train and validation sets, the number of rows flagged as IND_BOM_1_1
# must equal the number flagged as IND_BOM_1_2. The test set is not checked.
assert (train_data["IND_BOM_1_1"] == 1).sum() == (train_data["IND_BOM_1_2"] == 1).sum(), "Train data classes have mismatching sizes!"
assert (val_data["IND_BOM_1_1"] == 1).sum() == (val_data["IND_BOM_1_2"] == 1).sum(), "Validation data classes have mismatching sizes!"

# Saving Data to CSV File

In [24]:
# Write each split to its own CSV file in the current working directory,
# using to_csv's default options.
train_data.to_csv(path_or_buf="train.csv")
val_data.to_csv(path_or_buf="val.csv")
test_data.to_csv(path_or_buf="test.csv")