In [28]:
import pandas as pd

In [48]:
# Load every sub-dataset from disk and normalise it to two columns: Item, Tag

data_list = ['companies', 'dates', 'goods', 'location', 'random_str']


def _pick_item_column(df, name):
    """Return the label of the column in `df` that holds the items of dataset `name`.

    Preference order:
      1. a column literally called `name`;
      2. the only column, when there is just one;
      3. the first text-like (object or string dtype) column;
      4. the last column, when no text-like column exists.

    Raises:
        ValueError: if `df` has no columns left to choose from.
    """
    if df.shape[1] == 0:
        raise ValueError(f"Dataset {name!r} has no usable columns")
    if name in df.columns:
        return name
    if df.shape[1] == 1:
        return df.columns[0]
    # Accept dedicated string dtypes as well as plain object columns, so the
    # choice does not depend on how pandas stores text.
    text_cols = [
        c for c in df.columns
        if pd.api.types.is_object_dtype(df[c]) or pd.api.types.is_string_dtype(df[c])
    ]
    return text_cols[0] if text_cols else df.columns[-1]


frames = {}
for name in data_list:
    df = pd.read_csv(f"Sub_datasets/{name}.csv")

    # Drop accidental index columns like "Unnamed: 0"
    df = df.loc[:, ~df.columns.astype(str).str.match(r"^Unnamed")]

    # Standardize schema: one "Item" column plus the dataset name as "Tag"
    item_col = _pick_item_column(df, name)
    df = df[[item_col]].rename(columns={item_col: "Item"}).assign(Tag=name)

    frames[name] = df

# Keep each frame reachable under its dataset name (companies, dates, ...)
# for any cell that refers to it directly.
globals().update(frames)


# Concat all dataframes, in data_list order

master = pd.concat(
    [frames[name] for name in data_list],
    ignore_index=True
)
master.head()

Unnamed: 0,Item,Tag
0,Central Adams LLP,companies
1,First Industries & Associates,companies
2,Young Ventures Inc.,companies
3,Royal Foods Co.,companies
4,Western Clark & Co.,companies


In [32]:
# Class distribution: number of rows per Tag, most frequent first

print(master.loc[:, "Tag"].value_counts())

Tag
location      96118
companies     20000
dates         20000
random_str    20000
goods          5595
Name: count, dtype: int64


### Create two datasets:
#### 1. Keep all classes ~20k where possible. Later use class weights in the models. 
#### 2. Downsample all classes to ~5.5K, in case that is enough to train the model.

###### ~20k where possible

In [49]:
# Downsample the over-represented "location" class

# Upper bound on the number of location rows kept in the balanced set
MAX_LOCATION_ROWS = 20000

# Separate location and non-location rows
is_location = master["Tag"] == "location"
location_df = master[is_location]
other_df = master[~is_location]

# Randomly keep at most MAX_LOCATION_ROWS locations. Clamping to the number of
# available rows stops sample() from raising when there are fewer than the cap.
location_sampled = location_df.sample(
    n=min(MAX_LOCATION_ROWS, len(location_df)),
    random_state=42   # for reproducibility
)

# Recombine: untouched classes first, then the sampled locations
master_balanced = pd.concat(
    [other_df, location_sampled],
    ignore_index=True
)

print(master_balanced["Tag"].value_counts())

Tag
companies     20000
dates         20000
random_str    20000
location      20000
goods          5595
Name: count, dtype: int64


In [51]:
# index=False: the index here is just a fresh 0..n-1 range (ignore_index=True),
# and writing it would add an unnamed column that reloads as "Unnamed: 0".
master_balanced.to_csv('Train_dataset.csv', index=False)

###### ~5.5K all classes

In [52]:
# Downsample every class to the size of the smallest one (~5.5K rows)

min_count = master["Tag"].value_counts().min()

# Sample each class separately with the same seed, keeping whole rows so the
# result has the same columns as `master`. Sampling only the "Item" Series and
# calling reset_index(name="Item") would also turn the original row labels
# into a stray "level_1" column.
master_equal = pd.concat(
    [group.sample(n=min_count, random_state=42) for _, group in master.groupby("Tag")],
    ignore_index=True,
)

# Shuffle so that rows of one class are not stored contiguously
master_equal = master_equal.sample(frac=1, random_state=42).reset_index(drop=True)

print(master_equal["Tag"].value_counts())



Tag
dates         5595
random_str    5595
location      5595
companies     5595
goods         5595
Name: count, dtype: int64


In [54]:
# index=False: the index is a plain 0..n-1 range after reset_index(drop=True),
# and writing it would add an unnamed column that reloads as "Unnamed: 0".
master_equal.to_csv('Train_dataset_light.csv', index=False)