# Explore & Split the dataset into train and test sets

## Simple EDA

In [None]:
import pandas as pd
from pathlib import Path

In [None]:
# Load the raw cars dataset from a path relative to this notebook.
data_path = Path("../datasets/cars.csv")
cars = pd.read_csv(data_path)

# Preview the first rows.
cars.head()

In [None]:
# (rows, columns) of the full dataset.
cars.shape

In [None]:
# Column names, dtypes, non-null counts and memory usage.
cars.info()

In [None]:
# Summary statistics with pandas' default column selection.
cars.describe()

In [None]:
# Summary (count, unique, top, freq) restricted to the object-dtype columns.
cars.describe(include="object")

In [None]:
# One 50-bin histogram per numeric column, drawn on a 12x8 inch figure.
cars.hist(bins=50, figsize=(12, 8))

In [None]:
import matplotlib.pyplot as plt

# Names of all object-dtype (text) columns in the dataset.
cat_features = cars.select_dtypes(include=['object']).columns

# Draw one bar chart per categorical column: x = category, y = row count.
for feature in cat_features:
    counts = cars[feature].value_counts()
    categories, frequencies = counts.index, counts.values

    plt.bar(categories, frequencies)
    plt.title("Category distribution")
    plt.xlabel(feature)
    plt.ylabel("count")
    # Tilt the tick labels so longer category names overlap less.
    plt.xticks(rotation=45)
    plt.show()


# Split the dataset

We will use stratified sampling so that every car manufacturer is represented in the train and test sets in roughly the same proportion as in the full dataset.

In [None]:
# Frequency of each distinct value in the 'name' column.
name_counts = cars['name'].value_counts()
name_counts

In [None]:
# Number of distinct car names that occur more than 10 times.
len(name_counts[name_counts > 10])

In [None]:
# Derive a 'manufactor' column from the first whitespace-separated word of
# cars['name']; n=1 stops splitting after the first separator.
name_parts = cars['name'].str.split(n=1)
cars['manufactor'] = name_parts.str.get(0)

In [None]:
# Preview the first rows, now including the 'manufactor' column.
cars.head()

In [None]:
# Manufactors with fewer than `threshold` rows are merged into a single
# 'other' bucket.
threshold = 10
freq = cars['manufactor'].value_counts()
rare_manufactors = freq.loc[freq < threshold].index

# Overwrite only the rows whose manufactor is in the rare set.
is_rare = cars['manufactor'].isin(rare_manufactors)
cars['manufactor'] = cars['manufactor'].mask(is_rare, 'other')

cars['manufactor'].value_counts()


In [None]:
# Bar chart of the number of rows per manufactor label.
counts = cars['manufactor'].value_counts()

plt.bar(counts.index, counts.values)
plt.title("Category distribution")
plt.xlabel('manufactor')
plt.ylabel("count")
# Tilt the tick labels so the manufactor names overlap less.
plt.xticks(rotation=45)
plt.show()

In [None]:
from sklearn.model_selection import train_test_split

# 80/20 split that stratifies on the 'manufactor' column; the fixed seed
# keeps the split reproducible.
strat_train_set, strat_test_set = train_test_split(
    cars,
    test_size=0.2,
    random_state=42,
    stratify=cars['manufactor'],
)

In [None]:
# (rows, columns) of the stratified training set.
strat_train_set.shape

In [None]:
# (rows, columns) of the stratified test set.
strat_test_set.shape

In [None]:
# Preview the first rows of the stratified training set.
strat_train_set.head()

In [None]:
def manufactor_proportions(data):
    """Return each manufactor's share of the rows in ``data``.

    Args:
        data: DataFrame with a ``"manufactor"`` column.

    Returns:
        Series indexed by manufactor label holding the label's row count
        divided by the total number of rows in ``data``.
    """
    occurrences = data["manufactor"].value_counts()
    total_rows = len(data)
    return occurrences / total_rows

# Baseline: a purely random split with the same test size and seed as the
# stratified one, for the comparison table below.
train_set, test_set = train_test_split(cars, test_size=0.2,
                                       random_state=42)

# Per-manufactor share of rows in the full data and in each test set.
compare_props = pd.DataFrame({
    "Overall %": manufactor_proportions(cars),
    "Stratified %": manufactor_proportions(strat_test_set),
    "Random %": manufactor_proportions(test_set),
}).sort_index()
# The index holds manufactor labels, so label it accordingly.
compare_props.index.name = "Manufactor"
# Relative deviation of each test set's proportion from the overall one.
compare_props["Strat. Error %"] = (compare_props["Stratified %"] /
                                   compare_props["Overall %"] - 1)
compare_props["Rand. Error %"] = (compare_props["Random %"] /
                                  compare_props["Overall %"] - 1)
# Show everything as percentages rounded to two decimals.
(compare_props * 100).round(2)

In [None]:
# Keep the stratified split as the final train/test sets; the earlier
# random split serves as the baseline in the comparison table.
# The helper 'manufactor' column is removed from the final sets. The
# non-inplace drop() returns new frames, leaving strat_train_set and
# strat_test_set untouched and avoiding in-place edits of split results.
train_set = strat_train_set.drop(columns=['manufactor'])
test_set = strat_test_set.drop(columns=['manufactor'])
train_set.head()

In [None]:
# Output locations for the two splits, relative to this notebook.
train_path = Path("../datasets/train.csv")
test_path = Path("../datasets/test.csv")

# Write each split to CSV without the DataFrame index column.
for split_frame, destination in ((train_set, train_path),
                                 (test_set, test_path)):
    split_frame.to_csv(destination, index=False)