# 데이터 전처리 방안 수정
- `04_04_FE_.ipynb`의 내용을 기본적으로 유지
- Price Trend 피처 강화
- Revenue 피처 : item_price * item_cnt_day 로 매출액을 구한 뒤, 이에 대한 Lag feature를 추가
- ratio 피처 수정

# Module

In [1]:
import pandas as pd
import numpy as np
from itertools import product
import joblib
from sklearn.preprocessing import LabelEncoder

# Data

In [2]:
data_path = "./data/"

sales_train = pd.read_csv(data_path + 'sales_train.csv')
items = pd.read_csv(data_path + 'items.csv')
item_categories = pd.read_csv(data_path + 'item_categories.csv')
shops = pd.read_csv(data_path + 'shops.csv')

test = pd.read_csv(data_path + 'test.csv')
sub = pd.read_csv(data_path + 'sample_submission.csv')

print('Before Filter ShopID:', len(sales_train))
unique_test_shop_id = test['shop_id'].unique()
sales_train = sales_train[sales_train['shop_id'].isin(unique_test_shop_id)]
print('After Filter ShopID :', len(sales_train))

Before Filter ShopID: 2935849
After Filter ShopID : 2413246


In [3]:
def downcast(df, verbose=True):
    start_memory = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        dtype_name = df[col].dtype.name
        if dtype_name == 'object':
            pass
        elif dtype_name == 'bool':
            df[col] = df[col].astype('int8')
        elif dtype_name.startswith('int') or (df[col]%1==0).all():
            df[col] = pd.to_numeric(df[col], downcast='integer')
        elif dtype_name.startswith('float'):
            df[col] = pd.to_numeric(df[col], downcast='float')
    end_memory = df.memory_usage().sum() / 1024**2
    
    if verbose:
        print(f"Memory usage reduced from {start_memory:.2f} MB to {end_memory:.2f} MB")
        
    return df


data_files = [sales_train, items, shops, item_categories, test]
for file in data_files:
    file = downcast(file)

Memory usage reduced from 128.88 MB to 69.04 MB
Memory usage reduced from 0.51 MB to 0.23 MB
Memory usage reduced from 0.00 MB to 0.00 MB
Memory usage reduced from 0.00 MB to 0.00 MB
Memory usage reduced from 4.90 MB to 1.43 MB


In [4]:
train = []

for i in sales_train['date_block_num'].unique():
    all_shop = sales_train.loc[sales_train['date_block_num'] == i, 'shop_id'].unique()
    all_item = sales_train.loc[sales_train['date_block_num'] == i, 'item_id'].unique()
    train.append(np.array(list(product([i], all_shop, all_item))))

# idx features        
idx_features = ['date_block_num', 'shop_id', 'item_id']
train = pd.DataFrame(np.vstack(train), columns=idx_features)

test['date_block_num'] = 34
train.head()

Unnamed: 0,date_block_num,shop_id,item_id
0,0,59,22154
1,0,59,2552
2,0,59,2554
3,0,59,2555
4,0,59,2564


# Feature Engineering

## Outliers

In [5]:
print("Before : ",len(sales_train))
sales_train = sales_train[(sales_train['item_cnt_day'] > 0)&(sales_train['item_cnt_day'] < 1000)]
print("After item_cnt_day : ",len(sales_train))
sales_train = sales_train[(sales_train['item_price'] > 0)&(sales_train['item_price'] < 50000)]
print("After item_price : ",len(sales_train))

Before :  2413246
After item_cnt_day :  2407079
After item_price :  2407076


## Feature Generation

In [6]:
def add_mean_features(df: pd.DataFrame, groupby_features: list, mean_feature_list: list = None):
    col_name = [col for col in groupby_features if col != 'date_block_num']
    base_name = '_'.join([col.replace('_id', '') for col in col_name])
    feature_name = f'{base_name}_avg_date_sales'
    
    agg_rules = {
        feature_name: ('item_cnt_month', 'mean')
    }
            
    group = df.groupby(groupby_features).agg(**agg_rules).reset_index()
    
    df = df.merge(group, on=groupby_features, how='left')
    df = downcast(df, verbose=False)
    del group
    
    if mean_feature_list is not None:
        mean_feature_list.append(feature_name)
    
    return df


def add_lag_features(df: pd.DataFrame, key_features: list, lag_features_info: dict, lag_period: list):
    """
    lag_features_info: {'feature_name_1': clip_flag_1, 'feature_name_2': clip_flag_2, ...}
    """
    
    group_keys = [key for key in key_features if key != 'date_block_num']
    grouped = df.groupby(group_keys)
    
    all_lag_cols = []
    features_to_clip = []

    for lag_feature, clip_flag in lag_features_info.items():
        for i in lag_period:
            lag_feature_name = f'{lag_feature}_lag_{i}'
            df[lag_feature_name] = grouped[lag_feature].shift(i)
            
            all_lag_cols.append(lag_feature_name)
            if clip_flag:
                features_to_clip.append(lag_feature_name)

    df[all_lag_cols] = df[all_lag_cols]
    
    return df, features_to_clip

### shops

In [7]:
# 데이터 수정
sales_train.loc[sales_train['shop_id']==0, 'shop_id'] = 57
sales_train.loc[sales_train['shop_id']==1, 'shop_id'] = 58
sales_train.loc[sales_train['shop_id']==10, 'shop_id'] = 11
sales_train.loc[sales_train['shop_id']==39, 'shop_id'] = 40

test.loc[test['shop_id']==0, 'shop_id'] = 57
test.loc[test['shop_id']==1, 'shop_id'] = 58
test.loc[test['shop_id']==10, 'shop_id'] = 11
test.loc[test['shop_id']==39, 'shop_id'] = 40

# 도시 구분
shops['city'] = shops['shop_name'].str.split(' ').str[0]
shops.loc[shops['city'] == '!Якутск', 'city'] = 'Якутск'

# 쇼핑몰/센터를 나타내는 러시아어 약어로 상점 구분
mall_keywords = ['ТЦ', 'ТРК', 'ТРЦ', 'MALL', 'Молл']
is_mall = shops['shop_name'].apply(lambda x: any(keyword in x for keyword in mall_keywords))

shops['shop_type'] = np.where(is_mall, 'Mall', 'Standalone')

encoder = LabelEncoder()
for col in ['city', 'shop_type']:
    shops[col] = encoder.fit_transform(shops[col])

shops = shops.drop(columns=["shop_name"])

`shop_type` 만 추가

### items

In [8]:
items = items.drop(columns=["item_name"])
items['first_sale_month'] = sales_train.groupby('item_id').agg({'date_block_num': 'min'})['date_block_num']
items = items.fillna(34)

##### 아이템별 평균가격

In [9]:
# 아이템별 평균 가격
group = sales_train.groupby(['item_id']).agg(item_avg_price=('item_price', 'mean')).reset_index()
items = items.merge(group, on='item_id', how='left')
items.head()

Unnamed: 0,item_id,item_category_id,first_sale_month,item_avg_price
0,0,40,34.0,
1,1,76,15.0,4490.0
2,2,40,34.0,
3,3,40,34.0,
4,4,40,34.0,


In [10]:
temp = sales_train.merge(items[['item_id', 'item_category_id']], on='item_id', how='left')
cat_price_map = temp.groupby('item_category_id')['item_price'].mean()
items['item_avg_price'] = items['item_avg_price'].fillna(items['item_category_id'].map(cat_price_map))
item_avg_price = items[['item_id', 'item_avg_price']]

del temp, cat_price_map, items['item_avg_price']

item_avg_price.head()

Unnamed: 0,item_id,item_avg_price
0,0,259.880724
1,1,4490.0
2,2,259.880724
3,3,259.880724
4,4,259.880724


##### 월별 아이템 평균가격

In [11]:
date_item_avg_price = sales_train.groupby(['date_block_num', 'item_id']).agg(date_item_avg_price=('item_price', 'mean')).reset_index()
date_item_avg_price.head()

Unnamed: 0,date_block_num,item_id,date_item_avg_price
0,0,19,28.0
1,0,27,2397.5
2,0,28,549.0
3,0,29,2499.0
4,0,32,348.889199


### item_categories

In [12]:
def split_categories(df):
    def split_func(name):
        if '-' in name:
            return name.split('-', 1)
        else:
            return [name, name]
    
    split_result = df['item_category_name'].apply(lambda x: split_func(x))
    df['type'] = split_result.apply(lambda x: x[0].strip())
    df['subtype'] = split_result.apply(lambda x: x[1].strip())
    return df


item_categories = split_categories(item_categories)
etc_types = item_categories['type'].value_counts()[item_categories['type'].value_counts()<5].index.tolist()
item_categories.loc[item_categories['type'].isin(etc_types), 'type'] = 'etc'
item_categories['type'] = LabelEncoder().fit_transform(item_categories['type'])
item_categories['subtype'] = LabelEncoder().fit_transform(item_categories['subtype'])
item_categories = item_categories.drop('item_category_name', axis=1)

### revenue

In [13]:
sales_train['revenue'] = sales_train['item_cnt_day'] * sales_train['item_price']
sales_train.head()

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day,revenue
0,02.01.2013,0,59,22154,999.0,1,999.0
1,03.01.2013,0,25,2552,899.0,1,899.0
3,06.01.2013,0,25,2554,1709.05,1,1709.05
4,15.01.2013,0,25,2555,1099.0,1,1099.0
5,10.01.2013,0,25,2564,349.0,1,349.0


### set up matrix

In [14]:
group = sales_train.groupby(idx_features).agg(
    item_cnt_month = ('item_cnt_day', 'sum'),
    transaction_cnt = ('item_cnt_day', 'count'),
    mean_item_price = ('item_price', 'mean'),
    date_revenue = ('revenue', 'sum')
).reset_index()

train = train.merge(group, on=idx_features, how='left')
all_data = pd.concat([train, test.drop('ID', axis=1)], ignore_index=True, keys=idx_features)
all_data = all_data.merge(shops, on='shop_id', how='left')
all_data = all_data.merge(items, on='item_id', how='left')
all_data = all_data.merge(item_avg_price, on='item_id', how='left')
all_data = all_data.merge(date_item_avg_price, on=['date_block_num', 'item_id'], how='left')
all_data = all_data.merge(item_categories, on='item_category_id', how='left')
all_data.head()

  all_data = pd.concat([train, test.drop('ID', axis=1)], ignore_index=True, keys=idx_features)


Unnamed: 0,date_block_num,shop_id,item_id,item_cnt_month,transaction_cnt,mean_item_price,date_revenue,city,shop_type,item_category_id,first_sale_month,item_avg_price,date_item_avg_price,type,subtype
0,0,59,22154,1.0,1.0,999.0,999.0,30,0,37,0.0,678.166667,999.0,4,1
1,0,59,2552,,,,,30,0,58,0.0,940.666667,899.0,6,29
2,0,59,2554,,,,,30,0,58,0.0,1709.05,1709.05,6,29
3,0,59,2555,,,,,30,0,56,0.0,1122.076923,1099.0,6,5
4,0,59,2564,,,,,30,0,59,0.0,341.307692,349.0,6,42


In [15]:
all_data.isna().sum()

date_block_num               0
shop_id                      0
item_id                      0
item_cnt_month         7513914
transaction_cnt        7513914
mean_item_price        7513914
date_revenue           7513914
city                         0
shop_type                    0
item_category_id             0
first_sale_month             0
item_avg_price               0
date_item_avg_price     218553
type                         0
subtype                      0
dtype: int64

> 결측치 처리 방향

- `item_cnt_month`, `transaction_cnt`, `date_revenue`는 판매 유무로 인해 nan으로 표시된 걸로 추정 -> 결측치를 0으로 대체
- `mean_item_price` 는 `date_item_avg_price` 으로 대체
- 상기 4개 변수 중 타겟값인 `item_cnt_month` 제외 나머지 변수는 lag 변수 생성 후 삭제
- `date_item_avg_price` 는 월별 아이템카테고리 평균가격으로 대체

In [16]:
fill_zero_cols = ['item_cnt_month', 'transaction_cnt', 'date_revenue']
all_data[fill_zero_cols] = all_data[fill_zero_cols].fillna(0)

cat_date_price = all_data.groupby(['date_block_num', 'item_category_id'])['date_item_avg_price'].mean().reset_index()
cat_date_price.columns = ['date_block_num', 'item_category_id', 'date_cat_avg_price']
all_data = all_data.merge(cat_date_price, on=['date_block_num', 'item_category_id'], how='left')
all_data['date_item_avg_price'] = all_data['date_item_avg_price'].fillna(all_data['date_cat_avg_price'])

all_data['mean_item_price'] = all_data['mean_item_price'].fillna(all_data['date_item_avg_price'])

all_data = all_data.drop(columns=['date_cat_avg_price'])

In [17]:
need_lag_list = []
features_to_drop = []
need_lag_list.extend(['item_cnt_month', 'transaction_cnt', 'mean_item_price', 'item_avg_price', 'date_item_avg_price'])
features_to_drop.append('date_revenue')

In [18]:
del shops, items, item_categories, group, cat_date_price 
all_data = downcast(all_data)

Memory usage reduced from 848.81 MB to 378.18 MB


### mean features

In [19]:
all_data = add_mean_features(all_data, ['date_block_num', 'shop_id'])
all_data = add_mean_features(all_data, ['date_block_num', 'shop_id', 'item_category_id'])

all_data = add_mean_features(all_data, ['date_block_num', 'item_id'])

all_data = add_mean_features(all_data, ['date_block_num', 'item_category_id'])
all_data = add_mean_features(all_data, ['date_block_num', 'subtype'])

In [20]:
mean_feature_list = [col for col in all_data.columns if '_avg_date_sales' in col]
mean_feature_list

['shop_avg_date_sales',
 'shop_item_category_avg_date_sales',
 'item_avg_date_sales',
 'item_category_avg_date_sales',
 'subtype_avg_date_sales']

In [21]:
need_lag_list.extend(mean_feature_list)

### lag features

In [22]:
need_lag_list

['item_cnt_month',
 'transaction_cnt',
 'mean_item_price',
 'item_avg_price',
 'date_item_avg_price',
 'shop_avg_date_sales',
 'shop_item_category_avg_date_sales',
 'item_avg_date_sales',
 'item_category_avg_date_sales',
 'subtype_avg_date_sales']

In [23]:
%%time
lag_period = [1,2,3]
clip_list = []
lag_features_to_process = {}

lag_features_to_process['item_cnt_month'] = True
for lag_feature in need_lag_list:
    if lag_feature == 'item_cnt_month':
        continue
    lag_features_to_process[lag_feature] = False

all_data, clip_list = add_lag_features(all_data, 
                                        key_features=idx_features,
                                        lag_features_info=lag_features_to_process,
                                        lag_period=lag_period)

all_data = downcast(all_data)

features_to_drop.extend(need_lag_list)
features_to_drop.remove('item_cnt_month')

Memory usage reduced from 2058.98 MB to 1857.29 MB
CPU times: user 10.3 s, sys: 3.15 s, total: 13.4 s
Wall time: 13.4 s


### ratio features

In [24]:
epsilon = 1e-7

# add average of lagged sales
sales_lag_cols = [col for col in all_data.columns if 'item_cnt_month_lag_' in col]
all_data['avg_item_cnt_month_lag_mean'] = all_data[sales_lag_cols].mean(axis=1)
all_data[clip_list + ['item_cnt_month', 'avg_item_cnt_month_lag_mean']] = all_data[clip_list + ['item_cnt_month', 'avg_item_cnt_month_lag_mean']].clip(0, 20)

# 판매량 ratio
for col in sales_lag_cols:
    lag_num = col.split('_')[-1] 
    col_name = f'item_cnt_ratio_lag_{lag_num}'
    all_data[col_name] = all_data[col] / (all_data['avg_item_cnt_month_lag_mean'] + epsilon)
    all_data[col_name] = all_data[col_name].replace([np.inf, -np.inf], 0).fillna(0)
    all_data[col_name] = all_data[col_name].clip(0, 10)

# add average of lagged prices
price_lag_cols = [col for col in all_data.columns if 'avg_item_price_lag_' in col]
all_data['avg_item_price_lag_mean'] = all_data[price_lag_cols].mean(axis=1)

# 가격 ratio
for col in price_lag_cols:
    lag_num = col.split('_')[-1] 
    col_name = f'price_ratio_lag_{lag_num}'
    all_data[col_name] = all_data[col] / (all_data['avg_item_price_lag_mean'] + epsilon)
    all_data[col_name] = all_data[col_name].replace([np.inf, -np.inf], 0).fillna(0)
    all_data[col_name] = all_data[col_name].clip(0, 3)

In [25]:
# add average of lagged mean features
for mean_feat in mean_feature_list:
    lag_cols = [col for col in all_data.columns if f'{mean_feat}_lag_' in col]
    feat_name = f'avg_{mean_feat}_for_lag'
    all_data[feat_name] = all_data[lag_cols].mean(axis=1)

all_data = downcast(all_data)

Memory usage reduced from 2227.06 MB to 2185.04 MB


### trend features - price, revenue

In [26]:
# delta price lag
for i in lag_period:
    all_data[f'delta_price_lag_{i}'] = (all_data[f'item_avg_price_lag_{i}'] - all_data['item_avg_price']) / all_data['item_avg_price']

all_data['delta_price_lag'] = all_data['delta_price_lag_1']
all_data['delta_price_lag'] = all_data['delta_price_lag'].fillna(all_data['delta_price_lag_2'])
all_data['delta_price_lag'] = all_data['delta_price_lag'].fillna(all_data['delta_price_lag_3'])
all_data['delta_price_lag'] = all_data['delta_price_lag'].fillna(0)

features_to_drop.extend([f'delta_price_lag_{i}' for i in lag_period])
all_data = downcast(all_data)

Memory usage reduced from 2453.97 MB to 2294.30 MB


In [27]:
# add shop_date_avg_revenue
shop_date_rev = all_data.groupby(['date_block_num', 'shop_id'])['date_revenue'].mean().reset_index()
shop_date_rev.columns = ['date_block_num', 'shop_id', 'shop_date_avg_revenue']

all_data = all_data.merge(shop_date_rev, on=['date_block_num', 'shop_id'], how='left')
features_to_drop.append('shop_date_avg_revenue')

In [28]:
# add shop_avg_revenue
shop_avg_revenue = all_data.groupby('shop_id')['shop_date_avg_revenue'].mean().reset_index()
shop_avg_revenue.columns = ['shop_id', 'shop_revenue_mean']
all_data = all_data.merge(shop_avg_revenue, on='shop_id', how='left')

features_to_drop.append('shop_revenue_mean')
del shop_date_rev, shop_avg_revenue

In [29]:
# add lag features for shop_date_avg_revenue
lag_features_to_process = {'shop_date_avg_revenue': False}
all_data, clip_list = add_lag_features(all_data, 
                                        key_features=idx_features,
                                        lag_features_info=lag_features_to_process,
                                        lag_period=lag_period)

all_data = downcast(all_data)

Memory usage reduced from 2630.46 MB to 2462.38 MB


In [30]:
# step3. delta revenue
for i in lag_period:
    all_data[f'delta_shop_revenue_lag_{i}'] = (all_data[f'shop_date_avg_revenue_lag_{i}'] - all_data['shop_revenue_mean']) / all_data['shop_revenue_mean']

all_data['delta_shop_revenue_lag'] = all_data['shop_date_avg_revenue_lag_1']
all_data['delta_shop_revenue_lag'] = all_data['delta_shop_revenue_lag'].fillna(all_data['shop_date_avg_revenue_lag_2'])
all_data['delta_shop_revenue_lag'] = all_data['delta_shop_revenue_lag'].fillna(all_data['shop_date_avg_revenue_lag_3'])
all_data['delta_shop_revenue_lag'] = all_data['delta_shop_revenue_lag'].fillna(0)

features_to_drop.extend([f'delta_shop_revenue_lag_{i}' for i in lag_period])
features_to_drop.extend([f'shop_date_avg_revenue_lag_{i}' for i in lag_period])

### new item

In [31]:
all_data['new_item'] = all_data['first_sale_month'] == all_data['date_block_num']
all_data['item_age'] = all_data['date_block_num'] - all_data['first_sale_month']
features_to_drop.append('first_sale_month')

### since last sale date

In [32]:
all_data['temp_last_sale'] = np.nan
all_data.loc[all_data['item_cnt_month']>0, 'temp_last_sale'] = all_data['date_block_num']

last_sale_record = all_data.groupby(['item_id', 'shop_id'])['temp_last_sale'].shift(1).ffill()

all_data['since_last_sale'] = (all_data['date_block_num'] - last_sale_record).fillna(-1)
all_data = all_data.drop(columns='temp_last_sale')

### days & month

In [33]:
all_data['month'] = all_data['date_block_num'] % 12
days = pd.Series([31,28,31,30,31,30,31,31,30,31,30,31])
all_data['days'] = all_data['month'].map(days)

In [34]:
# seasonal average features
all_data['month_avg_sales'] = np.nan
all_data['month_avg_price'] = np.nan

for month_t in all_data['date_block_num'].unique():
    mask = all_data[all_data['date_block_num'] < month_t]
    if not mask.empty:
        group = mask.groupby(['month', 'item_category_id']).agg(
            month_avg_sales=('item_cnt_month', 'mean'),
            month_avg_price=('mean_item_price', 'mean')
        ).reset_index()
        
        group['key'] = group['month'].astype(str) + '_' + group['item_category_id'].astype(str)
        sales_map_dict = dict(zip(group['key'], group['month_avg_sales']))
        price_map_dict = dict(zip(group['key'], group['month_avg_price']))
        
        current_month = (all_data['date_block_num'] == month_t)
        current_month_keys = all_data.loc[current_month, 'month'].astype(str) + '_' + all_data.loc[current_month, 'item_category_id'].astype(str)
        
        all_data.loc[current_month, 'month_avg_sales'] = current_month_keys.map(sales_map_dict)
        all_data.loc[current_month, 'month_avg_price'] = current_month_keys.map(price_map_dict)

all_data['month_avg_price'] = all_data['month_avg_price'].fillna(all_data['month_avg_price'].mean())
all_data['month_avg_sales'] = all_data['month_avg_sales'].fillna(all_data['month_avg_sales'].mean())

all_data = downcast(all_data)

Memory usage reduced from 2890.98 MB to 2739.71 MB


### removing columns

In [35]:
features_to_drop

['date_revenue',
 'transaction_cnt',
 'mean_item_price',
 'item_avg_price',
 'date_item_avg_price',
 'shop_avg_date_sales',
 'shop_item_category_avg_date_sales',
 'item_avg_date_sales',
 'item_category_avg_date_sales',
 'subtype_avg_date_sales',
 'delta_price_lag_1',
 'delta_price_lag_2',
 'delta_price_lag_3',
 'shop_date_avg_revenue',
 'shop_revenue_mean',
 'delta_shop_revenue_lag_1',
 'delta_shop_revenue_lag_2',
 'delta_shop_revenue_lag_3',
 'shop_date_avg_revenue_lag_1',
 'shop_date_avg_revenue_lag_2',
 'shop_date_avg_revenue_lag_3',
 'first_sale_month']

In [36]:
all_data = all_data.drop(columns=features_to_drop)
all_data.columns

Index(['date_block_num', 'shop_id', 'item_id', 'item_cnt_month', 'city',
       'shop_type', 'item_category_id', 'type', 'subtype',
       'item_cnt_month_lag_1', 'item_cnt_month_lag_2', 'item_cnt_month_lag_3',
       'transaction_cnt_lag_1', 'transaction_cnt_lag_2',
       'transaction_cnt_lag_3', 'mean_item_price_lag_1',
       'mean_item_price_lag_2', 'mean_item_price_lag_3',
       'item_avg_price_lag_1', 'item_avg_price_lag_2', 'item_avg_price_lag_3',
       'date_item_avg_price_lag_1', 'date_item_avg_price_lag_2',
       'date_item_avg_price_lag_3', 'shop_avg_date_sales_lag_1',
       'shop_avg_date_sales_lag_2', 'shop_avg_date_sales_lag_3',
       'shop_item_category_avg_date_sales_lag_1',
       'shop_item_category_avg_date_sales_lag_2',
       'shop_item_category_avg_date_sales_lag_3', 'item_avg_date_sales_lag_1',
       'item_avg_date_sales_lag_2', 'item_avg_date_sales_lag_3',
       'item_category_avg_date_sales_lag_1',
       'item_category_avg_date_sales_lag_2',
       'it

In [37]:
print(all_data.shape)

(8812244, 58)


## drop rows with `date_block_num` < 3

In [38]:
all_data = all_data[all_data['date_block_num']>=3].fillna(0)
print(all_data.shape)
all_data = downcast(all_data)

(8026950, 58)
Memory usage reduced from 1806.60 MB to 1645.85 MB


In [39]:
joblib.dump(all_data, data_path + 'all_data2.joblib')

['./data/all_data2.joblib']