In [1]:
import os, sys
import re
import pandas as pd
import numpy as np
import datetime
import dateutil.relativedelta

In [2]:
train_data_path = '/opt/ml/code/input/train.csv'
train_df = pd.read_csv(train_data_path, parse_dates=['order_date'])

In [3]:
train_df.head()

Unnamed: 0,order_id,product_id,description,quantity,order_date,price,customer_id,country,total
0,489434,85048,15CM CHRISTMAS GLASS BALL 20 LIGHTS,12,2009-12-01 07:45:00,11.4675,13085,United Kingdom,137.61
1,489434,79323P,PINK CHERRY LIGHTS,12,2009-12-01 07:45:00,11.1375,13085,United Kingdom,133.65
2,489434,79323W,WHITE CHERRY LIGHTS,12,2009-12-01 07:45:00,11.1375,13085,United Kingdom,133.65
3,489434,22041,"RECORD FRAME 7"" SINGLE SIZE",48,2009-12-01 07:45:00,3.465,13085,United Kingdom,166.32
4,489434,21232,STRAWBERRY CERAMIC TRINKET BOX,24,2009-12-01 07:45:00,2.0625,13085,United Kingdom,49.5


In [4]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 780502 entries, 0 to 780501
Data columns (total 9 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   order_id     780502 non-null  object        
 1   product_id   780502 non-null  object        
 2   description  780502 non-null  object        
 3   quantity     780502 non-null  int64         
 4   order_date   780502 non-null  datetime64[ns]
 5   price        780502 non-null  float64       
 6   customer_id  780502 non-null  int64         
 7   country      780502 non-null  object        
 8   total        780502 non-null  float64       
dtypes: datetime64[ns](1), float64(2), int64(2), object(4)
memory usage: 53.6+ MB


In [7]:
tiers = [chr(65+i) for i in range(5)]
train = train_df.copy()
total_sum = train.groupby('customer_id').total.sum().to_frame(name="total-sum")
total_sum.reset_index()
tier_bin = pd.qcut(x=total_sum['total-sum'], q=5, labels=tiers)
tier_bin = tier_bin.to_frame(name="tier")
# tier_bin = tier_bin.reset_index()
# tier_bin.rename(columns = {'total-sum':'tier'}, inplace=True)
train = train.merge(tier_bin, on=['customer_id'])
print(train)

       order_id product_id                          description  quantity  \
0        489434      85048  15CM CHRISTMAS GLASS BALL 20 LIGHTS        12   
1        489434     79323P                   PINK CHERRY LIGHTS        12   
2        489434     79323W                  WHITE CHERRY LIGHTS        12   
3        489434      22041         RECORD FRAME 7" SINGLE SIZE         48   
4        489434      21232       STRAWBERRY CERAMIC TRINKET BOX        24   
...         ...        ...                                  ...       ...   
780497   579754     84997C      CHILDRENS CUTLERY POLKADOT BLUE         1   
780498   579754     84997B     CHILDRENS CUTLERY RETROSPOT RED          1   
780499   579754     84997A    CHILDRENS CUTLERY POLKADOT GREEN          2   
780500   579754      23353       6 GIFT TAGS VINTAGE CHRISTMAS         36   
780501   579754      22141       CHRISTMAS CRAFT TREE TOP ANGEL         6   

                order_date    price  customer_id         country     total 

In [37]:
first_column = '2009-12-01'
d = datetime.datetime.strptime(first_column, '%Y-%m-%d')
dates = []
dates.append(d)
delta = dateutil.relativedelta.relativedelta(days = 1)
for i in range(7):
    d = d + delta
    dates.append(d)

data = {'order_date': dates}
t_df = pd.DataFrame(data)

print(t_df)
print(type(t_df))

  order_date
0 2009-12-01
1 2009-12-02
2 2009-12-03
3 2009-12-04
4 2009-12-05
5 2009-12-06
6 2009-12-07
7 2009-12-08
<class 'pandas.core.frame.DataFrame'>


In [41]:
t_df.describe(include='all')

  """Entry point for launching an IPython kernel.


Unnamed: 0,order_date
count,8
unique,8
top,2009-12-08 00:00:00
freq,1
first,2009-12-01 00:00:00
last,2009-12-08 00:00:00


In [30]:
d = datetime.datetime.strptime('2019-12-01', "%Y-%m-%d")
prev_ym = d - dateutil.relativedelta.relativedelta(days=1)
prev_ym = prev_ym.strftime('%Y-%m-%d')
print(type(prev_ym))

<class 'str'>


# Year-month 로 피처 생성 (시계열 특성)

## 필요 함수 추가 (generate_label)

In [13]:
TOTAL_THRES = 300

'''
    입력인자로 받는 year_month에 대해 고객 ID별로 총 구매액이
    구매액 임계값을 넘는지 여부의 binary label을 생성하는 함수
'''
def generate_label(df, year_month, total_thres=TOTAL_THRES, print_log=False):
    df = df.copy()
    
    # year_month에 해당하는 label 데이터 생성
    df['year_month'] = df['order_date'].dt.strftime('%Y-%m')
    df.reset_index(drop=True, inplace=True)

    # year_month 이전 월의 고객 ID 추출
    cust = df[df['year_month']<year_month]['customer_id'].unique()
    # year_month에 해당하는 데이터 선택
    df = df[df['year_month']==year_month]
    
    # label 데이터프레임 생성
    label = pd.DataFrame({'customer_id':cust})
    label['year_month'] = year_month
    
    # year_month에 해당하는 고객 ID의 구매액의 합 계산
    grped = df.groupby(['customer_id','year_month'], as_index=False)[['total']].sum()
    print(grped.head(15))
    # label 데이터프레임과 merge하고 구매액 임계값을 넘었는지 여부로 label 생성
    label = label.merge(grped, on=['customer_id','year_month'], how='left')
    label['total'].fillna(0.0, inplace=True)
    label['label'] = (label['total'] > total_thres).astype(int)
    print(label.head(15))

    # 고객 ID로 정렬
    label = label.sort_values('customer_id').reset_index(drop=True)
    if print_log: print(f'{year_month} - final label shape: {label.shape}')
    
    return label

In [18]:
train['month'] = train['order_date'].dt.month
train['year_month'] = train['order_date'].dt.strftime('%Y-%m')

year_month = '2011-11'
df_label = generate_label(train, year_month)[['customer_id', 'year_month', 'label']]
df = train.copy()
df = df[df['order_date'] < year_month]

cols = ['month', 'year_month']

# customer_id 기준으로 pandas group by 후 aggregation feature 생성
df_agg = df.groupby(['customer_id'])[cols].agg([lambda x:x.value_counts().index[0]])

df_agg.columns = ['month-mode', 'year-month-mode']

# label 데이터와 feature merge
df_all = df_label.merge(df_agg, on=['customer_id'], how='left')
df_all.head(10)

    customer_id year_month       total
0         12349    2011-11   2899.9575
1         12352    2011-11    514.3545
2         12356    2011-11     96.2775
3         12357    2011-11  10242.6555
4         12362    2011-11    707.9985
5         12374    2011-11   1225.8345
6         12375    2011-11    374.8800
7         12380    2011-11   1716.6435
8         12381    2011-11    691.4490
9         12384    2011-11    482.7570
10        12388    2011-11    472.5600
11        12391    2011-11    725.4390
12        12395    2011-11   1270.2195
13        12397    2011-11   1971.8985
14        12406    2011-11   2960.1825
    customer_id year_month       total  label
0         13085    2011-11      0.0000      0
1         13078    2011-11   3190.8360      1
2         15362    2011-11      0.0000      0
3         18102    2011-11  25296.2820      1
4         12682    2011-11   3988.7100      1
5         18087    2011-11      0.0000      0
6         13635    2011-11      0.0000      0
7       

Unnamed: 0,customer_id,year_month,label,month-mode,year-month-mode
0,12346,2011-11,0,6,2010-06
1,12347,2011-11,0,10,2011-10
2,12348,2011-11,0,9,2010-09
3,12349,2011-11,1,10,2010-10
4,12350,2011-11,0,2,2011-02
5,12351,2011-11,0,11,2010-11
6,12352,2011-11,1,3,2011-03
7,12353,2011-11,0,10,2010-10
8,12354,2011-11,0,4,2011-04
9,12355,2011-11,0,5,2010-05


In [36]:
train['order_ts'] = train['order_date'].astype(np.int64)//1e9
train['order_ts_diff'] = train.groupby(['customer_id'])['order_ts'].diff()

year_month = '2011-11'
train_label = generate_label(train, year_month)[['customer_id', 'year_month', 'label']]
train = train[train['order_date'] < year_month]

agg_func = ['mean', 'max', 'min', 'sum', 'count', 'std', 'skew']
agg_dict = {
    'order_ts': ['first', 'last'],
    'order_ts_diff': agg_func,
}

train_agg = train.groupby(['customer_id']).agg(agg_dict)

new_cols = []
for col in agg_dict.keys():
    for stat in agg_dict[col]:
        new_cols.append(f'{col}-{stat}')
train_agg.columns = new_cols

train_all = train_label.merge(train_agg, on=['customer_id'], how='left')


Empty DataFrame
Columns: [customer_id, year_month, total]
Index: []
    customer_id year_month  total  label
0         13085    2011-11    0.0      0
1         13078    2011-11    0.0      0
2         15362    2011-11    0.0      0
3         18102    2011-11    0.0      0
4         12682    2011-11    0.0      0
5         18087    2011-11    0.0      0
6         13635    2011-11    0.0      0
7         14110    2011-11    0.0      0
8         12636    2011-11    0.0      0
9         17519    2011-11    0.0      0
10        13758    2011-11    0.0      0
11        12362    2011-11    0.0      0
12        15413    2011-11    0.0      0
13        16321    2011-11    0.0      0
14        17592    2011-11    0.0      0


In [37]:
train_all.shape

(5722, 12)

In [38]:
train_all.head()

Unnamed: 0,customer_id,year_month,label,order_ts-first,order_ts-last,order_ts_diff-mean,order_ts_diff-max,order_ts_diff-min,order_ts_diff-sum,order_ts_diff-count,order_ts_diff-std,order_ts_diff-skew
0,12346,2011-11,0,1260780000.0,1295346000.0,751438.695652,10197900.0,0.0,34566180.0,46,2323281.0,3.439852
1,12347,2011-11,0,1288535000.0,1320064000.0,150138.571429,7789020.0,0.0,31529100.0,210,911453.5,6.372104
2,12348,2011-11,0,1285600000.0,1316956000.0,627136.8,14955960.0,0.0,31356840.0,50,2475659.0,4.710186
3,12349,2011-11,0,1259931000.0,1288254000.0,267200.377358,14077560.0,0.0,28323240.0,106,1832081.0,7.131391
4,12350,2011-11,0,1296662000.0,1296662000.0,0.0,0.0,0.0,0.0,16,0.0,0.0


In [57]:
def add_trend(train, test, year_month):
    train = train.copy()
    test = test.copy()

    # year_month 이전 월 계산
    d = datetime.datetime.strptime(year_month, "%Y-%m")
    prev_ym_d = d - dateutil.relativedelta.relativedelta(months=1)

    train_window_ym = []
    test_window_ym = [] 
    for month_back in [1, 2, 3, 5, 7, 12, 20, 23]: # 1개월, 2개월, ... 20개월, 23개월 전 year_month 파악
        train_window_ym.append((prev_ym_d - dateutil.relativedelta.relativedelta(months = month_back)).strftime('%Y-%m'))
        test_window_ym.append((d - dateutil.relativedelta.relativedelta(months = month_back)).strftime('%Y-%m'))

    # aggregation 함수 선언
    agg_func = ['max','min','sum','mean','count','std','skew']
    print(train.columns)
    print(test.columns)
    cols = [x for x in train.columns.to_numpy() if x not in ['customer_id', 'label', 'year_month']]
    # group by aggregation with Dictionary
    agg_dict = { col: agg_func for col in cols }

    # general statistics for train data with time series trend
    for i, tr_ym in enumerate(train_window_ym):
        # group by aggretation 함수로 train 데이터 피처 생성
        train_agg = train.loc[(train['year_month'] >= tr_ym)].groupby(['customer_id']).agg(agg_dict) # 해당 year_month 이후부터 모든 데이터에 대한 aggregation을 실시

        # 멀티 레벨 컬럼을 사용하기 쉽게 1 레벨 컬럼명으로 변경
        new_cols = []
        for level1, level2 in train_agg.columns:
            new_cols.append(f'{level1}-{level2}-{i}')

        train_agg.columns = new_cols
        train_agg.reset_index(inplace = True)
        
        if i == 0:
            train_data = train_agg
        else:
            train_data = train_data.merge(train_agg, on=['customer_id'], how='right')


    # general statistics for test data with time series trend
    for i, tr_ym in enumerate(test_window_ym):
        # group by aggretation 함수로 test 데이터 피처 생성
        test_agg = test.loc[test['year_month'] >= tr_ym].groupby(['customer_id']).agg(agg_dict)

        # 멀티 레벨 컬럼을 사용하기 쉽게 1 레벨 컬럼명으로 변경
        new_cols = []
        for level1, level2 in test_agg.columns:
            new_cols.append(f'{level1}-{level2}-{i}')

        test_agg.columns = new_cols
        test_agg.reset_index(inplace = True)
        
        if i == 0:
            test_data = test_agg
        else:
            test_data = test_data.merge(test_agg, on=['customer_id'], how='right')

    return train_data, test_data

In [58]:
def feature_engineering1(df, year_month):
    df = df.copy()

    # year_month 이전 월 계산
    d = datetime.datetime.strptime(year_month, "%Y-%m")
    prev_ym = d - dateutil.relativedelta.relativedelta(months=1)
    prev_ym = prev_ym.strftime('%Y-%m')
    
    # train, test 데이터 선택
    train_1 = df[df['order_date'] < prev_ym]
    test_1 = df[df['order_date'] < year_month]


    train_2 = train_1.copy()
    test_2 = test_1.copy()
    
    train_1['order_ts'] = train_1['order_date'].astype(np.int64)//1e9
    train_1['order_ts_diff'] = train_1.groupby(['customer_id'])['order_ts'].diff()
    test_2['order_ts'] = test_2['order_date'].astype(np.int64)//1e9
    test_2['order_ts_diff'] = test_2.groupby(['customer_id'])['order_ts'].diff()
    
    # train, test 레이블 데이터 생성
    train_label = generate_label(df, prev_ym)[
        ['customer_id', 'year_month', 'label']]
    test_label = generate_label(df, year_month)[
        ['customer_id', 'year_month', 'label']]

    # group by aggregation 함수 선언
    agg_func = ['mean', 'max', 'min', 'sum', 'count', 'std', 'skew']
    agg_dict = {
        'order_ts': ['first', 'last'],
        'order_ts_diff': agg_func,
    }

    train_agg_1 = train_1.groupby(['customer_id']).agg(agg_dict)

    new_cols_1 = []
    for col in agg_dict.keys():
        for stat in agg_dict[col]:
            new_cols_1.append(f'{col}-{stat}')
    train_agg_1.columns = new_cols_1

    train_1 = train_label.merge(train_agg_1, on=['customer_id'], how='left')

    all_train_data = pd.DataFrame()

    for i, tr_ym in enumerate(train_label['year_month'].unique()):
        # group by aggretation 함수로 train 데이터 피처 생성
        train_agg_2 = train_2.loc[train_2['order_date'] < tr_ym].groupby(
            ['customer_id']).agg(agg_func)

        # 멀티 레벨 컬럼을 사용하기 쉽게 1 레벨 컬럼명으로 변경
        new_cols_2 = []
        for col in train_agg_2.columns.levels[0]:
            for stat in train_agg_2.columns.levels[1]:
                new_cols_2.append(f'{col}-{stat}')

        train_agg_2.columns = new_cols_2
        train_agg_2.reset_index(inplace=True)

        train_agg_2['year_month'] = tr_ym

        all_train_data = all_train_data.append(train_agg_2)

    train_2 = train_label.merge(
        all_train_data, on=['customer_id', 'year_month'], how='left')
    
    train_2.drop(['label', 'year_month'], axis=1, inplace=True)
    train_2 = train_2.merge(train_1, on=['customer_id'])

    features = train_2.drop(
        columns=['customer_id', 'label', 'year_month']).columns

    # group by aggretation 함수로 test 데이터 피처 생성
    test_agg_1 = test_1.groupby(['customer_id']).agg(agg_func)
    test_agg_1.columns = new_cols_2

    test_agg_2 = test_2.groupby(['customer_id']).agg(agg_dict)
    test_agg_2.columns = new_cols_1

    test_1 = test_label.merge(test_agg_1, on=['customer_id'], how='left')
    test_2 = test_label.merge(test_agg_2, on=['customer_id'], how='left')
    test_1.drop(['label', 'year_month'], axis=1, inplace=True)
    test_2 = test_2.merge(test_1, on=['customer_id'])

    train_3, test_3 = add_trend(train_2, test_2, year_month)
    train_2, test_2 = add_tier(train_2, test_2)

    train_2 = pd.merge(train_3, on=['customer_id', 'label', 'year_month'], how="right")
    test_2 = pd.merge(test_3, on=['customer_id', 'label', 'year_month'], how="right")
    features.append(pd.Index(['tier']))
    features.append(train_3.drop(columns=['customer_id', 'label', 'year_month']).columns)
    # train, test 데이터 전처리
    x_tr, x_te = feature_preprocessing(train_2, test_2, features)

    print('x_tr.shape', x_tr.shape, ', x_te.shape', x_te.shape)

    return x_tr, x_te, train_2['label'], features

In [56]:
year_month = '2011-11'
train_df = train_df.copy()

# year_month 이전 월 계산
d = datetime.datetime.strptime(year_month, "%Y-%m")
prev_ym = d - dateutil.relativedelta.relativedelta(months=1)
prev_ym = prev_ym.strftime('%Y-%m')

# train, test 데이터 선택
train_1 = train_df[train_df['order_date'] < prev_ym]
train_2 = train_1.copy()
train_1['order_ts'] = train_1['order_date'].astype(np.int64)//1e9
train_1['order_ts_diff'] = train_1.groupby(['customer_id'])['order_ts'].diff()

# train_1, test 레이블 데이터 생성
train_label = generate_label(train_df, prev_ym)[
    ['customer_id', 'year_month', 'label']]

agg_func = ['mean', 'max', 'min', 'sum', 'count', 'std', 'skew']
agg_dict = {
    'order_ts': ['first', 'last'],
    'order_ts_diff': agg_func,
}

train_agg_1 = train_1.groupby(['customer_id']).agg(agg_dict)

new_cols = []
for col in agg_dict.keys():
    for stat in agg_dict[col]:
        new_cols.append(f'{col}-{stat}')
train_agg_1.columns = new_cols

train_1 = train_label.merge(train_agg_1, on=['customer_id'], how='left')

# group by aggregation 함수 선언
# agg_func = ['mean', 'max', 'min', 'sum', 'count', 'std', 'skew']
train_data_2 = pd.DataFrame()

for i, tr_ym in enumerate(train_label['year_month'].unique()):
    # group by aggretation 함수로 train 데이터 피처 생성
    train_agg_2 = train_2.loc[train_2['order_date'] < tr_ym].groupby(
        ['customer_id']).agg(agg_func)

    # 멀티 레벨 컬럼을 사용하기 쉽게 1 레벨 컬럼명으로 변경
    new_cols = []
    for col in train_agg_2.columns.levels[0]:
        for stat in train_agg_2.columns.levels[1]:
            new_cols.append(f'{col}-{stat}')

    train_agg_2.columns = new_cols
    train_agg_2.reset_index(inplace=True)

    train_agg_2['year_month'] = tr_ym

    train_data_2 = train_data_2.append(train_agg_2)

train_data_2 = train_label.merge(
    train_data_2, on=['customer_id', 'year_month'], how='left')
train_data_2.drop(['label', 'year_month'], axis=1, inplace=True)
print(train_1.head(5))
print('\n')
print(train_data_2.head(5))
print('\n')
train_data_2 = train_data_2.merge(train_1, on=['customer_id'])
print(train_data_2.head(15))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == '':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]
    customer_id year_month      total
0         12347    2011-10  2135.6280
1         12359    2011-10  4643.5950
2         12360    2011-10  1722.2370
3         12362    2011-10  2709.1515
4         12364    2011-10   493.4490
5         12370    2011-10  1219.8120
6         12371    2011-10  3115.1340
7         12380    2011-10   736.9065
8         12381    2011-10    17.3250
9         12394    2011-10  1470.8100
10        1239

In [45]:
features

NameError: name 'features' is not defined

In [22]:
def extract_char(str_list:set) -> set:
    """
        :str_list: set (숫자/숫자+문자가 섞인 문자열(주로 id집합))
        : return :set (숫자 + 문자 문자열에서 문자열 추출)
    """
    char_set = set([])
    for word in str_list:
        str = ""
        for idx in range(len(word)):
            if not word[idx].isdigit():
                str = str + word[idx]
            else:
                str = str + "_"
        if str not in char_set:
            char_set.add(str)
    return char_set

In [24]:
# 한 줄에 출력할 개수 = 5개로 지정
def print_element(myset : set, term=5):
    for idx, pid in enumerate(myset) : 
        if ((idx+1) % term == 0) :
            print(pid)
        else :
            print(pid, end = "\t")

In [26]:
# order_id에서 문자가 포함된 id 추출
order_id_list = train_df['order_id'].tolist()
order_id_char_list = [id for id in order_id_list if not id.isdigit()]
# 문자 포함된 id 에 어떤 글자가 포함되어 있는 지 확인
order_id_char_set = set(order_id_char_list)
char_set = extract_char(order_id_char_set)  #함수 --> id에 문자가 포함된 경우 패턴 파악

print("number of order_id included char =", len(order_id_char_list))
print("number of order_id included char(no duplicated) =", len(order_id_char_set))
#문자확인 -> C 밖에 없다.
print("char_set = ", char_set)

number of order_id included char = 18034
number of order_id included char(no duplicated) = 7758
char_set =  {'C______'}


In [41]:
# product_id에서 문자가 포함된 id 추출
pid_list = train_df['product_id'].tolist()
pid_char_list = [id for id in pid_list if not id.isdigit()]

# 문자 포함된 id 에 어떤 글자가 포함되어 있는 지 확인
pid_char_set = set(pid_char_list)
pid_char_patterns = extract_char(pid_char_set)  #함수 --> id에 문자가 포함된 경우 패턴 파악

print("number of pid included char =", len(pid_char_list))
print("number of pid included char(no duplicated) =", len(pid_char_set))
print("len(pid_patterns) :", len(pid_char_patterns))
print()
#문자확인(5개씩 끊어서 출력)
print_element(pid_char_patterns)

number of pid included char = 90581
number of pid included char(no duplicated) = 1315
len(pid_patterns) : 39

_____K	_____I	PADS	TEST___	_____BL
SP____	_____M	_____S	_____GR	ADJUST
_____T	_____R	_____L	_____H	_____A
POST	_____O	DOT	_____P	CRUK
M	_____N	_____B	_____Y	_____E
_____C	_____W	_____V	C_	_____Z
_____U	_____J	BANK CHARGES	ADJUST_	_____G
D	_____D	_____F	_____LP	

In [29]:
train_df.head(5)

Unnamed: 0,order_id,product_id,description,quantity,order_date,price,customer_id,country,total
0,489434,85048,15CM CHRISTMAS GLASS BALL 20 LIGHTS,12,2009-12-01 07:45:00,11.4675,13085,United Kingdom,137.61
1,489434,79323P,PINK CHERRY LIGHTS,12,2009-12-01 07:45:00,11.1375,13085,United Kingdom,133.65
2,489434,79323W,WHITE CHERRY LIGHTS,12,2009-12-01 07:45:00,11.1375,13085,United Kingdom,133.65
3,489434,22041,"RECORD FRAME 7"" SINGLE SIZE",48,2009-12-01 07:45:00,3.465,13085,United Kingdom,166.32
4,489434,21232,STRAWBERRY CERAMIC TRINKET BOX,24,2009-12-01 07:45:00,2.0625,13085,United Kingdom,49.5


### hypothesis - Alphabet behind product_id is color.

>memo for hypo

    > one description has many product id
    > two or more discription has one common product id

In [60]:
# print product_id and description
# train_df.loc[:5, ['product_id', 'description']]

pid_with_char = []
pid_indices = []

pid_list = train_df['product_id'].tolist()
pid_char_list = [pid for pid in pid_list if not pid.isdigit()]

reg = re.compile(r'[0-9]+')
print(reg.sub('', '79323P'))

for i, pid_char in enumerate(pid_char_list):
    pid_char_list[i] = reg.sub('', pid_char)
    
data_copy = train_df.copy()
data_copy['product_id', '']
# pid_suffices = [pid for pid in pid_char_list if reg.match(pid)]
        
# train_df.loc[train_df['product_id'].isin(pid_char_list), ['product_id', 'description']]

P


'P'

In [32]:
train_df.loc['product']    

KeyError: 'product'