In [1]:
import numpy as np
import pandas as pd 
import os
import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns

In [2]:
combined_data_fname = '../../original_data/Revised By ASIN-Datail Page Sales and Traffic by Parent Item.csv'
df_asin = pd.read_csv(combined_data_fname)
df_asin.head()

Unnamed: 0,(Parent) ASIN,Title,Sessions,Session Percentage,Page Views,Page Views Percentage,Buy Box Percentage,Units Ordered,Unit Session Percentage,Ordered Product Sales,Total Order Items
0,B00CWU48VI,Isolation Vibration m6 Rubber Mount Replacemen...,764,0.16%,1164,0.18%,83%,177,23.17%,"$1,514.48",149
1,B00CWU4DTA,Isolation Vibration m6 Rubber Mount Replacemen...,973,0.20%,1585,0.25%,44%,70,7.19%,$261.89,62
2,B005B00EVY,HPS HTHH-062-BLK Silicone High Temperature Rei...,3038,0.64%,4166,0.66%,15%,47,1.55%,$515.95,33
3,B004NYCI2C,HPS HTSC-200-L4-BLK Silicone High Temperature ...,2351,0.49%,3418,0.54%,9%,58,2.47%,$648.23,30
4,B00ATI059E,"Black 3"" Rubber Coupler hose for shortram cold...",424,0.09%,604,0.10%,88%,51,12.03%,$134.41,28


#### 'B00CWU48VI' is the product that sells the most. So let us explore the sales over time. Load in the monthly data to extract features related to this product. 

#### We want to split the data in half for traning and validation 

In [3]:
path = '../../original_data/monthly/'
file_lists = os.listdir(path)
file_lists.sort() # sort the files by date. 

In [4]:
frames1 = [] # before 08/2019 (price change)
frames2 = [] # before 08/2019 (price change)


for i,fname in enumerate(file_lists):

    tmp_df = pd.read_csv(path + fname)

    year, month = fname[:4],fname[4:6]
    tmp_df['date'] = year + '-' + month
    tmp_df['time_index'] = i
    if i <15: 
        frames1.append(tmp_df)
    else:
        frames2.append(tmp_df)
        
df1 = pd.concat(frames1)
df2 = pd.concat(frames2)

In [5]:
df1.head()

Unnamed: 0,(Parent) ASIN,Title,Sessions,Session Percentage,Page Views,Page Views Percentage,Buy Box Percentage,Units Ordered,Unit Session Percentage,Ordered Product Sales,Total Order Items,date,time_index
0,B00CWU48VI,Isolation Vibration m6 Rubber Mount Replacemen...,37,0.12%,49,0.14%,82%,11,29.73%,$91.15,9,2018-06,0
1,B004JYVMQE,00-06 Chevy Avalanche 5.3L V8 Air Intake Kit +...,88,0.30%,111,0.31%,100%,4,4.55%,$376.60,4,2018-06,0
2,B00AAL1GRU,,60,0.20%,71,0.20%,100%,4,6.67%,$92.08,4,2018-06,0
3,B006856A6E,,42,0.14%,52,0.15%,98%,3,7.14%,$286.35,3,2018-06,0
4,B006ZA9RU8,04-05 MITSUBISHI Lancer Ralliart Style Front G...,52,0.17%,60,0.17%,88%,3,5.77%,$77.55,3,2018-06,0


In [6]:
df2.head()

Unnamed: 0,(Parent) ASIN,Title,Sessions,Session Percentage,Page Views,Page Views Percentage,Buy Box Percentage,Units Ordered,Unit Session Percentage,Ordered Product Sales,Total Order Items,date,time_index
0,B00CWU48VI,Isolation Vibration m6 Rubber Mount Replacemen...,20,0.15%,40,0.21%,88%,16,80.00%,$140.70,6,2019-09,15
1,B005B00F4U,HPS HTHH-075-BLK Silicone High Temperature Rei...,167,1.23%,221,1.17%,19%,6,3.59%,$81.00,3,2019-09,15
2,B00AYIH63W,AEM High Flow In-Tank Fuel Pump 320 lph for 89...,19,0.14%,25,0.13%,48%,3,15.79%,$306.10,3,2019-09,15
3,B00CXJN28I,,16,0.12%,29,0.15%,86%,2,12.50%,$198.00,2,2019-09,15
4,B00VWY4V08,HPS 27-514BL-2 Blue Short Ram Air Intake Kit (...,3,0.02%,3,0.02%,100%,2,66.67%,$336.89,2,2019-09,15


### Select the top product from both dataframe

In [7]:
df_prod1 = df1.loc[df1['(Parent) ASIN'] == 'B00CWU48VI']
df_prod2 = df2.loc[df2['(Parent) ASIN'] == 'B00CWU48VI']
del df_prod1['Title']
del df_prod2['Title']
df_prod1

# to-do: simply add date to a column. 

Unnamed: 0,(Parent) ASIN,Sessions,Session Percentage,Page Views,Page Views Percentage,Buy Box Percentage,Units Ordered,Unit Session Percentage,Ordered Product Sales,Total Order Items,date,time_index
0,B00CWU48VI,37,0.12%,49,0.14%,82%,11,29.73%,$91.15,9,2018-06,0
0,B00CWU48VI,50,0.14%,63,0.15%,87%,11,22.00%,$94.43,7,2018-07,1
0,B00CWU48VI,69,0.17%,94,0.19%,95%,13,18.84%,$108.20,11,2018-08,2
1,B00CWU48VI,32,0.11%,44,0.12%,89%,6,18.75%,$51.15,5,2018-09,3
0,B00CWU48VI,24,0.09%,39,0.11%,97%,9,37.50%,$74.85,9,2018-10,4
1,B00CWU48VI,30,0.12%,47,0.15%,79%,5,16.67%,$42.98,5,2018-11,5
24,B00CWU48VI,31,0.14%,45,0.15%,31%,1,3.23%,$8.15,1,2018-12,6
0,B00CWU48VI,42,0.20%,71,0.25%,61%,12,28.57%,$101.62,12,2019-01,7
3,B00CWU48VI,26,0.14%,34,0.14%,94%,3,11.54%,$25.20,3,2019-02,8
0,B00CWU48VI,34,0.16%,56,0.19%,89%,8,23.53%,$66.47,8,2019-03,9


#### Clean up data and output to csv 

In [8]:
def clean_df(df):
    """
    clean up columns; convert to float. 
    """
    #fname = product + '.csv'
    #df = pd.read_csv(fname)
    #del df['Unnamed: 0']
    #del df['Total Order Items']
    #del df['Ordered Product Sales']
    
    df['Sessions'] = df['Sessions'].astype('float')
    df['Session Percentage'] = df['Session Percentage'].str.rstrip('%').astype('float') 

    df['Page Views'] = df['Page Views'].astype('float')
    df['Page Views Percentage'] = df['Page Views Percentage'].str.rstrip('%').astype('float') 

    df['Buy Box Percentage'] = df['Buy Box Percentage'].str.rstrip('%').astype('float')

    df['Units Ordered']= df['Units Ordered'].astype('float')
    df['Unit Session Percentage'] = df['Unit Session Percentage'].str.rstrip('%').astype('float')

    df['Ordered Product Sales'] = df['Ordered Product Sales'].str.replace('$','').astype('float')
    df['Total Order Items'] = df['Total Order Items'].astype('float')
    
    df['time_index'] = df['time_index'].astype('float')
    return df

In [9]:
df_prod1_clean = clean_df(df_prod1)
df_prod2_clean = clean_df(df_prod2)
df_prod1.head()
df_prod2.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == '':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

Se

Unnamed: 0,(Parent) ASIN,Sessions,Session Percentage,Page Views,Page Views Percentage,Buy Box Percentage,Units Ordered,Unit Session Percentage,Ordered Product Sales,Total Order Items,date,time_index
0,B00CWU48VI,20.0,0.15,40.0,0.21,88.0,16.0,80.0,140.7,6.0,2019-09,15.0
0,B00CWU48VI,30.0,0.22,45.0,0.24,84.0,12.0,40.0,104.61,10.0,2019-10,16.0
0,B00CWU48VI,26.0,0.17,36.0,0.18,86.0,7.0,26.92,59.57,7.0,2019-11,17.0
2,B00CWU48VI,19.0,0.15,27.0,0.16,93.0,3.0,15.79,25.29,2.0,2019-12,18.0
0,B00CWU48VI,31.0,0.23,38.0,0.21,97.0,7.0,22.58,60.73,6.0,2020-01,19.0


### Save to file

In [10]:
df_prod1.to_csv('../../cleaned_data/B00CWU48VI_y1.csv')
df_prod2.to_csv('../../cleaned_data/B00CWU48VI_y2.csv')

In [11]:
df_prod1

Unnamed: 0,(Parent) ASIN,Sessions,Session Percentage,Page Views,Page Views Percentage,Buy Box Percentage,Units Ordered,Unit Session Percentage,Ordered Product Sales,Total Order Items,date,time_index
0,B00CWU48VI,37.0,0.12,49.0,0.14,82.0,11.0,29.73,91.15,9.0,2018-06,0.0
0,B00CWU48VI,50.0,0.14,63.0,0.15,87.0,11.0,22.0,94.43,7.0,2018-07,1.0
0,B00CWU48VI,69.0,0.17,94.0,0.19,95.0,13.0,18.84,108.2,11.0,2018-08,2.0
1,B00CWU48VI,32.0,0.11,44.0,0.12,89.0,6.0,18.75,51.15,5.0,2018-09,3.0
0,B00CWU48VI,24.0,0.09,39.0,0.11,97.0,9.0,37.5,74.85,9.0,2018-10,4.0
1,B00CWU48VI,30.0,0.12,47.0,0.15,79.0,5.0,16.67,42.98,5.0,2018-11,5.0
24,B00CWU48VI,31.0,0.14,45.0,0.15,31.0,1.0,3.23,8.15,1.0,2018-12,6.0
0,B00CWU48VI,42.0,0.2,71.0,0.25,61.0,12.0,28.57,101.62,12.0,2019-01,7.0
3,B00CWU48VI,26.0,0.14,34.0,0.14,94.0,3.0,11.54,25.2,3.0,2019-02,8.0
0,B00CWU48VI,34.0,0.16,56.0,0.19,89.0,8.0,23.53,66.47,8.0,2019-03,9.0
