In [1]:
import pandas as pd
import numpy as np


from sklearn.model_selection import ParameterGrid
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer

from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor

In [2]:
train = pd.read_csv('train.tsv',delimiter='\t')
train.head()

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description
0,0,MLB Cincinnati Reds T Shirt Size XL,3,Men/Tops/T-shirts,,10.0,1,No description yet
1,1,Razer BlackWidow Chroma Keyboard,3,Electronics/Computers & Tablets/Components & P...,Razer,52.0,0,This keyboard is in great condition and works ...
2,2,AVA-VIV Blouse,1,Women/Tops & Blouses/Blouse,Target,10.0,1,Adorable top with a hint of lace and a key hol...
3,3,Leather Horse Statues,1,Home/Home Décor/Home Décor Accents,,35.0,1,New with tags. Leather horses. Retail for [rm]...
4,4,24K GOLD plated rose,1,Women/Jewelry/Necklaces,,44.0,0,Complete with certificate of authenticity


In [3]:
test = pd.read_csv('test.tsv',delimiter='\t')
test.head()

Unnamed: 0,test_id,name,item_condition_id,category_name,brand_name,shipping,item_description
0,0,"Breast cancer ""I fight like a girl"" ring",1,Women/Jewelry/Rings,,1,Size 7
1,1,"25 pcs NEW 7.5""x12"" Kraft Bubble Mailers",1,Other/Office supplies/Shipping Supplies,,1,"25 pcs NEW 7.5""x12"" Kraft Bubble Mailers Lined..."
2,2,Coach bag,1,Vintage & Collectibles/Bags and Purses/Handbag,Coach,1,Brand new coach bag. Bought for [rm] at a Coac...
3,3,Floral Kimono,2,Women/Sweaters/Cardigan,,0,-floral kimono -never worn -lightweight and pe...
4,4,Life after Death,3,Other/Books/Religion & Spirituality,,1,Rediscovering life after the loss of a loved o...


In [4]:
train.shape

(1482535, 8)

In [5]:
test.shape

(693359, 7)

In [6]:
train.tail()

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description
1482530,1482530,Free People Inspired Dress,2,Women/Dresses/Mid-Calf,Free People,20.0,1,"Lace, says size small but fits medium perfectl..."
1482531,1482531,Little mermaid handmade dress,2,Kids/Girls 2T-5T/Dresses,Disney,14.0,0,Little mermaid handmade dress never worn size 2t
1482532,1482532,21 day fix containers and eating plan,2,Sports & Outdoors/Exercise/Fitness accessories,,12.0,0,"Used once or twice, still in great shape."
1482533,1482533,World markets lanterns,3,Home/Home Décor/Home Décor Accents,,45.0,1,There is 2 of each one that you see! So 2 red ...
1482534,1482534,Brand new lux de ville wallet,1,Women/Women's Accessories/Wallets,,22.0,0,"New with tag, red with sparkle. Firm price, no..."


In [7]:
test.tail()

Unnamed: 0,test_id,name,item_condition_id,category_name,brand_name,shipping,item_description
693354,693354,Quartz crystal on Flint stone,1,Home/Home Décor/Home Décor Accents,,0,Flint/Quartz cluster. Self mined ✨measures 3x2...
693355,693355,It Cosmetics - Travel Bundle,1,Beauty/Makeup/Makeup Sets,IT Cosmetics,1,It Cosmetics travel bundle. Includes: Brow pow...
693356,693356,Galaxy S8 hard shell case,1,"Electronics/Cell Phones & Accessories/Cases, C...",,1,New. Free shipping Basstop case
693357,693357,Hi low floral kimono,2,Women/Swimwear/Cover-Ups,,0,Floral kimono. Tropical print. Open front. Hi ...
693358,693358,"FREESHIP 2 Floral Scrub Tops, medium.",2,Women/Tops & Blouses/T-Shirts,,1,2 Floral scrub tops. Worn less than 5 times ea...


In [8]:
train.isnull().sum()

train_id                  0
name                      0
item_condition_id         0
category_name          6327
brand_name           632682
price                     0
shipping                  0
item_description          4
dtype: int64

In [9]:
test.isnull().sum()

test_id                   0
name                      0
item_condition_id         0
category_name          3058
brand_name           295525
shipping                  0
item_description          0
dtype: int64

In [10]:
train.set_index('train_id',inplace=True)
test.set_index('test_id',inplace=True)

In [11]:
train.fillna('None',inplace=True)
test.fillna('None',inplace=True)

### Handling Brands

In [12]:
brand_train = train['brand_name'].unique().tolist()
brand_test = test['brand_name'].unique().tolist()

In [13]:
nonbrand = list()
for val in brand_test:
    if val not in brand_train:
        nonbrand.append(val)

In [14]:
for val in nonbrand:
    brand_test.remove(val)

In [15]:
train['brand_mean_price'] = 0.0
train['brand_std_dev'] = 0.0

In [16]:
for val in brand_train:
    temp = train[train['brand_name']==val]
    avg = temp['price'].mean()
    std = temp['price'].std()
    for i,row in temp.iterrows():
        train.at[i,'brand_mean_price'] = avg
        train.at[i,'brand_std_dev'] = std

In [17]:
test['brand_mean_price'] = 0.0
test['brand_std_dev'] = 0.0

In [18]:
for val in brand_test:
    temp = test[test['brand_name']==val]
    data = train[train['brand_name']==val]
    avg = data.iloc[0]['brand_mean_price']
    std = data.iloc[0]['brand_std_dev']
    for i,row in temp.iterrows():
        test.at[i,'brand_mean_price'] = avg
        test.at[i,'brand_std_dev'] = std

In [19]:
train.fillna(0.0,inplace=True)

In [20]:
test.fillna(0.0,inplace=True)

In [21]:
train['branded'] = 1

In [22]:
for i,row in train.iterrows():
    if row['brand_name'] == 'None':
        train.at[i,'branded'] = 0

In [23]:
test['branded'] = 1

In [24]:
for i,row in test.iterrows():
    if row['brand_name'] == 'None':
        test.at[i,'branded'] = 0

In [25]:
train['upper_price_limit'] = train['brand_mean_price'] + train['brand_std_dev']
test['upper_price_limit'] = test['brand_mean_price'] + test['brand_std_dev']

In [26]:
train['lower_price_limit'] = abs(train['brand_mean_price'] - train['brand_std_dev'])
test['lower_price_limit'] = abs(test['brand_mean_price'] - test['brand_std_dev'])

In [27]:
train_list = train.columns.tolist()
train_list

['name',
 'item_condition_id',
 'category_name',
 'brand_name',
 'price',
 'shipping',
 'item_description',
 'brand_mean_price',
 'brand_std_dev',
 'branded',
 'upper_price_limit',
 'lower_price_limit']

In [28]:
train_price = train[['item_condition_id','price','shipping','brand_mean_price','brand_std_dev','branded','upper_price_limit','lower_price_limit']]

In [29]:
test_list = test.columns.tolist()
test_list

['name',
 'item_condition_id',
 'category_name',
 'brand_name',
 'shipping',
 'item_description',
 'brand_mean_price',
 'brand_std_dev',
 'branded',
 'upper_price_limit',
 'lower_price_limit']

In [30]:
test_price = test[['item_condition_id','shipping','brand_mean_price','brand_std_dev','branded','upper_price_limit','lower_price_limit']]

In [31]:
X_cols = train_price.columns.tolist()
X_cols.remove('price')
X = train_price[X_cols]
y = train_price['price']

In [32]:
rfr_s = RandomForestRegressor(min_samples_leaf=50, n_estimators=50, n_jobs=-1)
rfr_s.fit(X,y)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=50, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=-1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [33]:
sub_f = pd.read_csv('sample_submission.csv')

In [34]:
sub_f['price'] = rfr_s.predict(test_price)

In [35]:
sub_f.to_csv('rf_tp_v1.csv', index=False)

item_condition_id    0
price                0
shipping             0
brand_mean_price     0
brand_std_dev        0
branded              0
upper_price_limit    0
lower_price_limit    0
dtype: int64