Let's start with loading the training and test data:

In [1]:
import os
import pandas as pd
import numpy as np

In [3]:
# Show the contents of the current working directory (train.csv is read from here next).
os.listdir()

['.DS_Store',
 '.git',
 '.ipynb_checkpoints',
 '.Rapp.history',
 '.RDataTmp',
 '.Rhistory',
 'GBM.rds',
 'GBM.stack.rds',
 'Kaggle Mercari Price Suggestion Challenge IPython Kernel.ipynb',
 'KaggleMercariPriceSuggestionChallengeKernel.Rmd',
 'Mercari Price Suggestion Challenge | Kaggle.pdf',
 'merged_table.rds',
 'mini_subtrain.rds',
 'mini_subtrain_locked.rds',
 'mini_train.rds',
 'new_stack_set.rds',
 'new_stack_set2.rds',
 'RF.BC.rds',
 'RF.rds',
 'RF.stack.rds',
 'RF.stack2nd.rds',
 'sample_submission.csv',
 'sample_submission.csv.7z',
 'second.stack.set.rds',
 'subtrain.rds',
 'subtrain_locked.rds',
 'SVM.rds',
 'SVM.stack.rds',
 'test.csv',
 'test.rds',
 'test.tsv.7z',
 'test_locked.rds',
 'train.csv',
 'train.rds',
 'train.tsv.7z',
 'v.txt',
 'validation.rds',
 'validation_locked.rds',
 'xGBM.BC.rds',
 'xGBM.BC2.rds',
 'xGBM.rds',
 'xGBM.stack.rds',
 'xGBM.stack2nd.rds']

In [4]:
# Read train.csv from the working directory into a DataFrame.
train = pd.read_csv("train.csv")

In [5]:
# Preview the first rows to see the available columns and sample values.
train.head()

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description
0,0,MLB Cincinnati Reds T Shirt Size XL,3,Men/Tops/T-shirts,,10.0,1,No description yet
1,1,Razer BlackWidow Chroma Keyboard,3,Electronics/Computers & Tablets/Components & P...,Razer,52.0,0,This keyboard is in great condition and works ...
2,2,AVA-VIV Blouse,1,Women/Tops & Blouses/Blouse,Target,10.0,1,Adorable top with a hint of lace and a key hol...
3,3,Leather Horse Statues,1,Home/Home Décor/Home Décor Accents,,35.0,1,New with tags. Leather horses. Retail for [rm]...
4,4,24K GOLD plated rose,1,Women/Jewelry/Necklaces,,44.0,0,Complete with certificate of authenticity


In [6]:
# Summarize column dtypes, non-null counts and memory usage.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1482535 entries, 0 to 1482534
Data columns (total 8 columns):
train_id             1482535 non-null int64
name                 1482535 non-null object
item_condition_id    1482535 non-null int64
category_name        1476208 non-null object
brand_name           849853 non-null object
price                1482535 non-null float64
shipping             1482535 non-null int64
item_description     1482531 non-null object
dtypes: float64(1), int64(3), object(4)
memory usage: 90.5+ MB


First, let's remove the train_id column from the training set and identify which variables are numeric and which are text to begin with:

In [10]:
# Drop the train_id identifier column.
# errors="ignore" makes this cell safe to re-run: once the column is gone a
# plain drop would raise a KeyError, which is easy to hit when cells are
# executed more than once or out of order.
train = train.drop("train_id", axis=1, errors="ignore")
train.head()

Unnamed: 0,name,item_condition_id,category_name,brand_name,price,shipping,item_description
0,MLB Cincinnati Reds T Shirt Size XL,3,Men/Tops/T-shirts,,10.0,1,No description yet
1,Razer BlackWidow Chroma Keyboard,3,Electronics/Computers & Tablets/Components & P...,Razer,52.0,0,This keyboard is in great condition and works ...
2,AVA-VIV Blouse,1,Women/Tops & Blouses/Blouse,Target,10.0,1,Adorable top with a hint of lace and a key hol...
3,Leather Horse Statues,1,Home/Home Décor/Home Décor Accents,,35.0,1,New with tags. Leather horses. Retail for [rm]...
4,24K GOLD plated rose,1,Women/Jewelry/Necklaces,,44.0,0,Complete with certificate of authenticity


In [11]:
# Re-check the schema now that train_id has been dropped.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1482535 entries, 0 to 1482534
Data columns (total 7 columns):
name                 1482535 non-null object
item_condition_id    1482535 non-null int64
category_name        1476208 non-null object
brand_name           849853 non-null object
price                1482535 non-null float64
shipping             1482535 non-null int64
item_description     1482531 non-null object
dtypes: float64(1), int64(2), object(4)
memory usage: 79.2+ MB


In [59]:
# Boolean mask, in the same order as train.columns, that is True for every
# column stored as int64 or float64.
Numeric_features_select = [
    col_dtype == "int64" or col_dtype == "float64"
    for col_dtype in train.dtypes
]

In [67]:
# Names of the columns flagged as numeric by the mask.
Numeric_features = train.columns[Numeric_features_select].tolist()

In [68]:
# Display the numeric column names.
Numeric_features

['item_condition_id', 'price', 'shipping']

In [71]:
# Everything that is not numeric is treated as a text column: invert the mask
# and use it to pick the remaining column names.
text_mask = [not is_numeric for is_numeric in Numeric_features_select]
Text_features = train.columns[text_mask].tolist()

In [72]:
# Display the non-numeric (text) column names.
Text_features

['name', 'category_name', 'brand_name', 'item_description']

Since the model evaluation will be based on the log of the target variable price, we convert it to log(price + 1) at this stage:

In [73]:
# Replace price with log(1 + price). np.log1p is NumPy's dedicated routine for
# this transform and stays accurate for values close to zero; it already
# returns a Series here, so no pd.Series(...) wrapper is needed.
# Bracket assignment is used instead of attribute assignment so the target is
# unambiguously the "price" column.
# NOTE: the column is overwritten, so running this cell twice applies the
# transform twice -- reload `train` before re-executing it.
train["price"] = np.log1p(train["price"])

In [74]:
# Preview the frame again; price now holds the log-transformed values.
train.head()

Unnamed: 0,name,item_condition_id,category_name,brand_name,price,shipping,item_description
0,MLB Cincinnati Reds T Shirt Size XL,3,Men/Tops/T-shirts,,2.397895,1,No description yet
1,Razer BlackWidow Chroma Keyboard,3,Electronics/Computers & Tablets/Components & P...,Razer,3.970292,0,This keyboard is in great condition and works ...
2,AVA-VIV Blouse,1,Women/Tops & Blouses/Blouse,Target,2.397895,1,Adorable top with a hint of lace and a key hol...
3,Leather Horse Statues,1,Home/Home Décor/Home Décor Accents,,3.583519,1,New with tags. Leather horses. Retail for [rm]...
4,24K GOLD plated rose,1,Women/Jewelry/Necklaces,,3.806662,0,Complete with certificate of authenticity


Now it is time to split the training data set into training and holdout sets in order to start building our pipeline:

In [76]:
from sklearn.model_selection import train_test_split

# Target: the (log-transformed) price column.
y = train["price"]
# Features: every remaining column.
X = train.drop(labels=["price"], axis=1)

In [91]:
# Hold out 40% of the rows; the fixed random_state makes the split repeatable.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=425,
)

In [84]:
# Check the container type the split returned for the features.
type(X_train)

pandas.core.frame.DataFrame

In [85]:
# Check the container type the split returned for the target.
type(y_holdout)

pandas.core.series.Series

In [92]:
# Rows and columns in the training features.
X_train.shape

(889521, 6)

In [93]:
# Rows and columns in the holdout features.
X_holdout.shape

(593014, 6)

In [94]:
# Length of the training target; compare against the row count of X_train.
y_train.shape

(889521,)

In [95]:
# Length of the holdout target; compare against the row count of X_holdout.
y_holdout.shape

(593014,)

At this stage, let's lock down and save the datasets on which we will train and validate our model:

In [None]:
X_train.to_csv("X_train.csv",index=False)