## Loading the standard libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Suppress every warning so that cell output stays compact.
# NOTE(review): this also hides deprecation and convergence warnings —
# consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

## Loading the data

In [4]:
# Read only the first 5,000 rows of the tab-separated training file
# so the rest of the notebook runs quickly.
data = pd.read_csv(
    'train.tsv',
    sep='\t',
    nrows=5000,
)
data.head()

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description
0,0,MLB Cincinnati Reds T Shirt Size XL,3,Men/Tops/T-shirts,,10.0,1,No description yet
1,1,Razer BlackWidow Chroma Keyboard,3,Electronics/Computers & Tablets/Components & P...,Razer,52.0,0,This keyboard is in great condition and works ...
2,2,AVA-VIV Blouse,1,Women/Tops & Blouses/Blouse,Target,10.0,1,Adorable top with a hint of lace and a key hol...
3,3,Leather Horse Statues,1,Home/Home Décor/Home Décor Accents,,35.0,1,New with tags. Leather horses. Retail for [rm]...
4,4,24K GOLD plated rose,1,Women/Jewelry/Necklaces,,44.0,0,Complete with certificate of authenticity


In [5]:
# (number of rows, number of columns) of the loaded sample
data.shape

(5000, 8)

## Analysis Outline

1. The data has numerical columns – numerical preprocessing (missing-value treatment, scaling)
2. The data also has categorical columns – categorical preprocessing (missing-value treatment, encoding)
3. The data also has text columns – text preprocessing (CountVectorizer)

#### Instead of building and applying separate pipelines for the numerical, categorical and text columns one by one, combine them in a single ColumnTransformer

### Apply ML on train and test

- Price Prediction  - LinearRegression, KNeighborsRegressor

#### To decide which ML model works best on this data, we can use GridSearchCV for model selection

In [6]:
# Drop the row identifier and the long free-text description; neither is
# part of the feature matrix built below.
# Reassigning (rather than inplace=True) together with errors='ignore' keeps
# this cell safe to re-run: a second execution is a no-op instead of a KeyError.
data = data.drop(columns=['train_id', 'item_description'], errors='ignore')
data.head()

Unnamed: 0,name,item_condition_id,category_name,brand_name,price,shipping
0,MLB Cincinnati Reds T Shirt Size XL,3,Men/Tops/T-shirts,,10.0,1
1,Razer BlackWidow Chroma Keyboard,3,Electronics/Computers & Tablets/Components & P...,Razer,52.0,0
2,AVA-VIV Blouse,1,Women/Tops & Blouses/Blouse,Target,10.0,1
3,Leather Horse Statues,1,Home/Home Décor/Home Décor Accents,,35.0,1
4,24K GOLD plated rose,1,Women/Jewelry/Necklaces,,44.0,0


## Separate X and y

In [7]:
# Target: the listing price. Features: every remaining column.
y = data['price']
X = data.drop(columns=['price'])

## Split the data into train and test sets

In [8]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for evaluation.
# random_state is fixed so that the split — and therefore every score computed
# from it — is identical after "Restart Kernel -> Run All".
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

## Create a Column Transformer for preprocessing Steps

In [10]:
## Numerical processing libraries
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

In [15]:
from sklearn.pipeline import make_pipeline

# Numerical features: fill missing values with the column median,
# then standardise with StandardScaler.
num_pipe = make_pipeline(
    SimpleImputer(strategy='median'),
    StandardScaler(),
)

In [16]:
## Categorical pipeline libraries
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder

In [17]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder

# Categorical features: fill missing values with the most frequent category,
# then one-hot encode.
# LabelEncoder is meant for the target vector (its fit/fit_transform accept
# only `y`), so it cannot be used as a step in a feature pipeline;
# OneHotEncoder is the feature-side encoder.
# handle_unknown='ignore' encodes categories that appear only in the test
# split as all-zeros instead of raising at transform time.
cat_pipe = make_pipeline(
    SimpleImputer(strategy='most_frequent'),
    OneHotEncoder(handle_unknown='ignore'),
)

In [18]:
## Text processing libraries
from sklearn.feature_extraction.text import CountVectorizer

In [19]:
# Text features: bag-of-words token counts via CountVectorizer.
# NOTE(review): CountVectorizer expects a 1-D iterable of strings, so inside a
# ColumnTransformer the column must be selected with a scalar name (e.g. 'name'),
# not a list of names.
text_pipe = make_pipeline(CountVectorizer())

## Column Transformation

In [None]:
# Column groups for the ColumnTransformer, taken from the feature columns
# shown after dropping `train_id` / `item_description` and separating `price`.
# NOTE(review): `item_condition_id` is treated as numeric here; move it to
# `cat_cols` if it should be handled as a category instead.
num_cols = ['item_condition_id', 'shipping']
cat_cols = ['category_name', 'brand_name']
# A single column name (string, not a list) so that a text vectorizer receives
# a 1-D sequence of documents rather than a 2-D frame.
text = 'name'

In [None]:
from sklearn.compose import ColumnTransformer

# Route each group of columns through its own preprocessing pipeline.
# Transformer names must be unique: ColumnTransformer rejects duplicate names
# when it is fitted, and the names are also the prefixes used to address the
# sub-pipelines' parameters (e.g. in a GridSearchCV parameter grid).
# `text` is expected to be a single column name (string) so that
# CountVectorizer receives 1-D input.
ct = ColumnTransformer([
    ('num', num_pipe, num_cols),
    ('cat', cat_pipe, cat_cols),
    ('text', text_pipe, text),
])