# Building an Entire (All-in-One) Model Using an sklearn Pipeline

In [19]:
import numpy as np
import pandas as pd 
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer 
from sklearn.preprocessing import OneHotEncoder 
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
# End-to-end phone price regression: load the CSV, clean the target column,
# build a preprocessing + RandomForest pipeline, evaluate it on a hold-out
# split, then tune hyperparameters with GridSearchCV over the same pipeline.

# Single seed shared by the train/test split and the forest so that the
# reported scores are reproducible from run to run.
RANDOM_STATE = 42

# Getting data ready
data = pd.read_csv('phonedatalarge.csv')
print('****DATA****\n\n',data)
print('\n\n****DATA TYPES****\n\n',data.dtypes)
print('\n\n****MISSING VALUES****\n\n',data.isna().sum())

# Cleaning Price: dropping missing values and converting type.
# Rows without a target value cannot be used for supervised training.
data.dropna(subset=['Price'],inplace = True)
# Price is read as text (e.g. '$220.00', possibly with surrounding
# whitespace). Strip the whitespace and the '$' / ',' symbols, then parse the
# remainder as a number instead of slicing off a fixed count of trailing
# characters, which mis-parses values whose padding differs.
# float -> int truncates the cents, matching the previous whole-dollar result.
data['Price'] = (
    data['Price']
    .str.strip()
    .str.replace(r'[\$,]','',regex=True)
    .astype(float)
    .astype(int)
)
print('\n\n****PRICE****\n\n',data['Price'] )

# Defining different transformers using pipeline
# Categorical columns: fill gaps with the literal 'missing', then one-hot
# encode; categories unseen during fit are encoded as all zeros.
cat_transformer = Pipeline(steps = [('imputer',SimpleImputer(strategy = 'constant', fill_value = 'missing')),
                                    ('encoder',OneHotEncoder(handle_unknown = 'ignore',sparse_output=False))])
# 'Sim card': fill gaps with the constant 5.
sim_transformer = Pipeline(steps = [('imputer',SimpleImputer(strategy = 'constant', fill_value= 5))])
# 'Memory(kb)': fill gaps with the column mean (strategy is tuned below).
num_transformer = Pipeline(steps = [('imputer',SimpleImputer(strategy = 'mean'))])

# Setting up preprocessing steps
preprocessing = ColumnTransformer(transformers = [('category',cat_transformer,['Make','Color']),('sim',sim_transformer,['Sim card']),
                                                  ('num',num_transformer,['Memory(kb)'])],remainder = 'passthrough')

# Model: preprocessing followed by a seeded random forest regressor
model = Pipeline(steps = [('preprocessing',preprocessing),
                          ('model',RandomForestRegressor(random_state = RANDOM_STATE))])

# Splitting data
x = data.drop('Price',axis=1)
y = data['Price']
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size = 0.2,random_state = RANDOM_STATE)

# Fit and evaluate
model.fit(x_train,y_train)
# Show what the fitted preprocessing step produces for the full feature table.
x_transformed = model.named_steps['preprocessing'].transform(x)
print('\n\n****TRANSFORMED X****\n\n',pd.DataFrame(x_transformed),'\n\n')
# NOTE: for a regressor, score() is the R^2 coefficient of determination,
# not a classification accuracy; the printed label is kept unchanged.
accuracy = model.score(x_test,y_test)
print('\n\n****MODEL ACCURACY****\n\n',accuracy)

# Tuning hyperparameters using GSCV and Pipeline.
# Keys use the '<step>__<param>' convention to reach into nested pipeline
# steps, so the imputer strategy is searched together with the forest settings.
hparams = {'preprocessing__num__imputer__strategy':['mean' , 'median'],
           'model__max_depth':[5,10,20],
           'model__n_estimators': [10,100,200],
           'model__min_samples_split':[2,4],
           'model__min_samples_leaf': [1,2],
           'model__max_features':['sqrt','log2']}
my_gscv_model = GridSearchCV(model,hparams,cv=5,verbose=2)
print('\n\n')
my_gscv_model.fit(x_train,y_train)
# Scored with the best estimator found by the search (also R^2).
gscv_accuracy = my_gscv_model.score(x_test,y_test)
print('\n\n****GSCV MODEL ACCURACY****\n\n',gscv_accuracy)

****DATA****

         Make   Color  Memory(kb)  Sim card     Price
0     Nokia   White     576888.0       4.0  $220.00 
1    Samsung     NaN    255566.0       4.0  $700.00 
2    Iphone    Blue     455666.0       4.0  $600.00 
3   Motorola   Green    134241.0       4.0  $350.00 
4     Nokia   Black     314235.0       3.0  $400.00 
..       ...     ...         ...       ...       ...
94    Nokia    Black    134344.0       3.0  $440.00 
95   Samsung   Blue     134344.0       3.0  $440.00 
96   Samsung   Green    134344.0       3.0  $440.00 
97  Motorola  Black     134344.0       3.0  $440.00 
98   Iphone    Gray     134344.0       3.0       NaN

[99 rows x 5 columns]


****DATA TYPES****

 Make           object
Color          object
Memory(kb)    float64
Sim card      float64
Price          object
dtype: object


****MISSING VALUES****

 Make          1
Color         2
Memory(kb)    2
Sim card      1
Price         3
dtype: int64


****PRICE****

 0     220
1     700
2     600
3     350
4