# Car Price Prediction Service - Feature Pipeline

## Installing Hopsworks, the feature store used in this project.


In [None]:
# Install (or upgrade, via -U) the Hopsworks client library; --quiet reduces pip's log output.
! pip install -U hopsworks --quiet

[K     |████████████████████████████████| 120 kB 8.2 MB/s 
[K     |████████████████████████████████| 50 kB 3.1 MB/s 
[K     |████████████████████████████████| 132 kB 44.6 MB/s 
[K     |████████████████████████████████| 45 kB 2.8 MB/s 
[K     |████████████████████████████████| 68 kB 3.6 MB/s 
[K     |████████████████████████████████| 43 kB 1.5 MB/s 
[K     |████████████████████████████████| 4.9 MB 52.4 MB/s 
[K     |████████████████████████████████| 42 kB 1.4 MB/s 
[K     |████████████████████████████████| 2.8 MB 55.8 MB/s 
[K     |████████████████████████████████| 2.3 MB 42.8 MB/s 
[K     |████████████████████████████████| 4.1 MB 23.7 MB/s 
[K     |████████████████████████████████| 67 kB 4.9 MB/s 
[K     |████████████████████████████████| 109 kB 66.1 MB/s 
[K     |████████████████████████████████| 140 kB 68.0 MB/s 
[K     |████████████████████████████████| 1.6 MB 42.3 MB/s 
[K     |████████████████████████████████| 127 kB 67.7 MB/s 
[K     |████████████████████████████

Run in either "Backfill" or "Normal" operation. 
Backfill is for historical data (here, data scraped from cardekho.com and imported from Kaggle). Training will be done with this dataset.
Normal operation is for batch inference; it simulates real data being streamed in from a storage system.

If `BACKFILL == True`, we load our DataFrame with data from the car_prices_clean_v2.csv file.

Otherwise (`BACKFILL == False`), we load our DataFrame with one (synthetic) car detail sample.

In [None]:
import random
import pandas as pd
import hopsworks

# Operation mode switch: True loads the historical CSV, False generates a single synthetic vehicle.
BACKFILL = False

## Synthetic Data generation

In [None]:
def generate_vehicle(full_name, selling_price_max, selling_price_min, age_max, age_min,
                     km_driven_max, km_driven_min, mileage_max, mileage_min, engine_max,
                     engine_min, max_power_max, max_power_min, seats, seller_type, 
                     owner_type, fuel_type, transmission_type):
  """
    Build a one-row DataFrame describing a single synthetic vehicle.

    The numeric features are sampled with the `random` module:
      * selling_price, mileage, engine and max_power are floats drawn with
        random.uniform between their *_max and *_min bounds;
      * age and km_driven are integers drawn with random.randint, which
        includes both endpoints.
    full_name, seats, seller_type, owner_type, fuel_type and transmission_type
    are copied into the row unchanged.

    Returns a DataFrame with exactly one row and the columns, in order:
    full_name, selling_price, seller_type, km_driven, owner_type, fuel_type,
    transmission_type, mileage, engine, max_power, seats, age.
  """
  # Sample in a fixed sequence so a seeded run always produces the same vehicle.
  sampled_price = random.uniform(selling_price_max, selling_price_min)
  sampled_age = random.randint(age_min, age_max)
  sampled_km = random.randint(km_driven_min, km_driven_max)
  sampled_mileage = random.uniform(mileage_max, mileage_min)
  sampled_engine = random.uniform(engine_max, engine_min)
  sampled_power = random.uniform(max_power_max, max_power_min)

  # Assemble the row directly in the final column order.
  return pd.DataFrame({
      'full_name': [full_name],
      'selling_price': [sampled_price],
      'seller_type': [seller_type],
      'km_driven': [sampled_km],
      'owner_type': [owner_type],
      'fuel_type': [fuel_type],
      'transmission_type': [transmission_type],
      'mileage': [sampled_mileage],
      'engine': [sampled_engine],
      'max_power': [sampled_power],
      'seats': [seats],
      'age': [sampled_age],
  })

def get_random_vehicle():
  """
    Returns a DataFrame containing one random vehicle chosen from the synthetic data

    One of seven predefined car models is picked with equal probability and a
    single row is generated for it through generate_vehicle. Only the chosen
    model is generated.
  """
  # Each tuple holds the positional arguments of generate_vehicle:
  # (full_name, selling_price_max, selling_price_min, age_max, age_min,
  #  km_driven_max, km_driven_min, mileage_max, mileage_min, engine_max,
  #  engine_min, max_power_max, max_power_min, seats, seller_type,
  #  owner_type, fuel_type, transmission_type)
  vehicle_specs = [
      # NOTE(review): km_driven max and min are both 100000 here, so this model
      # always gets exactly 100000 km — confirm that is intended.
      ('Maruti Swift Dzire VDI', 6, 1, 10, 3,
       100000, 100000, 22, 16, 800,
       600, 70, 30, 5, 'Dealer', 'First Owner', 'Diesel', 'Manual'),
      ('Hyundai_i20_Asta', 6, 3, 8, 3,
       100000, 20000, 18, 12, 1400,
       1200, 90, 70, 5, 'Individual', 'First Owner', 'Petrol', 'Manual'),
      ('Ford_Ecosport', 7, 4, 8, 4,
       100000, 20000, 18, 12, 1700,
       1200, 120, 80, 5, 'Dealer', 'First Owner', 'Diesel', 'Manual'),
      ('Toyota_Yaris', 7, 4, 10, 5,
       100000, 20000, 18, 12, 1700,
       1200, 120, 80, 5, 'Dealer', 'First Owner', 'Diesel', 'Manual'),
      ('Volkswagen_Vento', 7, 4, 9, 5,
       100000, 20000, 18, 12, 1700,
       1200, 120, 80, 5, 'Dealer', 'First Owner', 'Diesel', 'Manual'),
      ('Honda_City', 7, 4, 7, 5,
       100000, 20000, 18, 12, 1700,
       1200, 120, 80, 5, 'Dealer', 'First Owner', 'Diesel', 'Manual'),
      # This entry previously passed model years (2016, 2010) into the age
      # parameters, producing ages in the thousands. Converted to ages
      # assuming a 2022 reference year (2022-2010=12, 2022-2016=6) — TODO confirm.
      ('Tata_Indigo', 6, 3, 12, 6,
       100000, 20000, 18, 12, 1400,
       1200, 90, 70, 5, 'Dealer', 'First Owner', 'Diesel', 'Manual'),
      # TODO: add specs for Nissan_Micra, Renault_Duster and Mercedes_Benz.
  ]

  # random.choice gives every model the same 1-in-7 chance.
  car_df = generate_vehicle(*random.choice(vehicle_specs))
  return car_df

In [None]:
# Category levels of each one-hot encoded column. They mirror the dummy columns
# produced by the backfill run of the CSV, and are fixed here so that a
# single-row synthetic sample yields the same set of columns as the backfill.
CATEGORY_LEVELS = {
    'seller_type': ['Dealer', 'Individual', 'Trustmark Dealer'],
    'owner_type': ['First Owner', 'Second Owner'],
    'fuel_type': ['CNG', 'Diesel', 'Electric', 'Petrol'],
    'transmission_type': ['Automatic', 'Manual'],
}


def encode_categoricals(df):
    """
    One-hot encode the categorical car columns against CATEGORY_LEVELS.

    Every level gets a dummy column even when it does not occur in `df`, so
    the resulting columns do not depend on which rows are present. Dummy
    columns whose level contains a space are renamed to use underscores.

    Raises ValueError if a column holds a value that is not a known level,
    rather than silently encoding that row as all zeros.
    """
    encoded = df.copy()
    for column, levels in CATEGORY_LEVELS.items():
        unknown = set(encoded[column].dropna()) - set(levels)
        if unknown:
            raise ValueError(f"Unexpected values in '{column}': {sorted(unknown)}")
        # A categorical dtype makes get_dummies emit a column for unobserved levels too.
        encoded[column] = pd.Categorical(encoded[column], categories=levels)
    encoded = pd.get_dummies(data=encoded, columns=list(CATEGORY_LEVELS))
    return encoded.rename(columns = {'seller_type_Trustmark Dealer': 'seller_type_Trustmark_Dealer', 
                                     'owner_type_First Owner': 'owner_type_First_Owner',
                                     'owner_type_Second Owner': 'owner_type_Second_Owner'})


if BACKFILL:
    # Historical data for the initial load of the feature group.
    car_df = pd.read_csv("/content/car_prices_clean_v2.csv")
else:
    # Normal operation: a single synthetic sample.
    car_df = get_random_vehicle()

car_df = encode_categoricals(car_df)

car_df.head()

Unnamed: 0,full_name,selling_price,km_driven,mileage,engine,max_power,seats,age,seller_type_Dealer,seller_type_Individual,seller_type_Trustmark_Dealer,owner_type_First_Owner,owner_type_Second_Owner,fuel_type_CNG,fuel_type_Diesel,fuel_type_Electric,fuel_type_Petrol,transmission_type_Automatic,transmission_type_Manual
0,Toyota Land Cruiser 200 V8 Petrol,92.0,80000,9.0,4461.0,285.4,8.0,6,1,0,0,1,0,0,0,0,1,1,0
1,Land Rover Range Rover Sport 3.0 D SE,92.0,9500,12.65,2993.0,296.0,5.0,5,1,0,0,1,0,0,1,0,0,1,0
2,Toyota Land Cruiser Prado VXL,87.5,36000,11.0,2982.0,170.63,7.0,3,1,0,0,1,0,0,1,0,0,1,0
3,BMW 7 Series 730Ld DPE Signature,85.0,50000,16.77,2993.0,261.49,5.0,4,0,1,0,1,0,0,1,0,0,1,0
4,BMW 7 Series 730Ld DPE Signature,85.0,19000,17.66,2993.0,355.37,4.0,4,1,0,0,1,0,0,1,0,0,1,0


In [None]:
# Log in to Hopsworks; in the recorded run this prompted for an API key interactively.
project = hopsworks.login()
# Handle to the project's feature store, used later to get or create the feature group.
fs = project.get_feature_store()

Copy your Api Key (first register/login): https://c.app.hopsworks.ai/account/api/generated

Paste it here: ··········
Connected. Call `.close()` to terminate connection gracefully.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/3342
Connected. Call `.close()` to terminate connection gracefully.




In [None]:
# Get version 1 of the "car_prices" feature group, creating it if it does not exist yet.
# NOTE(review): the primary key is made up of feature values (including
# selling_price), so rows sharing all seven values presumably map to the same
# key — confirm this is intended.
car_fg = fs.get_or_create_feature_group(
    name="car_prices",
    version=1,
    primary_key=[
        "selling_price",
        "km_driven",
        "mileage",
        "engine",
        "max_power",
        "seats",
        "age",
    ],
    description="Car Prices Dataset",
)
# Write the prepared DataFrame into the feature group.
car_fg.insert(car_df)



Feature Group created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/3342/fs/3289/fg/5570


Uploading Dataframe: 0.00% |          | Rows 0/10321 | Elapsed Time: 00:00 | Remaining Time: ?

Launching offline feature group backfill job...
Backfill Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai/p/3342/jobs/named/car_prices_1_offline_fg_backfill/executions


(<hsfs.core.job.Job at 0x7f9b5c659a50>, None)