In [171]:
import pandas as pd
import plotly.graph_objects as go

### Loading the Data

In [172]:
data = pd.read_csv('./newCarsIndia.csv')
data.sample(5)

Unnamed: 0,Car,Style,Range,Transmission,VehicleType,PriceRange,Rating(outof10)
54,MG ZS EV,Compact SUV,419 Km/Full Charge,Automatic,Electric,₹ 21.99 - 25.88 L,8.2
127,Audi RS5,Premium Sports Coupe,11.1 Km/l,Automatic,Petrol,₹ 1.09 Cr,7.1
239,Mercedes-AMG GLS 63,Premium Fullsize SUV,8.5 Km/l,Automatic,Petrol,₹ 1.57 Cr,7.6
151,Mercedes-Benz E-Class,Premium Fullsize Sedan,12.06 - 14.2 Km/l,Automatic,"Petrol,Diesel",₹ 67 - 85 L,7.8
196,Volkswagen Tiguan AllSpace,SUV,11 Km/l,Automatic,Petrol,₹ 34.2 L,8.3


#### Size of Data

In [178]:
data.shape

(255, 9)

#### Unique Cars

In [120]:
data['Car'].nunique()

255

#### Checking Null Values in The Data

In [117]:
data.isnull().sum()

Car                 0
Style               0
Range              11
Transmission        0
VehicleType         0
PriceRange          0
Rating(outof10)     0
Make                0
max_range          11
dtype: int64

### Data Cleaning and Analysis

#### Creating a column for Make of the Car

- Replacing car brands with two or more words eg Maruti Suzuki  with '-' instead of space, so they can be cleanly seperated. 

In [173]:

data['Car'] = data['Car'].str.lower()
data['Car'] = data['Car'].str.replace('maruti suzuki',
                                      'maruti-suzuki').str.replace(
                                          'aston martin',
                                          'aston-martin').str.replace(
                                              'land rover', 'land-rover')
data['Make'] = data['Car'].str.split(' ').str.get(0)

In [174]:
data['Make'].unique()

array(['tata', 'hyundai', 'maruti-suzuki', 'mg', 'kia', 'mahindra',
       'toyota', 'honda', 'volkswagen', 'audi', 'nissan', 'skoda',
       'renault', 'bmw', 'bajaj', 'mercedes-benz', 'jeep', 'mercedes-amg',
       'datsun', 'rolls-royce', 'ford', 'isuzu', 'land-rover', 'jaguar',
       'lexus', 'force', 'volvo', 'lamborghini', 'citroen', 'ferrari',
       'mini', 'mercedes-maybach', 'porsche', 'aston-martin', 'bentley',
       'maserati', 'byd', 'ssangyong'], dtype=object)

In [175]:
data.sample(3)

Unnamed: 0,Car,Style,Range,Transmission,VehicleType,PriceRange,Rating(outof10),Make
31,audi q7,Premium Fullsize SUV,,Automatic,Petrol,₹ 82.49 - 89.9 L,8.4,audi
104,jaguar i-pace,Premium Midsize Sedan,470 Km/Full Charge,Automatic,Electric,₹ 1.06 - 1.12 Cr,na,jaguar
238,maserati quattroporte,Premium Luxury Sedan,7.9 - 10.7 Km/l,Automatic,"Petrol,Diesel",₹ 1.63 - 2.51 Cr,7.6,maserati


#### Creating a Column for Max Range of a Car

- Using extractall, extracting all the numeric values in the Range column
- Using Unstack making them columns instead of rows
- Selecting the max value among them as "max_range"

In [176]:
pattern = r"([0-9]+.[0-9]*)"
data['max_range'] = data['Range'].str.extractall(pattern).unstack().astype(
    float).max(axis=1)

In [177]:
data.sample(5)

Unnamed: 0,Car,Style,Range,Transmission,VehicleType,PriceRange,Rating(outof10),Make,max_range
105,audi q5,Premium Midsize SUV,13.5 Km/l,Automatic,Petrol,₹ 59.9 - 65.55 L,7.5,audi,13.5
110,land-rover range rover velar,SUV,9.2 - 16.6 Km/l,Automatic,"Petrol,Diesel",₹ 79.87 - 80.71 L,na,land-rover,16.6
149,bmw 7 series,Premium Luxury Sedan,11.86 - 39.53 Km/l,Automatic,"Diesel,Petrol,Hybrid",₹ 1.42 - 1.76 Cr,7.7,bmw,39.53
101,citroen c5 aircross,Compact SUV,18.6 Km/l,Automatic,Diesel,₹ 31.3 - 32.8 L,na,citroen,18.6
102,lamborghini huracan,Supersports Car,10.3 - 11.91 Km/l,Automatic,Petrol,₹ 3.22 - 4.1 Cr,8.5,lamborghini,11.91


#### Filtering Cars with max_range greater than 100

In [128]:
data.query('max_range>100')

Unnamed: 0,Car,Style,Range,Transmission,VehicleType,PriceRange,Rating(outof10),Make,max_range,max_price
0,tata nexon ev,Compact SUV,312 Km/Full Charge,Automatic,Electric,₹ 13.99 - 17.4 L,8.5,tata,312.0,1740000.0
32,tata tigor ev,Subcompact Sedan,306 Km/Full Charge,Automatic,Electric,₹ 12.49 - 13.64 L,na,tata,306.0,1364000.0
52,tata nexon ev max,Compact SUV,437 Km/Full Charge,Automatic,Electric,₹ 17.74 - 19.24 L,na,tata,437.0,1924000.0
54,mg zs ev,Compact SUV,419 Km/Full Charge,Automatic,Electric,₹ 21.99 - 25.88 L,8.2,mg,419.0,2588000.0
71,hyundai kona electric,Compact SUV,452 Km/Full Charge,Automatic,Electric,₹ 23.79 - 23.98 L,8.7,hyundai,452.0,2398000.0
104,jaguar i-pace,Premium Midsize Sedan,470 Km/Full Charge,Automatic,Electric,₹ 1.06 - 1.12 Cr,na,jaguar,470.0,11200000.0
113,audi e-tron gt,Premium Coupe,388 Km/Full Charge,Automatic,Electric,₹ 1.8 Cr,na,audi,388.0,18000000.0
170,byd e6,Subcompact MPV,415 Km/Full Charge,Automatic,Electric,₹ 29.15 L,na,byd,415.0,2915000.0
177,mercedes-benz eqc,Compact SUV,471 Km/Full Charge,Automatic,Electric,₹ 1 Cr,8.2,mercedes-benz,471.0,10000000.0
192,bmw ix,Premium Fullsize SUV,425 Km/Full Charge,Automatic,Electric,₹ 1.16 Cr,na,bmw,425.0,11600000.0


#### Cleaning, Extracting and Convering in the Max Price into Indian Rupee

- Replacing '₹' with a space
- Converting lakh 'L' by multiplying 10^5 and 'Cr' with 10^7
- Doing it in the same dataframe by creating specific filters

In [181]:
data['max_price'] = data['PriceRange'].str.split('-').str.get(-1).str.replace('₹','')

In [182]:
data['max_price'].isnull().sum() 

0

In [183]:
lakh_filter = data['max_price'].str.contains('L')
crore_filter = data['max_price'].str.contains('Cr')

In [184]:
data.loc[lakh_filter, 'max_price'] = data.loc[lakh_filter, 'max_price'].apply(
    lambda x: float(x.split(' ')[1]) * 100000)
data.loc[crore_filter,
         'max_price'] = data.loc[crore_filter, 'max_price'].apply(
             lambda x: float(x.split(' ')[1]) * 10000000)

In [185]:
data.sample(3)

Unnamed: 0,Car,Style,Range,Transmission,VehicleType,PriceRange,Rating(outof10),Make,max_range,max_price
17,hyundai elite i20,Premium Hatchback,18.6 - 25.2 Km/l,"AMT,Automatic,Manual","Petrol,Diesel",₹ 6.98 - 11.47 L,8.2,hyundai,25.2,1147000.0
210,ferrari roma,Sports Car,5.8 Km/l,Manual,Petrol,₹ 3.61 Cr,na,ferrari,5.8,36100000.0
14,toyota glanza,Premium Hatchback,22.35 - 22.94 Km/l,"AMT,Manual",Petrol,₹ 6.39 - 9.69 L,7.7,toyota,22.94,969000.0


#### Top Cars by Style

- The Goal is to Identify top 5% Cars by Price in each of the Style Segment eg: SUV, Compact Sedan
    - Calculate the 95th percentile
    - Using Transform Function, make a new column and add it as a value.
    - Filter cars where "max_price" > 95th percentile value.

In [186]:
data['percentile_by_style'] = data.groupby(
    ['Style'])['max_price'].transform(lambda x: x.quantile(.95))

In [187]:
data.sample(4)

Unnamed: 0,Car,Style,Range,Transmission,VehicleType,PriceRange,Rating(outof10),Make,max_range,max_price,percentile_by_style
114,land-rover discovery,Luxury SUV,9.2 Km/l,Automatic,"Petrol,Diesel",₹ 88.06 L - 1.26 Cr,7.9,land-rover,9.2,12600000.0,47800000.0
33,nissan magnite,Subcompact SUV,17.7 - 20 Km/l,"Automatic,Manual",Petrol,₹ 5.59 - 10 L,na,nissan,20.0,1000000.0,1390500.0
27,honda city,Compact Sedan,17.8 - 24.1 Km/l,"Automatic,Manual","Petrol,Diesel,Hybrid",₹ 11.29 - 19.5 L,8.2,honda,24.1,1950000.0,1826100.0
69,land-rover range rover,Luxury SUV,7.5 - 12.8 Km/l,Automatic,"Diesel,Petrol",₹ 2.32 - 3.41 Cr,7.7,land-rover,12.8,34100000.0,47800000.0


In [188]:
data.query('percentile_by_style<max_price').sort_values(by='Style')

Unnamed: 0,Car,Style,Range,Transmission,VehicleType,PriceRange,Rating(outof10),Make,max_range,max_price,percentile_by_style
60,maruti-suzuki swift,Compact Hatchback,22 Km/l,"AMT,Manual",Petrol,₹ 5.92 - 8.85 L,8.0,maruti-suzuki,22.0,885000.0,876500.0
66,mahindra marazzo,Compact MPV,17.6 Km/l,Manual,Diesel,₹ 12.42 - 14.57 L,8.3,mahindra,17.6,1457000.0,1439200.0
217,audi e-tron,Compact SUV,400 Km/Full Charge,Automatic,Electric,₹ 1.01 - 1.19 Cr,na,audi,400.0,11900000.0,10190000.0
116,mercedes-benz e-class all-terrain,Compact SUV/Crossover,12.1 Km/l,Automatic,Diesel,₹ 77.25 L,na,mercedes-benz,12.1,7725000.0,7217250.0
27,honda city,Compact Sedan,17.8 - 24.1 Km/l,"Automatic,Manual","Petrol,Diesel,Hybrid",₹ 11.29 - 19.5 L,8.2,honda,24.1,1950000.0,1826100.0
29,tata tiago,Entry Hatchback,23.84 Km/l,"AMT,Manual","Petrol,Petrol+CNG",₹ 5.38 - 7.53 L,8.2,tata,23.84,753000.0,744000.0
154,toyota camry,Fullsize Sedan,19.16 Km/l,Automatic,Hybrid,₹ 43.45 L,7.4,toyota,19.16,4345000.0,4307000.0
203,mercedes-benz v-class,Fullsize/Premium MPV,16 Km/l,Automatic,Diesel,₹ 71.1 L - 1.1 Cr,7.9,mercedes-benz,16.0,11000000.0,10616000.0
117,mercedes-amg a 45,Luxury Hatchback,,Automatic,Petrol,₹ 81.5 L,na,mercedes-amg,,8150000.0,7445000.0
197,rolls-royce cullinan,Luxury SUV,7 Km/l,Automatic,Petrol,₹ 6.95 Cr,na,rolls-royce,7.0,69500000.0,47800000.0


#### Maximum Range of Cars by Make

In [189]:
data.columns

Index(['Car', 'Style', 'Range', 'Transmission', 'VehicleType', 'PriceRange',
       'Rating(outof10)', 'Make', 'max_range', 'max_price',
       'percentile_by_style'],
      dtype='object')

In [190]:
petrol_filter = data['VehicleType'].str.contains('Petrol')
data.loc[data.loc[petrol_filter,:].groupby('Make')['max_range'].idxmax()][[
    'Make', 'Car', 'max_range', 'VehicleType'
]]

Unnamed: 0,Make,Car,max_range,VehicleType
143,aston-martin,aston-martin db11,12.0,Petrol
106,audi,audi a4,17.84,Petrol
46,bajaj,bajaj qute,43.0,"Petrol,Petrol+CNG"
234,bentley,bentley continental,10.6,Petrol
149,bmw,bmw 7 series,39.53,"Diesel,Petrol,Hybrid"
132,datsun,datsun redi go,22.0,Petrol
236,ferrari,ferrari sf90 stradale,18.0,Petrol
130,ford,ford figo aspire,24.4,"Petrol,Diesel"
168,honda,honda wr-v,25.5,"Petrol,Diesel"
70,hyundai,hyundai new santro,30.48,"Petrol,Petrol+CNG"


#### Identify Maximum Range of Electric Cars by Make

In [136]:
electric_filter = data['VehicleType'].str.contains('Electric')
data.loc[data.loc[electric_filter, :].dropna(
    subset=['Range']).groupby('Make')['max_range'].idxmax()][[
        'Make', 'Car', 'max_range', 'VehicleType'
    ]]

Unnamed: 0,Make,Car,max_range,VehicleType
245,audi,audi rs e-tron gt,401.0,Electric
192,bmw,bmw ix,425.0,Electric
170,byd,byd e6,415.0,Electric
71,hyundai,hyundai kona electric,452.0,Electric
104,jaguar,jaguar i-pace,470.0,Electric
177,mercedes-benz,mercedes-benz eqc,471.0,Electric
54,mg,mg zs ev,419.0,Electric
254,mini,mini se,270.0,Electric
52,tata,tata nexon ev max,437.0,Electric


#### Most Expensive Cars by Make

In [137]:
data['max_price'] = data['max_price'].astype('float')
data.loc[data.groupby('Style')['max_price'].idxmax()][[
    'Style', 'Car', 'Make', 'max_price'
]].sort_values('Make').reset_index(drop=True)

Unnamed: 0,Style,Car,Make,max_price
0,Premium Sports Sedan,aston-martin dbx,aston-martin,38200000.0
1,Compact SUV,audi e-tron,audi,11900000.0
2,Premium Midsize SUV,audi rs e-tron gt,audi,20500000.0
3,Performance Sedan,audi rs7 sportback,audi,22400000.0
4,Quadricycle,bajaj qute,bajaj,284000.0
5,Subcompact MPV,byd e6,byd,2915000.0
6,Sports Car,ferrari sf90 stradale,ferrari,75000000.0
7,Premium Sports Coupe,ferrari gtc4lusso,ferrari,52000000.0
8,Compact Sedan,honda city,honda,1950000.0
9,Premium Hatchback,hyundai i20 n line,hyundai,1197000.0


#### Percentage of Cars in different "Max Range" Segments

In [138]:
data.groupby(pd.cut(data['max_range'], [0, 10, 20, 30, 40, 100, 500])).agg({
    'Car':
    'count'
}).assign(pct_total=lambda x: round(x / x.sum() * 100))

Unnamed: 0_level_0,Car,pct_total
max_range,Unnamed: 1_level_1,Unnamed: 2_level_1
"(0, 10]",34,14.0
"(10, 20]",147,60.0
"(20, 30]",41,17.0
"(30, 40]",5,2.0
"(40, 100]",3,1.0
"(100, 500]",14,6.0


#### Identifying Highest Rated Cars with only Manual Transmission

In [192]:
#Making the Transmission Column a List
data['Transmission']=data['Transmission'].str.split(',')

In [193]:
from sklearn.preprocessing import MultiLabelBinarizer
#To One Hot Encode the Transmission Column
mlb = MultiLabelBinarizer(sparse_output=True)
data = data.join(
            pd.DataFrame.sparse.from_spmatrix(
                mlb.fit_transform(data.pop('Transmission')),
                index=data.index,
                columns=mlb.classes_))

In [195]:
#Added Three New columns 'AMT','Automatic', 'Manual'
data.columns

Index(['Car', 'Style', 'Range', 'VehicleType', 'PriceRange', 'Rating(outof10)',
       'Make', 'max_range', 'max_price', 'percentile_by_style', 'AMT',
       'Automatic', 'Manual'],
      dtype='object')

In [197]:
data.head(2)

Unnamed: 0,Car,Style,Range,VehicleType,PriceRange,Rating(outof10),Make,max_range,max_price,percentile_by_style,AMT,Automatic,Manual
0,tata nexon ev,Compact SUV,312 Km/Full Charge,Electric,₹ 13.99 - 17.4 L,8.5,tata,312.0,1740000.0,10190000.0,0,1,0
1,hyundai grand i10 nios,Compact Hatchback,20.2 - 26.2 Km/l,"Petrol,Petrol+CNG,Diesel",₹ 5.3 - 8.51 L,8.5,hyundai,26.2,851000.0,876500.0,1,0,1


In [198]:
#Looking at Ratings which has "na" string
na_rating_filter = data['Rating(outof10)']=='na'

In [199]:
data.loc[na_rating_filter,:].groupby(['Make']).size()

Make
aston-martin        3
audi                6
bajaj               1
bmw                 5
byd                 1
citroen             1
ferrari             5
hyundai             2
jaguar              1
kia                 2
lamborghini         2
land-rover          2
lexus               2
mahindra            6
mercedes-amg        7
mercedes-benz       3
mercedes-maybach    2
mg                  2
mini                1
nissan              1
porsche             1
renault             1
rolls-royce         2
skoda               2
tata                4
toyota              2
volkswagen          1
dtype: int64

In [200]:
data.loc[na_rating_filter,:].groupby(['Style']).size()

Style
Compact SUV              8
Compact SUV/Crossover    1
Entry Hatchback          1
Luxury Hatchback         3
Luxury SUV               4
Midsize SUV              1
Premium Coupe            8
Premium Fullsize SUV     3
Premium Hatchback        1
Premium Luxury Sedan     1
Premium Midsize SUV      3
Premium Midsize Sedan    5
Premium Sports Coupe     4
Premium Sports Sedan     3
Quadricycle              1
SUV                      8
Sports Car               3
Subcompact MPV           2
Subcompact SUV           5
Subcompact Sedan         1
Supersports Car          2
dtype: int64

In [202]:
#slicing data which has valid Ratings Data
not_na_rating_filter = data['Rating(outof10)']!='na'

In [203]:
data.loc[not_na_rating_filter,:].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 187 entries, 0 to 252
Data columns (total 13 columns):
 #   Column               Non-Null Count  Dtype           
---  ------               --------------  -----           
 0   Car                  187 non-null    object          
 1   Style                187 non-null    object          
 2   Range                184 non-null    object          
 3   VehicleType          187 non-null    object          
 4   PriceRange           187 non-null    object          
 5   Rating(outof10)      187 non-null    object          
 6   Make                 187 non-null    object          
 7   max_range            184 non-null    float64         
 8   max_price            187 non-null    object          
 9   percentile_by_style  187 non-null    float64         
 10  AMT                  187 non-null    Sparse[int64, 0]
 11  Automatic            187 non-null    Sparse[int64, 0]
 12  Manual               187 non-null    Sparse[int64, 0]
dtypes: Sp

In [204]:
ratings = data.loc[not_na_rating_filter,:]

In [206]:
#Converting Ratings from Object to Float data
ratings.loc[:, 'Rating(outof10)'] = ratings.loc[:,
                                                'Rating(outof10)'].astype(float)

In [207]:
ratings.loc[ratings.groupby(['Make'])['Rating(outof10)'].idxmax(), :].query(
    'Manual==1 & AMT==0 & Automatic==0')

Unnamed: 0,Car,Style,Range,VehicleType,PriceRange,Rating(outof10),Make,max_range,max_price,percentile_by_style,AMT,Automatic,Manual
91,force motors gurkha,SUV,,Diesel,₹ 13.59 L,6.1,force,,1359000.0,7797700.0,0,0,1
68,isuzu d-max,SUV,14.4 Km/l,Diesel,₹ 6.87 - 8.09 L,8.4,isuzu,14.4,809000.0,7797700.0,0,0,1
