In [1]:
import matplotlib
matplotlib.use("TkAgg")
import pandas as pd
from sklearn.impute import KNNImputer
import matplotlib.pyplot as plt
from matplotlib.colors import LogNorm
import geopandas as gpd
from geopy.geocoders import Nominatim
from shapely.geometry import Point
from geopy.extra.rate_limiter import RateLimiter
import geodatasets
import mplcursors


In [2]:
# The file's first column is unnamed and simply counts 0, 1, 2, ... in the
# preview, i.e. it looks like a saved row index. Read it as the index so it
# does not show up as a stray "Unnamed: 0" data column.
cars = pd.read_csv("Used_Car_Price_Prediction.csv", index_col=0)
cars.head(10)

Unnamed: 0,car_name,yr_mfr,fuel_type,kms_run,sale_price,city,times_viewed,body_type,transmission,variant,...,total_owners,broker_quote,original_price,car_rating,ad_created_on,fitness_certificate,emi_starts_from,booking_down_pymnt,reserved,warranty_avail
0,maruti swift,2015,petrol,8063,386399,noida,18715,hatchback,manual,lxi opt,...,2,397677,404177.0,great,2021-04-04T07:09:18.583,True,8975,57960,False,False
1,maruti alto 800,2016,petrol,23104,265499,noida,2676,hatchback,manual,lxi,...,1,272935,354313.0,great,2021-03-22T14:07:32.833,True,6167,39825,False,False
2,hyundai grand i10,2017,petrol,23402,477699,noida,609,hatchback,manual,sports 1.2 vtvt,...,1,469605,,great,2021-03-20T05:36:31.311,True,11096,71655,False,False
3,maruti swift,2013,diesel,39124,307999,noida,6511,hatchback,manual,vdi,...,1,294262,374326.0,great,2021-01-21T12:59:19.299,True,7154,46200,False,False
4,hyundai grand i10,2015,petrol,22116,361499,noida,3225,hatchback,manual,magna 1.2 vtvt,...,1,360716,367216.0,great,2021-04-01T13:33:40.733,True,8397,54225,False,False
5,maruti alto k10,2018,petrol,23534,335299,noida,1055,hatchback,,vxi (o) amt,...,1,343212,439056.0,great,2021-04-13T05:55:16.99,True,7788,50295,False,False
6,maruti ritz,2012,diesel,41213,281999,noida,909,hatchback,manual,vdi,...,1,201200,,great,2020-12-29T07:26:25.321,True,6550,42300,False,False
7,hyundai i20,2012,petrol,38328,321499,noida,2760,hatchback,manual,asta 1.2,...,3,319200,410764.0,great,2021-02-25T15:47:30.3,True,7468,48225,False,False
8,hyundai elite i20,2014,diesel,56402,456199,noida,2475,hatchback,manual,magna 1.4 crdi,...,1,452023,566123.0,great,2021-03-13T11:57:25.71,True,10596,68430,False,False
9,renault kwid,2018,petrol,32703,281299,noida,2497,hatchback,manual,rxl,...,1,264597,344127.0,great,2021-03-20T06:52:56.488,True,6534,42195,False,False


## KNN Imputation

In [3]:
# KNNImputer finds neighbours using the *other* columns it is given. Fitted on
# original_price alone it has no usable distances for rows where that value is
# missing and falls back to the plain column mean. Give it the related price
# columns (presumably in the same currency units, so no rescaling is applied).
KNN_FEATURES = ["original_price", "sale_price", "broker_quote"]

imputer = KNNImputer(n_neighbors=5, weights='distance')
imputed = imputer.fit_transform(cars[KNN_FEATURES])

# Output columns follow the order of KNN_FEATURES; only the target is written back.
cars['original_price'] = imputed[:, 0]

In [4]:
# Label listings that have no recorded transmission explicitly instead of leaving NaN.
missing_transmission = cars["transmission"].isna()
cars.loc[missing_transmission, "transmission"] = "unknown"

First, we will **create a GeoPandas GeoDataFrame** so we can make map plots. We start by geocoding each of the cities in the dataframe.


In [5]:
# Give slow responses more room than geopy's short default timeout; the
# failure captured in this cell's output happened while waiting on the HTTP
# response, which is likely a timeout.
geolocator = Nominatim(user_agent="city_mapper", timeout=10)

# Rate-limited wrapper around geolocator.geocode; call geocode(query) as usual.
geocode = RateLimiter(
    geolocator.geocode,
    min_delay_seconds=3,       # at least 3 seconds between calls (Nominatim's policy allows at most 1 request/second)
    max_retries=2,             # retry a couple of times if it fails
    error_wait_seconds=3,      # wait 3 seconds before retrying after an error
    swallow_exceptions=False   # once retries are used up, raise instead of returning None
)

def geocode_city(city, country="India"):
    """Look up "<city>, <country>" with the rate-limited geocoder.

    Returns:
        A (latitude, longitude) tuple taken from the geocoder's result, or
        (None, None) when the geocoder returns nothing for the query.
    """
    match = geocode(f"{city}, {country}")
    if not match:
        return None, None
    return match.latitude, match.longitude

# Unique, title-cased city names. Missing cities are dropped first:
# astype("str") would otherwise turn NaN into the literal "nan" and send
# "Nan, India" to the geocoder.
city_names = cars["city"].dropna().astype("str").str.title().unique()

lats, lons = [], []
for city in city_names:
    print(f"Geocoding: {city}...")
    lat, lon = geocode_city(city)
    lats.append(lat)
    lons.append(lon)

# Build the frame in one go instead of growing it cell by cell with .loc.
# The float64 dtype stores failed lookups (None) as NaN.
cities = pd.DataFrame({
    "city": city_names,
    "lat": pd.Series(lats, dtype="float64"),
    "lon": pd.Series(lons, dtype="float64"),
})

# Save geocoded cities
cities.to_csv("cities_geocoded_india.csv", index=False)

Geocoding: Noida...
Geocoding: Gurgaon...
Geocoding: Bengaluru...
Geocoding: New Delhi...
Geocoding: Mumbai...
Geocoding: Pune...


RateLimiter caught an error, retrying (0/2 tries). Called with (*('Pune, India',), **{}).
Traceback (most recent call last):
  File "C:\Users\Noelle Haviland\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.13_qbz5n2kfra8p0\LocalCache\local-packages\Python313\site-packages\urllib3\connectionpool.py", line 534, in _make_request
    response = conn.getresponse()
  File "C:\Users\Noelle Haviland\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.13_qbz5n2kfra8p0\LocalCache\local-packages\Python313\site-packages\urllib3\connection.py", line 565, in getresponse
    httplib_response = super().getresponse()
  File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.13_3.13.2544.0_x64__qbz5n2kfra8p0\Lib\http\client.py", line 1430, in getresponse
    response.begin()
    ~~~~~~~~~~~~~~^^
  File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.13_3.13.2544.0_x64__qbz5n2kfra8p0\Lib\http\client.py", line 331, in begin
    version, status, reason = self._rea

Geocoding: Hyderabad...
Geocoding: Chennai...
Geocoding: Kolkata...
Geocoding: Ahmedabad...
Geocoding: Faridabad...
Geocoding: Ghaziabad...
Geocoding: Lucknow...


Now, we can create a GeoDataFrame that includes latitude and longitude information for each city.


In [6]:
# Create the GeoDataFrame: every listing row gets its city's lat/lon and a point geometry.
cars["city"] = cars["city"].astype("str").str.title()
df = pd.merge(cars, cities, how="outer", on="city")

# points_from_xy builds all points in one vectorised call and casts the
# coordinates to float, so a lookup that was stored as None becomes NaN here
# rather than being handed to Point(...) row by row.
geometry = gpd.points_from_xy(df["lon"], df["lat"])
map_cars = gpd.GeoDataFrame(df, geometry=geometry, crs="EPSG:4326")

Now, we can load in a basic map of India.

In [7]:
world = gpd.read_file("ne_110m_admin_0_countries/ne_110m_admin_0_countries.shp")
world = world.set_crs("EPSG:4326")

# Extract India
india = world[world["ADMIN"] == "India"]

# map_cars has one row per listing, so plotting and labelling it directly
# stacks a marker and a text label for every listing on top of its city.
# Keep a single located row per city for drawing.
city_points = map_cars.dropna(subset=["lat", "lon"]).drop_duplicates(subset="city")

fig, ax = plt.subplots(figsize=(10, 6))

india.plot(ax=ax, color='lightblue', edgecolor='gray')
city_points.plot(ax=ax, markersize=40)

# Offset each label slightly so it does not sit on top of its marker.
for x, y, label in zip(city_points.geometry.x, city_points.geometry.y, city_points["city"]):
    ax.text(x + 0.3, y + 0.3, label, fontsize=7)

ax.set_title("Cities with used-car listings")
plt.tight_layout()
plt.show()

#### Top Five Car Brands in Each City (interactive)

In [8]:
# Count brands per city.
# The brand is the first word of the listing name, title-cased
# (e.g. "maruti swift" -> "Maruti").
map_cars["brand"] = map_cars["car_name"].astype("str").str.split(" ").str[0].str.title()

# Number of listings per (city, brand) pair.
brand_counts = map_cars.groupby(["city", "brand"]).size()

# Within each city keep the five largest counts, then collect the brand names
# into one list per city (largest count first).
top5_list = (
    brand_counts.groupby(level=0, group_keys=False)
    .nlargest(5).reset_index(name="count").groupby("city")["brand"].agg(list)
    .reset_index().rename(columns={"brand": "top5"})
)

# Drop any "top5" column left by an earlier run of this cell. Without this, a
# re-run would produce top5_x / top5_y and later row["top5"] lookups would fail.
map_cars = (
    map_cars.drop(columns="top5", errors="ignore")
    .merge(top5_list, on="city", how="left")
)

In [9]:
# One marker per city. map_cars holds one row per listing, so scattering it
# directly draws every listing (and a text label for each) on top of its city,
# which also makes hover picking slow.
city_points = (
    map_cars.dropna(subset=["lat", "lon"])
    .drop_duplicates(subset="city")
    .reset_index(drop=True)
)

fig, ax = plt.subplots(figsize=(10, 6))

india.plot(ax=ax, color='lightblue', edgecolor='gray')
points = ax.scatter(city_points.geometry.x, city_points.geometry.y, s=40)

# Offset each label slightly so it does not sit on top of its marker.
for x, y, label in zip(city_points.geometry.x, city_points.geometry.y, city_points["city"]):
    ax.text(x + 0.3, y + 0.3, label, fontsize=7)

cursor = mplcursors.cursor(points, hover=True)
cursor.annotation_kwargs["animated"] = False

# An attempt to reduce tooltip lag
for c in ax.collections:
    if c is not points:
        c.set_picker(False)


@cursor.connect("add")
def on_add(sel):
    """Fill the hover annotation with the city name and its top car brands."""
    # sel.index is the position of the hovered marker within the scatter,
    # which was drawn from city_points in row order.
    row = city_points.iloc[sel.index]

    brands = row["top5"]
    if isinstance(brands, list):
        brands_text = "\n".join(f"{j+1}. {b}" for j, b in enumerate(brands[:5]))
    else:
        # No list available for this city (e.g. NaN from the left merge).
        brands_text = brands

    sel.annotation.set_text(
        f"{row['city']}\n\nTop Car Brands:\n{brands_text}"
    )


plt.tight_layout()
plt.show(block=False)

In [None]:
fig, ax = plt.subplots(figsize=(10, 6))

india.plot(ax=ax, color='lightblue', edgecolor='gray')
map_cars.plot(ax=ax, markersize=40)
# GeoDataFrame.plot returns the Axes, not the drawn markers; the point
# collection it just added is the most recent one on the Axes.
points = ax.collections[-1]

cursor = mplcursors.cursor(points, hover=True)
cursor.annotation_kwargs["animated"] = False

# Leave only the point markers pickable.
for c in ax.collections:
  if c is not points:
    c.set_picker(False)

@cursor.connect("add")
def on_add(sel):
  # NOTE(review): this handler is unfinished; it only reads the hovered
  # marker's index and does not set any annotation text yet.
  i = sel.index

In [None]:
# TODO: Possibly add an additional map showing the highest average sale price, the most popular transmission type, or other factors.