In [None]:
# importing major libraries

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import cufflinks as cf
# NOTE(review): init_notebook_mode / download_plotlyjs are not called in the cells visible here
from plotly.offline import init_notebook_mode,download_plotlyjs
# put cufflinks into offline plotting mode; connected=True presumably loads
# plotly.js from a CDN instead of the local package — TODO confirm against cufflinks docs
cf.go_offline(connected=True)

# importing additional libraries
import warnings
# NOTE(review): this silences every warning for the rest of the session, including
# pandas FutureWarning/DeprecationWarning messages that can point at real data problems
warnings.filterwarnings('ignore')

In [None]:
# load the raw listings; the path is relative to the notebook's working directory
df = pd.read_csv('JMG_data.csv')

# about the dataset
This dataset contains records of vehicle listings from Craigslist. It includes almost all of the relevant information that Craigslist provides on car sales, such as price, condition, manufacturer, and other attributes.

In [None]:
# list the column labels of the loaded frame
df.columns

Index(['CarID', 'Listed_Price', 'Listed_Date', 'Make', 'Model', 'Year',
       'Vehicle_Type', 'Size', 'Color', 'Transmission', 'Fuel_Type', 'Drive',
       'Cylinders', 'Odometer', 'Condition', 'Title_Status', 'State',
       'Region'],
      dtype='object')

### Column Descriptions

- **CarID**: An automatically generated identifier column.  
- **Listed_Price**: The listed price of the vehicle in USD.  
- **Listed_Date**: The date when the vehicle listing was posted.  
- **Make**: The name of the vehicle’s manufacturer.  
- **Model**: The specific model name of the vehicle.  
- **Year**: The build year of the vehicle.  
- **Vehicle_Type**: The classification of the vehicle.  
- **Size**: The size category of the vehicle.  
- **Color**: The exterior colour of the vehicle.  
- **Transmission**: The type of transmission (e.g., automatic, manual).  
- **Fuel_Type**: The type of energy the vehicle uses (e.g., gas, diesel, hybrid, electric).  
- **Drive**: The drivetrain configuration (e.g., FWD, RWD, AWD).  
- **Cylinders**: The number of engine cylinders.  
- **Odometer**: The vehicle’s mileage in miles.  
- **Condition**: The condition of the vehicle provided by the seller.  
- **Title_Status**: The legal status of the vehicle’s title.  
- **State**: The state abbreviation where the vehicle is listed.  
- **Region**: The region where the vehicle is listed for sale.  

In [None]:
# overview of data: preview the first rows (head() shows 5 by default)

df.head()

Unnamed: 0,CarID,Listed_Price,Listed_Date,Make,Model,Year,Vehicle_Type,Size,Color,Transmission,Fuel_Type,Drive,Cylinders,Odometer,Condition,Title_Status,State,Region
0,1246014,13987,2021-04-24T09:23:15-0500,chevrolet,silverado 1500,2007,truck,mid-size,red,automatic,gas,rwd,8.0,112709,excellent,clean,wi,appleton-oshkosh-FDL
1,692370,2800,2021-04-15T22:07:43-0400,toyota,4runner,1999,SUV,full-size,silver,automatic,gas,rwd,6.0,297053,good,clean,nc,asheville
2,242958,1750,2021-05-02T13:02:12-0600,ford,escape xlt awd,2003,SUV,mid-size,grey,automatic,gas,4wd,6.0,142500,good,clean,co,pueblo
3,310455,9200,2021-04-12T12:20:38-0400,dodge,journey,2015,SUV,full-size,white,automatic,gas,fwd,6.0,111000,excellent,clean,fl,ocala
4,800040,9900,2021-04-30T13:36:13-0400,toyota,rav4 awd,2010,SUV,mid-size,grey,automatic,gas,4wd,4.0,112000,good,clean,ny,buffalo


In [None]:
# converting Listed_Date into datetime
# The raw strings carry different UTC offsets (e.g. -0400, -0500, -0600).
# Without utc=True pandas cannot build one datetime64 column from mixed offsets:
# older versions silently fall back to object dtype and newer ones warn or raise.
# Normalising to UTC yields a single datetime64[ns, UTC] column, so the .dt
# accessor works. Note that the values are then UTC instants, not the seller's
# local wall-clock time.
df['Listed_Date'] = pd.to_datetime(df['Listed_Date'], utc=True)

In [None]:
# summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
df.describe()

Unnamed: 0,CarID,Listed_Price,Year,Cylinders,Odometer
count,62946.0,62946.0,62946.0,62572.0,62946.0
mean,644743.5,12972.07041,2008.470642,5.964505,125021.3
std,360798.2,11740.560353,9.892727,1.651513,242481.2
min,93.0,0.0,1900.0,3.0,0.0
25%,336657.8,4995.0,2006.0,4.0,74042.5
50%,648472.5,9000.0,2011.0,6.0,114917.0
75%,941618.2,17500.0,2014.0,8.0,156000.0
max,1280508.0,75000.0,2022.0,12.0,10000000.0



### **Interpretation & Insights**

- **Listed_Price**  
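  The average listed price is around 13,000 USD, while the median is 9,000 USD, so the distribution is skewed towards higher prices. The middle half of the listings are priced between roughly 5,000 and 17,500. There are **entries with a price of 0 or as high as 75,000, indicating possible outliers** or data entry issues.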

- **Year**  
  The vehicle manufacturing years range from 1900 to 2022. The **lower extreme (1900) is likely an error.** The middle half of the vehicles were built between 2006 and 2014, with the median year being 2011.

- **Cylinders**  
  Most cars have between 4 and 8 cylinders. Some listings have **unusual values like 3 or 12 cylinders.** A few values are missing, which may need to be handled during data preprocessing.

- **Odometer**  
  The mileage values **range from 0 to 10 million miles.** The median mileage is approximately 115,000 miles. **Extremely high values suggest the presence of outliers.** Entries with 0 mileage likely represent missing or incorrect data.

In [None]:
# structural overview: per-column dtype and non-null count

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 62946 entries, 0 to 62945
Data columns (total 18 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   CarID         62946 non-null  int64  
 1   Listed_Price  62946 non-null  int64  
 2   Listed_Date   62946 non-null  object 
 3   Make          62946 non-null  object 
 4   Model         62946 non-null  object 
 5   Year          62946 non-null  int64  
 6   Vehicle_Type  62946 non-null  object 
 7   Size          62946 non-null  object 
 8   Color         62946 non-null  object 
 9   Transmission  62946 non-null  object 
 10  Fuel_Type     62946 non-null  object 
 11  Drive         62946 non-null  object 
 12  Cylinders     62572 non-null  float64
 13  Odometer      62946 non-null  int64  
 14  Condition     62946 non-null  object 
 15  Title_Status  62946 non-null  object 
 16  State         62946 non-null  object 
 17  Region        62645 non-null  object 
dtypes: float64(1), int64(4), o

### **Interpretation & Insights**

- **Dataset Size and Shape**  
  The dataset contains **62,946 records** and **18 columns**.

- **Data Types**  
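  - Majority of the columns are of **object (categorical)** type (13 out of 18), indicating the need for **categorical encoding** before model training. This count includes `Listed_Date`, which holds timestamps and should be handled as a date rather than encoded as a category.
  - Numerical columns include: `Listed_Price`, `Year`, `Cylinders`, and `Odometer`. `CarID` is also stored as an integer, but it is an identifier rather than a measurement.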

- **Missing Values**  
  - `Cylinders` has **374 missing entries**, which could be imputed using median/mode or predicted based on other vehicle specs.
  - `Region` has **301 missing entries**, which might be recoverable from state or could be labeled as 'Unknown'.
