# Market Research
## Session 5 - Multiple Linear Regression


In [18]:
# Python imports
import pandas as pd
import numpy as np

## Load data

In [19]:
# Load the used-car listings from the CSV file (path is relative to the notebook).
used_cars_data = pd.read_csv("used_cars_data.csv")

# Preview the first five rows.
print(used_cars_data.head())

# Summarise columns, non-null counts and dtypes.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() -- doing so appends a stray "None" line.
used_cars_data.info()

   S.No.                              Name    Location  ...  Seats  New_Price  Price
0      0            Maruti Wagon R LXI CNG      Mumbai  ...    5.0        NaN   1.75
1      1  Hyundai Creta 1.6 CRDi SX Option        Pune  ...    5.0        NaN  12.50
2      2                      Honda Jazz V     Chennai  ...    5.0  8.61 Lakh   4.50
3      3                 Maruti Ertiga VDI     Chennai  ...    7.0        NaN   6.00
4      4   Audi A4 New 2.0 TDI Multitronic  Coimbatore  ...    5.0        NaN  17.74

[5 rows x 14 columns]
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7253 entries, 0 to 7252
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   S.No.              7253 non-null   int64  
 1   Name               7253 non-null   object 
 2   Location           7253 non-null   object 
 3   Year               7253 non-null   int64  
 4   Kilometers_Driven  7253 non-null   int64  
 5   Fuel_Type          

In [20]:
# Print each column name on its own line.
for col in used_cars_data.columns:
    print(col)

# Shape of the data frame as (rows, columns). Only the last expression of a
# cell is rendered automatically, so the shape has to be printed explicitly;
# as a bare mid-cell expression its value was silently discarded.
print(used_cars_data.shape)

# Data types of the columns (last expression, rendered by the notebook).
used_cars_data.dtypes

S.No.
Name
Location
Year
Kilometers_Driven
Fuel_Type
Transmission
Owner_Type
Mileage
Engine
Power
Seats
New_Price
Price


S.No.                  int64
Name                  object
Location              object
Year                   int64
Kilometers_Driven      int64
Fuel_Type             object
Transmission          object
Owner_Type            object
Mileage               object
Engine                object
Power                 object
Seats                float64
New_Price             object
Price                float64
dtype: object

### Check for Null values

In [21]:
# Number of missing values in each column (isna is the alias of isnull).
print(used_cars_data.isna().sum())

S.No.                   0
Name                    0
Location                0
Year                    0
Kilometers_Driven       0
Fuel_Type               0
Transmission            0
Owner_Type              0
Mileage                 2
Engine                 46
Power                  46
Seats                  53
New_Price            6247
Price                1234
dtype: int64


The results above show that the following columns contain null values and need cleaning before they can be used in the regression:

- Mileage: 2
- Engine: 46
- Power: 46
- Seats: 53
- New_Price: 6247
- Price: 1234


### Data Validation - Total Cells vs Missing %

In [23]:
# Share of missing cells across the whole data frame.
missing_count = used_cars_data.isnull().sum()  # missing values per column
# Total number of cells (rows x columns). np.prod is used because the
# np.product alias was deprecated and then removed in NumPy 2.0.
total_cells = np.prod(used_cars_data.shape)
total_missing = missing_count.sum()
missing_percent = (total_missing*100)/total_cells

print('Total : ', total_cells)
print('Total missing : ', total_missing)
print('Missing Percentage: ', missing_percent, '%')

Total :  101542
Total missing :  7628
Missing Percentage:  7.512162454944752 %


### Descriptive statistics

In [16]:
# Summary statistics for every column (numeric and non-numeric alike),
# flipped so that each original column becomes one row of the table.
used_cars_data.describe(include="all").T

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
S.No.,7253.0,,,,3626.0,2093.905084,0.0,1813.0,3626.0,5439.0,7252.0
Name,7253.0,2041.0,Mahindra XUV500 W8 2WD,55.0,,,,,,,
Location,7253.0,11.0,Mumbai,949.0,,,,,,,
Year,7253.0,,,,2013.365366,3.254421,1996.0,2011.0,2014.0,2016.0,2019.0
Kilometers_Driven,7253.0,,,,58699.063146,84427.720583,171.0,34000.0,53416.0,73000.0,6500000.0
Fuel_Type,7253.0,5.0,Diesel,3852.0,,,,,,,
Transmission,7253.0,2.0,Manual,5204.0,,,,,,,
Owner_Type,7253.0,4.0,First,5952.0,,,,,,,
Mileage,7251.0,450.0,17.0 kmpl,207.0,,,,,,,
Engine,7207.0,150.0,1197 CC,732.0,,,,,,,


### Data cleaning column by column

- **Name**            
- Location              
- Year                   
- Kilometers_Driven      
- Fuel_Type             
- Transmission          
- Owner_Type            
- Mileage               
- Engine                
- Power                 
- Seats                
- New_Price             
- Price                


In [11]:
# Preview the raw Name column. A bare expression is only rendered when it is
# the last statement of a cell, so the preview is printed explicitly here.
print(used_cars_data['Name'].head())

# The first word of each Name is the brand, so extract it into its own column.

# Function to extract the brand from the name
def extract_brand(name):
    """Return the brand, i.e. the first whitespace-separated word of ``name``.

    Parameters
    ----------
    name : str
        Full car name, e.g. ``"Maruti Wagon R LXI CNG"``.

    Returns
    -------
    str or None
        The first word of ``name``; ``None`` when ``name`` is not a string
        (for example a missing value) or contains no word at all.
    """
    # Non-string cells (NaN/None) have no .split(); report them as missing
    # instead of raising AttributeError.
    if not isinstance(name, str):
        return None
    words = name.split()
    # An empty or whitespace-only name would make words[0] raise IndexError.
    return words[0] if words else None

# Build the 'Brand' column by running extract_brand over every 'Name' value,
# then show the first rows to confirm the new column is present.
used_cars_data['Brand'] = used_cars_data['Name'].map(extract_brand)
print(used_cars_data.head())

   S.No.                              Name  ...  Price    Brand
0      0            Maruti Wagon R LXI CNG  ...   1.75   Maruti
1      1  Hyundai Creta 1.6 CRDi SX Option  ...  12.50  Hyundai
2      2                      Honda Jazz V  ...   4.50    Honda
3      3                 Maruti Ertiga VDI  ...   6.00   Maruti
4      4   Audi A4 New 2.0 TDI Multitronic  ...  17.74     Audi

[5 rows x 15 columns]


### Data cleaning column by column

- ~~Name~~            
- **Location**             
- Year                   
- Kilometers_Driven      
- Fuel_Type             
- Transmission          
- Owner_Type            
- Mileage               
- Engine                
- Power                 
- Seats                
- New_Price             
- Price   

In [17]:
# Show the Location column (pandas truncates the rendering of long Series).
used_cars_data.loc[:, 'Location']

0           Mumbai
1             Pune
2          Chennai
3          Chennai
4       Coimbatore
           ...    
7248     Hyderabad
7249        Mumbai
7250       Kolkata
7251          Pune
7252         Kochi
Name: Location, Length: 7253, dtype: object

### Data cleaning column by column

- ~~Name~~            
- **Location**             
- Year                   
- Kilometers_Driven      
- Fuel_Type             
- Transmission          
- Owner_Type            
- Mileage               
- Engine                
- Power                 
- Seats                
- New_Price             
- Price   