## Predict the mileage of a car

### Fetch the data

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# Load the car dataset; the path is relative to the notebook's working directory.
df = pd.read_csv("mpg.csv")

In [3]:
# Preview the first five rows to sanity-check the column names and values.
df.head()

Unnamed: 0,MPG,Cylinders,Engine Disp,Horsepower,Weight,Accelerate,Year,Origin
0,15.0,8,390.0,190,3850,8.5,70,American
1,21.0,6,199.0,90,2648,15.0,70,American
2,18.0,6,199.0,97,2774,15.5,70,American
3,16.0,8,304.0,150,3433,12.0,70,American
4,14.0,8,455.0,225,3086,10.0,70,American


### Data cleaning (inspect size, dtypes, and missing values)

In [4]:
# Size of the dataset as a (rows, columns) tuple.
df.shape

(392, 8)

In [5]:
# Per-column dtype and non-null count; a non-null count equal to the number
# of rows means the column has no missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 392 entries, 0 to 391
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   MPG          392 non-null    float64
 1   Cylinders    392 non-null    int64  
 2   Engine Disp  392 non-null    float64
 3   Horsepower   392 non-null    int64  
 4   Weight       392 non-null    int64  
 5   Accelerate   392 non-null    float64
 6   Year         392 non-null    int64  
 7   Origin       392 non-null    object 
dtypes: float64(3), int64(4), object(1)
memory usage: 24.6+ KB


### Data analysis

In [6]:
# Pairwise correlation between the numeric columns. numeric_only=True leaves
# out Origin, which is stored as strings (object dtype) and cannot be correlated.
df.corr(numeric_only=True)

Unnamed: 0,MPG,Cylinders,Engine Disp,Horsepower,Weight,Accelerate,Year
MPG,1.0,-0.777618,-0.805127,-0.778427,-0.832244,0.423329,0.580541
Cylinders,-0.777618,1.0,0.950823,0.842983,0.897527,-0.504683,-0.345647
Engine Disp,-0.805127,0.950823,1.0,0.897257,0.932994,-0.5438,-0.369855
Horsepower,-0.778427,0.842983,0.897257,1.0,0.864538,-0.689196,-0.416361
Weight,-0.832244,0.897527,0.932994,0.864538,1.0,-0.416839,-0.30912
Accelerate,0.423329,-0.504683,-0.5438,-0.689196,-0.416839,1.0,0.290316
Year,0.580541,-0.345647,-0.369855,-0.416361,-0.30912,0.290316,1.0


### Identify the input and output

In [7]:
# Inputs: every column except the target (MPG) and the string-valued Origin column.
x = df.drop(["MPG", "Origin"], axis="columns")
# Output: the MPG column the model will learn to predict.
y = df["MPG"]

In [8]:
# Hold out 30% of the rows for testing. The fixed random_state makes the split
# repeatable, and stratifying on Origin keeps the share of each origin the
# same in the training and test sets.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=42,
    stratify=df["Origin"],
)

In [9]:
# Peek at the training targets; the index keeps the row labels from df,
# now in shuffled order.
y_train.head()

38     14.0
159    14.0
48     30.0
117    24.0
12     10.0
Name: MPG, dtype: float64

In [10]:
# Peek at the training inputs; MPG and Origin are no longer among the columns.
X_train.head()

Unnamed: 0,Cylinders,Engine Disp,Horsepower,Weight,Accelerate,Year
38,8,351.0,153,4154,13.5,71
159,8,351.0,148,4657,13.5,75
48,4,88.0,76,2065,14.5,71
117,4,121.0,110,2660,14.0,73
12,8,360.0,215,4615,14.0,70


### Choose the algorithm

In [11]:
from sklearn.linear_model import LinearRegression

In [12]:
# Create a scikit-learn LinearRegression estimator with its default parameters.
model = LinearRegression()

### Train the model

In [13]:
# Fit on the training split only; the test split is kept aside for evaluation.
model.fit(X_train,y_train)

### Evaluate the model (R² and error metrics)

In [14]:
# Score on the training split. For a regressor, score() is the R² coefficient
# of determination, not a classification accuracy.
model.score(X_train,y_train)

0.819647986733614

In [15]:
# Score (R²) on the held-out test split; compare with the training score to
# judge how well the fit generalises to unseen rows.
model.score(X_test,y_test)

0.7743599795025149

In [16]:
from sklearn.metrics import mean_absolute_percentage_error as mape
from sklearn.metrics import mean_absolute_error as mae
from sklearn.metrics import median_absolute_error as mde
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import r2_score as r2

In [17]:
# Model predictions for the held-out inputs ...
predicted = model.predict(X_test)
# ... and the true MPG values they are compared against.
original = y_test

In [18]:
# Report each evaluation metric for the test-set predictions, one per line.
# Every metric function is called as metric(true_values, predictions).
for label, metric in (
    ("mape = ", mape),
    ("mae = ", mae),
    ("mde = ", mde),
    ("mse = ", mse),
    ("r2 = ", r2),
):
    print(label, metric(original, predicted))

mape =  0.13157258139223782
mae =  2.9054530894936423
mde =  2.5249527033431
mse =  14.088207073301922
r2 =  0.7743599795025149


In [19]:
# Show the first five rows of df again; df has not been reassigned or modified
# since it was loaded, so this matches the preview at the top of the notebook.
df.head()

Unnamed: 0,MPG,Cylinders,Engine Disp,Horsepower,Weight,Accelerate,Year,Origin
0,15.0,8,390.0,190,3850,8.5,70,American
1,21.0,6,199.0,90,2648,15.0,70,American
2,18.0,6,199.0,97,2774,15.5,70,American
3,16.0,8,304.0,150,3433,12.0,70,American
4,14.0,8,455.0,225,3086,10.0,70,American
