In [1]:
import sklearn
import pandas as pd
import numpy as np

import datetime


In [2]:
# Record the scikit-learn version this notebook was executed with.
print(sklearn.__version__)

0.23.2


In [3]:
# Record the pandas version this notebook was executed with.
print(pd.__version__)

1.2.4


In [4]:
# Record the NumPy version this notebook was executed with.
print(np.__version__)

1.19.5


In [5]:
# Read the car dataset from the CSV stored under the relative datasets/ folder.
automobile_df = pd.read_csv(filepath_or_buffer='datasets/cars.csv')
# Show the first five rows as a quick sanity check of the raw data.
automobile_df.head(n=5)

Unnamed: 0,Model,MPG,Cylinders,Displacement,Horsepower,Weight,Acceleration,Year,Origin,bore,stroke,compression-ratio
0,chevrolet chevelle malibu,18.0,8,307,130,3504,12.0,1970[1975],"US; Detroit, Michigan",3.47,2.68,9
1,buick skylark 320,15.0,8,350,165,3693,11.5,1970,US],3.47,2.68,?
2,plymouth satellite,18.0,?,318,150,3436,11.0,"1970, 1976",US,2.68,3.47,9
3,amc rebel sst,16.0,8,304,150,3433,12.0,1970,US,3.19,3.4,10
4,ford torino,17.0,8,302,140,3449,10.5,1970-1972,US],3.19,3.4,8


In [6]:
# (rows, columns) of the freshly loaded frame.
automobile_df.shape

(394, 12)

In [7]:
# The raw file marks missing entries with a literal '?'; turn those into NaN
# so that pandas' missing-data tools (isna / fillna / dropna) can see them.
automobile_df = automobile_df.replace(to_replace='?', value=np.nan)
automobile_df.head(n=5)

Unnamed: 0,Model,MPG,Cylinders,Displacement,Horsepower,Weight,Acceleration,Year,Origin,bore,stroke,compression-ratio
0,chevrolet chevelle malibu,18.0,8.0,307,130,3504,12.0,1970[1975],"US; Detroit, Michigan",3.47,2.68,9.0
1,buick skylark 320,15.0,8.0,350,165,3693,11.5,1970,US],3.47,2.68,
2,plymouth satellite,18.0,,318,150,3436,11.0,"1970, 1976",US,2.68,3.47,9.0
3,amc rebel sst,16.0,8.0,304,150,3433,12.0,1970,US,3.19,3.4,10.0
4,ford torino,17.0,8.0,302,140,3449,10.5,1970-1972,US],3.19,3.4,8.0


In [8]:
# Number of missing (NaN) values in each column.
automobile_df.isna().sum()

Model                0
MPG                  9
Cylinders            2
Displacement         1
Horsepower           0
Weight               1
Acceleration         1
Year                 0
Origin               0
bore                 0
stroke               0
compression-ratio    2
dtype: int64

In [9]:
# Fill the missing MPG values with the column mean (computed over the
# non-missing entries), then re-check the per-column NaN counts.
mpg_mean = automobile_df['MPG'].mean()
automobile_df['MPG'] = automobile_df['MPG'].fillna(mpg_mean)
automobile_df.isna().sum()

Model                0
MPG                  0
Cylinders            2
Displacement         1
Horsepower           0
Weight               1
Acceleration         1
Year                 0
Origin               0
bore                 0
stroke               0
compression-ratio    2
dtype: int64

We can drop the records that contain NaN (Not a Number) values when only a few records have missing data.

In [10]:
# Discard every row that still has at least one missing value.
automobile_df = automobile_df.dropna(axis=0, how='any')

In [11]:
# (rows, columns) after the rows containing NaN were dropped.
automobile_df.shape

(387, 12)

In [12]:
# Verify that no column has missing values left (isna is the same check as
# isnull and matches the spelling used earlier in this notebook).
automobile_df.isna().sum()

Model                0
MPG                  0
Cylinders            0
Displacement         0
Horsepower           0
Weight               0
Acceleration         0
Year                 0
Origin               0
bore                 0
stroke               0
compression-ratio    0
dtype: int64

The dataset has no missing values now.

In [13]:
# Drop the Model column. Rebinding the result (instead of inplace=True) keeps
# the step chainable and avoids mutating a frame earlier cells displayed.
automobile_df = automobile_df.drop(columns=['Model'])

In [14]:
# Preview five random rows; the fixed random_state makes the preview
# reproducible across "Restart & Run All".
automobile_df.sample(5, random_state=42)

Unnamed: 0,MPG,Cylinders,Displacement,Horsepower,Weight,Acceleration,Year,Origin,bore,stroke,compression-ratio
157,14.0,8,351,148,4657,13.5,1975,US,3.19,3.03,9.0
230,16.0,8,351,149,4335,14.5,1977,US,3.58,2.87,8.8
373,31.0,4,91,68,1970,17.6,1982,Japan,2.97,3.23,9.4
346,34.7,4,105,63,2215,14.9,1981,US,3.03,3.15,9.0
223,17.5,6,250,110,3520,16.4,1977,US,3.78,3.15,9.5


In [15]:
# Drop the bore, stroke and compression-ratio columns. Rebinding the result
# (instead of inplace=True) keeps the step chainable and side-effect free.
automobile_df = automobile_df.drop(columns=['bore', 'stroke', 'compression-ratio'])
automobile_df.head()

Unnamed: 0,MPG,Cylinders,Displacement,Horsepower,Weight,Acceleration,Year,Origin
0,18.0,8,307,130,3504,12.0,1970[1975],"US; Detroit, Michigan"
3,16.0,8,304,150,3433,12.0,1970,US
4,17.0,8,302,140,3449,10.5,1970-1972,US]
6,14.0,8,454,220,4354,9.0,1970[1975],"US; Detroit, Michigan"
7,23.551429,8,440,215,4312,8.5,1970,US


Removed the unimportant columns from the data.

In [16]:
# Count how many Year entries are purely numeric strings (True) versus not (False).
automobile_df['Year'].str.isnumeric().value_counts()

True     351
False     36
Name: Year, dtype: int64

In [17]:
# List the Year entries that are not purely numeric strings.
automobile_df.loc[automobile_df['Year'].str.isnumeric() == False, 'Year']

0          1970[1975]
4           1970-1972
6          1970[1975]
30      1971[1973]971
35         1971[1973]
40         1971[1973]
44         1971[1973]
49         1971[1973]
56         1972[1973]
61         1972[1973]
65         1972[1973]
68         1972[1973]
73         1972[1973]
75            1972-73
79            1972-73
82            1972-73
96          1973-1974
97         1973, 1974
102    1973, 19741973
105        1973, 1974
108        1973, 1974
112        1973, 1974
115        1973, 1974
126        1974, 1975
131        1974, 1975
134        1974, 1975
137        1974, 1975
139    1974, 19751974
142        1974, 1975
365         1982-1985
374         1982-1985
380         1982-1985
383         1982-1985
387         1982-1985
389         1982-1985
391         1982-1985
Name: Year, dtype: object

Because the Year column has object (string) dtype, we can extract the year with a regular expression.

In [18]:
# Capture the first run of four digits in each Year string, so entries such
# as '1970[1975]' or '1972-73' collapse to their leading year.
extract = automobile_df['Year'].str.extract(pat=r'(\d{4})', expand=False)
extract.head(n=5)

0    1970
3    1970
4    1970
6    1970
7    1970
Name: Year, dtype: object

In [19]:
# Display the extracted years (pandas abbreviates the middle of the Series).
extract

0      1970
3      1970
4      1970
6      1970
7      1970
       ... 
389    1982
390    1982
391    1982
392    1982
393    1982
Name: Year, Length: 387, dtype: object

In [20]:
# Convert the extracted year strings to numbers and store them back in the
# Year column (assign keeps the column in its existing position).
automobile_df = automobile_df.assign(Year=pd.to_numeric(extract))
# Confirm the resulting dtype.
automobile_df['Year'].dtype

dtype('int64')

In [21]:
# Preview the frame after Year was replaced by its numeric value.
automobile_df.head()

Unnamed: 0,MPG,Cylinders,Displacement,Horsepower,Weight,Acceleration,Year,Origin
0,18.0,8,307,130,3504,12.0,1970,"US; Detroit, Michigan"
3,16.0,8,304,150,3433,12.0,1970,US
4,17.0,8,302,140,3449,10.5,1970,US]
6,14.0,8,454,220,4354,9.0,1970,"US; Detroit, Michigan"
7,23.551429,8,440,215,4312,8.5,1970,US


In [22]:
# Derive Age (in years) from the model year and drop the now-redundant Year
# column in one chained, non-mutating step.
# NOTE: Age is measured against the wall-clock year, so its values shift
# depending on when the notebook is executed.
current_year = datetime.datetime.now().year
automobile_df = (
    automobile_df
    .assign(Age=current_year - automobile_df['Year'])
    .drop(columns=['Year'])
)
# Fixed random_state keeps the five-row preview reproducible between runs.
automobile_df.sample(5, random_state=42)

Unnamed: 0,MPG,Cylinders,Displacement,Horsepower,Weight,Acceleration,Origin,Age
178,25.0,4,121,115,2671,13.5,Europe,46
106,18.0,6,232,100,2789,15.0,US,48
379,38.0,4,91,67,1965,15.0,Japan,39
31,25.0,4,113,95,2228,14.0,Japan; Aichi],50
136,14.0,8,318,150,4457,13.5,US,47


In [23]:
# Inspect the dtype of every column after the cleaning steps so far.
automobile_df.dtypes

MPG             float64
Cylinders        object
Displacement     object
Horsepower        int64
Weight           object
Acceleration     object
Origin           object
Age               int64
dtype: object

In [24]:
# NOTE: this cell previously contained only a stray bare name `f`, which raised
# NameError and halted "Run All"; it has been replaced by this placeholder.