In [3]:
import pandas as pd
import numpy as np
df=pd.read_csv('./auto-mpg.csv')
df['horsepower'].replace('?',np.nan,inplace=True)
df.dropna(subset=['horsepower'],axis=0,inplace=True)
df['horsepower']=df['horsepower'].astype('float')
print(df.head(20))

     mpg  cylinders  displacement  horsepower  weight  acceleration  \
0   18.0          8         307.0       130.0    3504          12.0   
1   15.0          8         350.0       165.0    3693          11.5   
2   18.0          8         318.0       150.0    3436          11.0   
3   16.0          8         304.0       150.0    3433          12.0   
4   17.0          8         302.0       140.0    3449          10.5   
5   15.0          8         429.0       198.0    4341          10.0   
6   14.0          8         454.0       220.0    4354           9.0   
7   14.0          8         440.0       215.0    4312           8.5   
8   14.0          8         455.0       225.0    4425          10.0   
9   15.0          8         390.0       190.0    3850           8.5   
10  15.0          8         383.0       170.0    3563          10.0   
11  14.0          8         340.0       160.0    3609           8.0   
12  15.0          8         400.0       150.0    3761           9.5   
13  14

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['horsepower'].replace('?',np.nan,inplace=True)


In [4]:
count,bin_dividers=np.histogram(df['horsepower'],bins=3)
print(bin_dividers)

[ 46.         107.33333333 168.66666667 230.        ]


In [6]:
bin_names=['Low output','Normal output','High output']
df['hp_bin']=pd.cut(x=df['horsepower'],
                    bins=bin_dividers,
                    labels=bin_names,
                    include_lowest=True)
print(df[['horsepower','hp_bin']].head(15))

    horsepower         hp_bin
0        130.0  Normal output
1        165.0  Normal output
2        150.0  Normal output
3        150.0  Normal output
4        140.0  Normal output
5        198.0    High output
6        220.0    High output
7        215.0    High output
8        225.0    High output
9        190.0    High output
10       170.0    High output
11       160.0  Normal output
12       150.0  Normal output
13       225.0    High output
14        95.0     Low output


In [7]:
df['hp_bin']=pd.cut(x=df['horsepower'],
                    bins=bin_dividers,
                    labels=bin_names,
                    include_lowest=True)
horsepower_dummies=pd.get_dummies(df['hp_bin'])
print(horsepower_dummies.head(15))

    Low output  Normal output  High output
0        False           True        False
1        False           True        False
2        False           True        False
3        False           True        False
4        False           True        False
5        False          False         True
6        False          False         True
7        False          False         True
8        False          False         True
9        False          False         True
10       False          False         True
11       False           True        False
12       False           True        False
13       False          False         True
14        True          False        False


In [8]:
count,bin_dividers=np.histogram(df['weight'],bins=2)
print(bin_dividers)

[1613.  3376.5 5140. ]


In [None]:
bin_names = ["Light", "Heavy"]   

df['hp_binn'] = pd.cut(x=df['weight'],
                       bins=2,
                       labels=bin_names,
                       include_lowest=True)

horsepower_dummies = pd.get_dummies(df['hp_binn'])
print(horsepower_dummies.head(15))



    Light  Heavy
0   False   True
1   False   True
2   False   True
3   False   True
4   False   True
5   False   True
6   False   True
7   False   True
8   False   True
9   False   True
10  False   True
11  False   True
12  False   True
13   True  False
14   True  False


In [None]:
#normalization: max absolute scaling ie divide all values by the max value
print(df.horsepower.describe())
print("\n")
df.horsepower = df.horsepower / abs(df.horsepower.max())
print(df.horsepower.head())
print("\n")
print(df.horsepower.describe())

count    392.000000
mean     104.469388
std       38.491160
min       46.000000
25%       75.000000
50%       93.500000
75%      126.000000
max      230.000000
Name: horsepower, dtype: float64


0    0.565217
1    0.717391
2    0.652174
3    0.652174
4    0.608696
Name: horsepower, dtype: float64


count    392.000000
mean       0.454215
std        0.167353
min        0.200000
25%        0.326087
50%        0.406522
75%        0.547826
max        1.000000
Name: horsepower, dtype: float64


In [14]:
#minmax normalization
print(df.horsepower.describe())
print('\n')
min_x=df.horsepower - df.horsepower.min()
min_max=df.horsepower.max()-df.horsepower.min()
df.horsepower=min_x/min_max
print(df.horsepower.head())
print("\n")
print(df.horsepower.describe())

count    392.000000
mean       0.454215
std        0.167353
min        0.200000
25%        0.326087
50%        0.406522
75%        0.547826
max        1.000000
Name: horsepower, dtype: float64


0    0.456522
1    0.646739
2    0.565217
3    0.565217
4    0.510870
Name: horsepower, dtype: float64


count    392.000000
mean       0.317768
std        0.209191
min        0.000000
25%        0.157609
50%        0.258152
75%        0.434783
max        1.000000
Name: horsepower, dtype: float64
