## Load the data

In [36]:
import pandas as pd
data = pd.read_csv('Cars93.csv')    # is a dataframe
data.keys()

Index(['Unnamed: 0', 'Manufacturer', 'Model', 'Type', 'Min.Price', 'Price',
       'Max.Price', 'MPG.city', 'MPG.highway', 'AirBags', 'DriveTrain',
       'Cylinders', 'EngineSize', 'Horsepower', 'RPM', 'Rev.per.mile',
       'Man.trans.avail', 'Fuel.tank.capacity', 'Passengers', 'Length',
       'Wheelbase', 'Width', 'Turn.circle', 'Rear.seat.room', 'Luggage.room',
       'Weight', 'Origin', 'Make'],
      dtype='object')

## Any missing data?

In [37]:
data.isnull().sum()

Unnamed: 0             0
Manufacturer           0
Model                  0
Type                   0
Min.Price              0
Price                  0
Max.Price              0
MPG.city               0
MPG.highway            0
AirBags                0
DriveTrain             0
Cylinders              0
EngineSize             0
Horsepower             0
RPM                    0
Rev.per.mile           0
Man.trans.avail        0
Fuel.tank.capacity     0
Passengers             0
Length                 0
Wheelbase              0
Width                  0
Turn.circle            0
Rear.seat.room         2
Luggage.room          11
Weight                 0
Origin                 0
Make                   0
dtype: int64

## Replacing the missing values by the mean of the column
#### There is missing data in  Rear.seat.room  and in Luggage.room columns
#### Dataframes have .mean() functions that excludes NaN values. Each column in a dataFrame is a Series that has also a .mean() function that do the same thing.

In [38]:
data = data.fillna(data.mean())

## Verificando que se hallan removido los nans

In [39]:
data.isnull().sum()

Unnamed: 0            0
Manufacturer          0
Model                 0
Type                  0
Min.Price             0
Price                 0
Max.Price             0
MPG.city              0
MPG.highway           0
AirBags               0
DriveTrain            0
Cylinders             0
EngineSize            0
Horsepower            0
RPM                   0
Rev.per.mile          0
Man.trans.avail       0
Fuel.tank.capacity    0
Passengers            0
Length                0
Wheelbase             0
Width                 0
Turn.circle           0
Rear.seat.room        0
Luggage.room          0
Weight                0
Origin                0
Make                  0
dtype: int64

## Estandarizando con StandardScaler() class

In [40]:
data.dtypes == float

Unnamed: 0            False
Manufacturer          False
Model                 False
Type                  False
Min.Price              True
Price                  True
Max.Price              True
MPG.city              False
MPG.highway           False
AirBags               False
DriveTrain            False
Cylinders             False
EngineSize             True
Horsepower            False
RPM                   False
Rev.per.mile          False
Man.trans.avail       False
Fuel.tank.capacity     True
Passengers            False
Length                False
Wheelbase             False
Width                 False
Turn.circle           False
Rear.seat.room         True
Luggage.room           True
Weight                False
Origin                False
Make                  False
dtype: bool

In [41]:
data.dtypes == int

Unnamed: 0             True
Manufacturer          False
Model                 False
Type                  False
Min.Price             False
Price                 False
Max.Price             False
MPG.city               True
MPG.highway            True
AirBags               False
DriveTrain            False
Cylinders             False
EngineSize            False
Horsepower             True
RPM                    True
Rev.per.mile           True
Man.trans.avail       False
Fuel.tank.capacity    False
Passengers             True
Length                 True
Wheelbase              True
Width                  True
Turn.circle            True
Rear.seat.room        False
Luggage.room          False
Weight                 True
Origin                False
Make                  False
dtype: bool

In [42]:
# escojo solo valores numericos:
keys = ['Min.Price', 'Max.Price', 'MPG.city', 'MPG.highway', 'EngineSize', 'Horsepower', 'RPM', 'Rev.per.mile', 'Fuel.tank.capacity', 'Passengers', 'Length', 'Wheelbase', 'Width', 'Turn.circle', 'Rear.seat.room', 'Luggage.room', 'Weight']

In [43]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit_transform(data[keys])

array([[-0.48578741, -0.28246529,  0.47131249, ..., -0.45219708,
        -1.03301503, -0.62705456],
       [ 1.38801699,  1.53140881, -0.78103212, ...,  0.73809027,
         0.3966429 ,  0.83020814],
       [ 1.00865782,  0.94805231, -0.42321938, ...,  0.05792607,
         0.03922842,  0.51489399],
       ...,
       [ 0.66378585,  0.16416702, -0.78103212, ..., -0.62223813,
         0.3966429 , -0.44809247],
       [ 0.53733279,  0.14593713, -0.244313  , ...,  0.56804922,
         0.03922842, -0.14982233],
       [ 0.88220476,  0.60168439, -0.42321938, ...,  0.73809027,
         0.3966429 ,  0.29332188]])

## ML RandomForestClassifier Model

In [44]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(n_estimators=100, max_depth=10)   # 10 de altura y 100 arboles

## Splitting into training and test sets

In [47]:
import numpy as np
Y = np.array(data['Price'], dtype=float)
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(np.array(data[keys]), Y, test_size=0.2, random_state=0)

## Fitting the model

In [48]:
model.fit(X_train, Y_train)

ValueError: ignored