# Introduction to Scikit-Learn

In [1]:


 # This notebook demonstrates some of the most useful functions of the beautiful Scikit-Learn Library.

What_we_are_going_to_cover = [

"0. An end -to-end Scikit learn workfolw.",
"1. Getting the data ready.",
"2. Choose the right estimator/algorith for our problems.",
"3. Fit the model/algorithm and use it to make predictions on our data",
"4. Evaluate a model.",
"5. Improve a model.",
"6. Save and Load a trained model.",
"7. Putting it all together!"]


In [2]:
What_we_are_going_to_cover

['0. An end -to-end Scikit learn workfolw.',
 '1. Getting the data ready.',
 '2. Choose the right estimator/algorith for our problems.',
 '3. Fit the model/algorithm and use it to make predictions on our data',
 '4. Evaluate a model.',
 '5. Improve a model.',
 '6. Save and Load a trained model.',
 '7. Putting it all together!']

## 0.An end-to-end Scikit-Learn Workflow.


In [3]:
# 1. Get the data ready
import pandas as pd
import numpy as np
heart_disease = pd.read_csv("heart-disease.csv")
heart_disease

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3,0
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3,0
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3,0
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3,0


In [4]:
# Create X (features matrix) 

# In summary, you are splitting your dataset into two parts:

   # X, which contains all the independent variables (features) without the target column.
   # Y, which contains only the target column.

#This is a standard procedure in preparing data for supervised learning, where X is used to train the model, and Y is what the model attempts to predict.
# This separation is a common practice when preparing data for machine learning, 
# as it allows you to train your model to learn the relationship between the input features (X) and the target output (Y).

X = heart_disease.drop("target", axis=1)

# Create y (Labels)
Y = heart_disease["target"]

In [5]:
# 2. Choose the right model and hyperparameters
# clf is short for classifier
from sklearn.ensemble import RandomForestClassifier
clf  = RandomForestClassifier()

# we'll keep the default hyperparameters

clf.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [6]:
# 3 . Fit the model to the training data
from sklearn.model_selection import train_test_split

X_train,X_test,Y_train,Y_test = train_test_split(X,Y, test_size=0.2)



# Importing the train_test_split function
# from sklearn.model_selection import train_test_split
# This line imports the train_test_split function from the model_selection module of scikit-learn.
# The train_test_split function is a utility that helps you easily split your dataset into a training set and a testing set.

# Splitting the Dataset into Training and Testing Sets
# X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)
# This line is where the actual splitting of the data occurs.
# train_test_split(X, Y, test_size=0.2) takes your features matrix X and your labels Y and splits them into training and testing sets.
# The test_size=0.2 argument specifies that 20% of the data should be set aside for testing.
# This means that 80% of the data will be used for training the model.
# The function returns four subsets:
#    - X_train: The subset of your features used for training the model.
#    - X_test: The subset of your features used for testing the model.
#    - Y_train: The subset of your labels corresponding to X_train, used for training the model.
#    - Y_test: The subset of your labels corresponding to X_test, used for evaluating the model's performance.

# In simple terms, this process divides your dataset into two parts: 
# one part to train your machine learning model (the training set) and 
# another part to test its performance and see how well it generalizes to new, unseen data (the testing set).
# This is a fundamental practice in machine learning to avoid overfitting, 
# where a model performs well on the training data but poorly on new data.
# By evaluating the model on a separate testing set, 
# you get a better sense of its real-world performance.


In [7]:
clf.fit(X_train, Y_train);

# Fitting the model to the training data

# clf.fit(X_train, Y_train)
# This line is where the model 'learns' from the data.
# The 'fit' method is used to train the model using the training data.
# 'clf' is the instance of RandomForestClassifier we created earlier.

# X_train: This is the training data (features), which the model uses to learn.
# Y_train: These are the true labels for the training data.

# During the fitting process, the RandomForestClassifier will look at the X_train and Y_train data,
# and try to figure out the patterns or relationships between the features and the target label.
# This process involves the RandomForestClassifier building multiple decision trees,
# each looking at different aspects and combinations of the data,
# and then combining their insights to make more accurate predictions.

# Once the model is fitted, it can then be used to make predictions on new, unseen data.
# The 'fit' method is one of the most fundamental and first steps in the model building process in machine learning.


In [19]:
# Make a prediction
Y_label = clf.predict(np.array([0,2,3,4]))

# this fails because the shape of the imput is not corect.
# we are puting in a array.



# Y_label = clf.predict(np.array([0,2,3,4]))
# This line is attempting to use the trained model to make a prediction.
# 'clf.predict()' is the method used to predict the label of new data using the trained model.

# However, this line raises an error because the shape of the input data does not match the shape the model expects.
# The issue here is that we are passing a 1-dimensional array as input, 
# whereas the model expects the input to have the same number of features as the training data (X_train).

# In this case, the model was trained on a certain number of features (columns in X_train),
# but the input provided is a single array with only 4 values.
# Each value in this array is being interpreted as a separate input example rather than a set of features for a single example.

# To fix this, the input data needs to be reshaped or reformatted to match the format of the training data.
# If you're trying to predict for one example, the input should be a 2-dimensional array 
# with one row for the example and columns matching the number of features the model was trained on.

# An example correction could be to reshape the array to have 1 row and the correct number of columns:
# Y_label = clf.predict(np.array([[0, 2, 3, 4]]))
# Note: The inner brackets create a 2D array with one row and four columns.




ValueError: Expected 2D array, got 1D array instead:
array=[0. 2. 3. 4.].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

In [None]:
# Handling the Error in Prediction Attempt

# Error Message: 
# "Expected 2D array, got 1D array instead: array=[0. 2. 3. 4.]. 
# Reshape your data either using array.reshape(-1, 1) if your data has a single feature 
# or array.reshape(1, -1) if it contains a single sample."

# This error occurs because the input data provided to clf.predict() is not in the correct format.
# The RandomForestClassifier model expects the input data to be a 2-dimensional array, 
# but the provided data is a 1-dimensional array.

# In the context of the model, a 2D array represents a collection of samples, 
# where each sample has multiple features. 
# The model is trained on such an array, so it expects the same format for making predictions.

# To fix this error, the input data needs to be reshaped into a 2D array. 
# Since the intention is to predict a single sample with multiple features, 
# you should reshape the array to have 1 row (representing 1 sample) and multiple columns (representing features).

# The correct way to reshape and make a prediction would be:
# Y_label = clf.predict(np.array([0, 2, 3, 4]).reshape(1, -1))
# Here, .reshape(1, -1) changes the shape of the array to have 1 row and as many columns as necessary to accommodate the data.

# This reshaping ensures that the data format matches what the model expects, 
# allowing the prediction method to work correctly.


In [16]:
# it has to look like this for it to work
# has to be a 2d array.
X_train

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
263,63,0,0,108,269,0,1,169,1,1.8,1,2,2
122,41,0,2,112,268,0,0,172,1,0.0,2,0,2
189,41,1,0,110,172,0,0,158,0,0.0,2,0,3
279,61,1,0,138,166,0,0,125,1,3.6,1,1,2
59,57,0,0,128,303,0,0,159,0,0.0,2,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
220,63,0,0,150,407,0,0,154,0,4.0,1,3,3
33,54,1,2,125,273,0,0,152,0,0.5,0,1,2
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2
214,56,1,0,125,249,1,0,144,1,1.2,1,1,2


In [14]:
Y_preds = clf.predict(X_test)
Y_preds

# Making Predictions on the Test Set

# Y_preds = clf.predict(X_test)
# This line uses the trained RandomForestClassifier (clf) to make predictions on the test data (X_test).
# The 'predict' method of clf is used to predict the labels for each sample in X_test.

# X_test contains the features of the unseen test data. 
# This data was set aside during the train-test split and was not used in training the model.
# The model will use the patterns it learned during training to predict the labels for this new data.

# The predictions made by the model are stored in the variable Y_preds.
# Y_preds will be a numpy array containing the predicted labels for each sample in X_test.

# Y_preds
# This line when executed in a Jupyter Notebook will display the contents of Y_preds.
# It shows the predictions made by the model for the test set.
# These predicted labels can be compared with the actual labels (Y_test) to evaluate the model's performance.

# By comparing Y_preds with Y_test, you can assess how well your model is performing.
# Common ways to evaluate classification models include accuracy, precision, recall, and the confusion matrix.


array([1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1,
       0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0,
       1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1], dtype=int64)

In [21]:
# 4. Evaluate the Model
clf.score(X_train, Y_train)

# Evaluating the Model on the Training Set

# clf.score(X_train, Y_train)
# This line evaluates the performance of your RandomForestClassifier model on the training data.
# The 'score' method returns the accuracy of the model, which is the proportion of correct predictions.

# X_train and Y_train are the features and labels of the training set, respectively.
# The model was trained on this data, so this score tells you how well the model fits the training data.

# However, it's important to note that evaluating the model on the training data can be misleading.
# A high score on the training data might simply mean that the model has memorized the training data (overfitting),
# rather than learning the underlying patterns in the data.

# A more accurate assessment of the model's performance is obtained by evaluating it on the test set (X_test and Y_test),
# which consists of data that the model hasn't seen during training.
# This helps in understanding how well the model generalizes to new, unseen data.

# Generally, in machine learning, it's recommended to look at both the training score and the test score.
# A good model will have high scores on both the training and testing datasets.
# If the model performs well on the training data but poorly on the test data, 
# it's an indication that the model may be overfitting.


1.0

In [23]:
clf.score(X_test,Y_test)

# Evaluating the Model on the Test Set

# clf.score(X_test, Y_test)
# This line evaluates the performance of your RandomForestClassifier model on the test data.
# The 'score' method, in this case, returns the accuracy of the model for the test set,
# which is the proportion of correct predictions out of all predictions made on the test set.

# X_test and Y_test are the features and labels of the test set, respectively.
# The test set is crucial for evaluating the model because it consists of data that the model has not seen during training.
# This helps in understanding how well the model generalizes to new, unseen data.

# A high score on the test set indicates that the model not only learned the patterns in the training data,
# but it is also able to apply these patterns to make accurate predictions on new data.

# It's important to compare the model's performance on the training set (evaluated previously) and the test set.
# Ideally, the model should perform well on both sets. 
# A large discrepancy between training and test scores might indicate issues such as overfitting (if the training score is much higher) 
# or underfitting (if the training score is too low compared to the test score).

# The accuracy obtained here gives a quick overview of how well the model performs, 
# but it's also useful to look at other metrics like precision, recall, F1 score, and confusion matrices
# for a more comprehensive evaluation, especially in cases where the dataset is imbalanced.


0.8688524590163934

In [None]:
# More ways to evaluate the model.

from sklearn.metrics import classification_report, confusion_matrix,accuracy_score

# This below will return the compararison of the test label and the predication label.
print(classification_report(Y_test, Y_preds))


# Generating a Classification Report for Model Evaluation

# from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
# This line imports various evaluation metrics from scikit-learn's metrics module,
# including classification_report, which we'll use to evaluate the model.

# Using classification_report to Evaluate the Model
# print(classification_report(Y_test, Y_preds))
# This line generates and prints out the classification report for your model based on the test data.
# The classification report provides detailed performance metrics for each class.

# Y_test: These are the true labels from the test set.
# Y_preds: These are the predicted labels by your model.

# The classification report includes several important metrics:
# - Precision: Measures the accuracy of positive predictions for each class.
# - Recall: Measures the ability of the model to find all the positive samples for each class.
# - F1-score: Provides a balance between precision and recall. It's a harmonic mean of the two.
# - Support: Indicates the number of actual occurrences of each class in the specified dataset.

# These metrics are provided for each class in your target variable and include averages,
# giving you a detailed overview of how well the model is performing for each type of classification.
# This detailed breakdown can help in identifying if the model is underperforming for any particular class 
# and assist in further refining the model or addressing any data imbalances.


In [None]:
confusion_matrix(Y_test,Y_preds)

# Generating a Confusion Matrix for Model Evaluation

# confusion_matrix(Y_test, Y_preds)
# This line generates the confusion matrix for your model based on the test data.
# A confusion matrix is a table often used to describe the performance of a classification model.

# Y_test: These are the true labels from the test set.
# Y_preds: These are the predicted labels by your model.

# The confusion matrix compares the actual target values with those predicted by the model,
# providing a detailed breakdown of:
# - True Positives (TP): Correctly predicted positive observations
# - True Negatives (TN): Correctly predicted negative observations
# - False Positives (FP): Incorrectly predicted positive observations (Type I error)
# - False Negatives (FN): Incorrectly predicted negative observations (Type II error)

# Each row of the matrix represents the instances in an actual class, 
# while each column represents the instances in a predicted class, or vice versa. 
# This setup allows you to see the types of errors (if any) your model is making.

# Interpreting the confusion matrix can provide insights into not only the overall performance of the model
# but also into how it performs on each individual class. It can be particularly useful to identify 
# any biases the model may have towards certain classes and can inform how you might improve the model,
# perhaps by providing more data for underrepresented classes or by tweaking the model itself.

# Generally, a high number of True Positives and True Negatives and low numbers of False Positives and False Negatives
# are indicative of good model performance.


In [None]:
accuracy_score(Y_test,Y_preds)

# Calculating the Accuracy Score for Model Evaluation

# accuracy_score(Y_test, Y_preds)
# This line calculates the accuracy of the model based on the test data.
# Accuracy is one of the most common metrics used to evaluate classification models.

# Y_test: These are the true labels from the test set.
# Y_preds: These are the predicted labels by your model.

# The accuracy score is the ratio of correct predictions to total predictions made:
# Accuracy = (True Positives + True Negatives) / Total Predictions

# It represents how often the classifier is correct overall across all classes.
# In simple terms, it answers the question, "Out of all the classifications, how many did the model get right?"

# While accuracy is a useful metric, it should be considered alongside other metrics like precision, recall, 
# and the F1 score, especially in scenarios where the data is imbalanced or when different types of errors have different costs.

# A high accuracy score indicates that the model has a high rate of correctly predicting both positive and negative classes.
# However, don't rely solely on accuracy if the dataset is imbalanced (i.e., one class is much more frequent than others).
# In such cases, the model might just predict the most common class most of the time and still achieve high accuracy.

# Typically, after getting an overall sense of the model's performance through accuracy, 
# you would dive deeper into the performance details using the confusion matrix and classification report.


In [None]:
# 5. Improve the Model.
# Try diferent amounts of the n_estimators
np.random.seed(42)
for i in range (10,100,10):
    print(f"Trying model with {i} estimators....")
    clf = RandomForestClassifier(n_estimators=i).fit(X_train,Y_train)
    print(f"Model accuracy on the test set: {clf.score(X_test,Y_test) * 100:.2f}%")
    print("")      

# Improving the Model by Tuning n_estimators in RandomForestClassifier

# Trying different amounts of n_estimators
# np.random.seed(42)
# Setting a random seed ensures the results are reproducible.

# for i in range(10, 100, 10):
# This loop iterates through different values for 'n_estimators' starting from 10 up to 90, increasing by 10 each time.
# 'n_estimators' in a RandomForestClassifier refers to the number of trees in the forest.

# print(f"Trying model with {i} estimators....")
# This prints out the number of estimators the model is using in the current iteration.

# clf = RandomForestClassifier(n_estimators=i).fit(X_train, Y_train)
# Here, a new RandomForestClassifier is created with the current number of estimators (i) and fitted to the training data.
# This is done within the loop, so the classifier is retrained for each different value of 'n_estimators'.

# print(f"Model accuracy on the test set: {clf.score(X_test, Y_test) * 100:.2f}%")
# After fitting the model, this line prints out the accuracy of the classifier on the test data.
# Multiplying by 100 converts it into a percentage, and :.2f formats the number to two decimal places.

# print("")  
# This just prints a new line for better readability between each iteration's results.

# By iterating through different values of 'n_estimators', you can observe how changing the number of trees
# impacts the model's accuracy on the test set. 
# The goal is to identify the number of trees that provides the best trade-off between performance and computational efficiency.

# It's important to note that more trees in the forest doesn't always mean a better model. 
# After a certain point, increasing the number of trees may not significantly improve the model's performance and can even 
# lead to longer training times. This experiment helps to find an optimal point or at least a range of 'n_estimators' 
# that offers good model accuracy without unnecessary computational cost.


In [None]:
# 6. Save a model and load it
import pickle

pickle.dump(clf, open("random_forest_model_1.pk1", "wb"))

# Saving the Trained Model for Later Use

# import pickle
# 'pickle' is a Python module used to serialize and deserialize Python objects.
# Serialization is the process of converting a Python object into a byte stream,
# and deserialization is the reverse process. It's useful for saving models and other data structures.

# pickle.dump(clf, open("random_forest_model_1.pk1", "wb"))
# This line saves the trained RandomForestClassifier (clf) to a file.

# clf: This is the trained RandomForestClassifier model that you want to save.

# open("random_forest_model_1.pk1", "wb"): This opens a file named 'random_forest_model_1.pk1' in binary write mode ('wb').
# If the file doesn't exist, it will be created. If it does exist, it will be overwritten.

# The 'dump' function of pickle is used to serialize the 'clf' object and write it to the file.
# This saved model can be loaded later to make predictions without needing to retrain the model.

# Saving the model is particularly useful if the training process is time-consuming,
# or if you want to deploy the model for use in applications, share it with others, or simply keep a version of the model 
# that you can return to later.

# Note: It's important to remember that the version of scikit-learn (or any other libraries used) 
# should be the same when you save the model and when you load it, as changes in the library versions 
# might not be compatible with the model file.



In [None]:
#let load the model "rb" stands for read binaries
loded_model = pickle.load(open("random_forest_model_1.pk1", "rb"))

# score the model .

loded_model.score(X_test,Y_test)

# Loading the Saved Model and Evaluating It

# Loading the model
# loded_model = pickle.load(open("random_forest_model_1.pk1", "rb"))
# This line loads the previously saved model into the variable 'loaded_model'.
# 'pickle.load()' is used to deserialize the object, converting the byte stream back into a Python object.

# open("random_forest_model_1.pk1", "rb"): This opens the file named 'random_forest_model_1.pk1' in binary read mode ('rb').
# The file must exist in the directory you're working in, or you should provide the correct path to the file.

# "rb" stands for "read binary", which is necessary because the model was saved in a binary format.

# Once loaded, 'loaded_model' is essentially a clone of the 'clf' object that was saved earlier.
# It retains all the properties, parameters, and learned patterns of the original trained model.

# Scoring the loaded model
# loaded_model.score(X_test, Y_test)
# After loading the model, you might want to confirm that it's still performing as expected.
# This line uses the 'score' method to evaluate the loaded model's accuracy, just like you did with the original 'clf' model.

# X_test and Y_test are the same test set features and labels used to evaluate the original model.
# This line effectively gives you the accuracy of the model on the test set, 
# allowing you to verify that the model has been loaded correctly and is functioning as expected.

# Using 'loaded_model.score()' is a quick way to ensure that the deserialization process 
# has worked correctly and that the model is ready to be used for predictions or further evaluation.

# This process is particularly useful in operational settings where you might train a model in one script or notebook,
# save it, and then load it in a different script, application, or system to make predictions.


In [11]:
What_we_are_going_to_cover

['0. An end -to-end Scikit learn workfolw.',
 '1. Getting the data ready.',
 '2. Choose the right estimator/algorith for our problems.',
 '3. Fit the model/algorithm and use it to make predictions on our data',
 '4. Evaluate a model.',
 '5. Improve a model.',
 '6. Save and Load a trained model.',
 '7. Putting it all together!']

In [12]:
# Standard imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

## 1. getting our data ready to be used with machine learning

Three main things we have to do.

1.Split the data into features and labels (usually 'x' and 'y')

2.Filling (also called inputing) or disregarding missing values

3.Converting non-numerical values to numerical values (also called feature encoding)

In [13]:
heart_disease.head()


Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [14]:
# lets drop target or remove target from the table

x=heart_disease.drop("target", axis=1)

In [15]:
x.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2


In [16]:
# the y axis will be target 
y=heart_disease["target"]
y.head()

0    1
1    1
2    1
3    1
4    1
Name: target, dtype: int64

In [17]:
# Split the data into training and test sets 

from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x,y, test_size=0.2)

In [18]:
x_train.shape,x_test.shape,y_train.shape,y_test.shape

((242, 13), (61, 13), (242,), (61,))

In [19]:
# 242 came from spliting
# 13 came from the number of coulmns
x.shape[0] * 0.8

242.4

In [20]:
len(heart_disease)

303

### 1.1  Make sure its all numerical 

In [22]:
car_sales =pd.read_csv("car-sales-extended.csv")
car_sales.head()

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431,4,15323
1,BMW,Blue,192714,5,19943
2,Honda,White,84714,4,28343
3,Toyota,White,154365,4,13434
4,Nissan,Blue,181577,3,14043


In [24]:
len(car_sales)

1000

In [25]:
car_sales.dtypes

Make             object
Colour           object
Odometer (KM)     int64
Doors             int64
Price             int64
dtype: object

In [26]:
# Split into X/Y

X = car_sales.drop("Price", axis=1)
Y = car_sales["Price"]

# Split into training and test

X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.2)


In [40]:
# Build machine learning model

from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor()
model.fit(X_train, Y_train)
model.score(X_test,Y_test)

# will Value error if you do not convert  strings into int.

0.3118330305286968

In [41]:
# Turn the catagories into numbers

from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

catagorical_features = ["Make", "Colour","Doors"]
one_hot = OneHotEncoder()
transformer = ColumnTransformer([("one_hot",one_hot,catagorical_features)],remainder="passthrough")

transformed_X = transformer.fit_transform(X)
transformed_X

array([[0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 3.54310e+04],
       [1.00000e+00, 0.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        1.00000e+00, 1.92714e+05],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 8.47140e+04],
       ...,
       [0.00000e+00, 0.00000e+00, 1.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 6.66040e+04],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 2.15883e+05],
       [0.00000e+00, 0.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 2.48360e+05]])

In [42]:
# Put it in a data frame
pd.DataFrame(transformed_X)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,35431.0
1,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,192714.0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,84714.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,154365.0
4,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,181577.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,35820.0
996,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,155144.0
997,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,66604.0
998,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,215883.0


In [43]:
# a second way to change to number

dummies = pd.get_dummies(car_sales[["Make", "Colour","Doors"]])
dummies                        

Unnamed: 0,Doors,Make_BMW,Make_Honda,Make_Nissan,Make_Toyota,Colour_Black,Colour_Blue,Colour_Green,Colour_Red,Colour_White
0,4,False,True,False,False,False,False,False,False,True
1,5,True,False,False,False,False,True,False,False,False
2,4,False,True,False,False,False,False,False,False,True
3,4,False,False,False,True,False,False,False,False,True
4,3,False,False,True,False,False,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...
995,4,False,False,False,True,True,False,False,False,False
996,3,False,False,True,False,False,False,False,False,True
997,4,False,False,True,False,False,True,False,False,False
998,4,False,True,False,False,False,False,False,False,True


In [44]:
# Let's refit tje model

np.random.seed(42)
X_train,X_test,Y_train,Y_test = train_test_split(transformed_X,Y, test_size=0.2)

model.fit(X_train,Y_train)

In [45]:
model.score(X_test,Y_test)

0.3235867221569877