## An end to end scikit lean workflow

In [27]:
# 1 Get Data Ready
import pandas as pd
import numpy as np
Heart_Disease = pd.read_csv("heart-disease.csv")
Heart_Disease

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3,0
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3,0
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3,0
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3,0


In [28]:
# Create X (features matrix) a feature is an input variable
X = Heart_Disease.drop("target", axis = 1)  #X is kind of features in sk leaern || axis=1 means we are skipping col 0 means rows

# Create Y (labels) a label is the output variable
Y = Heart_Disease["target"]

In [29]:
# 2 Choose the right model and hyperpameters(dials on model to tune it)
#Ensemble learning is a machine learning technique that combines the predictions of multiple models to create a more accurate and robust model. The idea is that by combining the strengths of multiple models, we can reduce the errors that any individual model might make.

from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier()

# We keep default hyper prams
clf.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [30]:
# 3 Fit the model to data
from sklearn.model_selection import  train_test_split #The train_test_split function from the sklearn.model_selection library in Python splits arrays or matrices into random subsets for train and test data, respectively. It is a commonly used function in machine learning for evaluating the performance of a model.
# X: The features of the dataset.
# y: The labels of the dataset.
# y: The labels of the dataset.
# random_state: A random number generator seed.
# test_size argument is set to 0.2, which means that 20% of the dataset will be used for the test set and 80% of the dataset will be used for the training set. 

#X_train: The features of the training set.
#X_test: The features of the test set.
#Y_train: The labels of the training set.
#Y_test: The labels of the test set.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)

In [31]:
clf.fit(X_train, Y_train)

In [32]:
#3 Make a predection
#Y_label =  clf.predict(np.array[0,2,3,4]) #"object is not subscriptable" in Python sklearn means that you are trying to access an element of an object that does not support indexing.
# Getting error here because our araay looks nothing like X_train,,,, just a thing that clf will predict on data that looks like training data
X_train

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
246,56,0,0,134,409,0,0,150,1,1.9,1,2,3
280,42,1,0,136,315,0,1,125,1,1.8,1,0,1
272,67,1,0,120,237,0,1,71,0,1.0,1,0,2
210,57,1,2,128,229,0,0,150,0,0.4,1,1,3
177,64,1,2,140,335,0,1,158,0,0.0,2,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
233,64,1,0,120,246,0,0,96,1,2.2,0,1,2
51,66,1,0,120,302,0,0,151,0,0.4,1,0,2
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2
18,43,1,0,150,247,0,1,171,0,1.5,2,0,2


In [33]:
Y_preds =  clf.predict(X_test)
Y_preds

array([0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1,
       0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1,
       0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int64)

In [34]:
# 4 Evaluate model on training data and test data
clf.score(X_train, Y_train)

1.0

In [35]:
clf.score(X_test, Y_test)

0.7540983606557377

In [36]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
print(classification_report(Y_test, Y_preds))

              precision    recall  f1-score   support

           0       0.70      0.82      0.75        28
           1       0.82      0.70      0.75        33

    accuracy                           0.75        61
   macro avg       0.76      0.76      0.75        61
weighted avg       0.76      0.75      0.75        61



In [37]:
confusion_matrix(Y_test, Y_preds)

array([[23,  5],
       [10, 23]], dtype=int64)

In [38]:
accuracy_score(Y_test, Y_preds)

0.7540983606557377

In [39]:
#5 Improve a model
# Try different amout of n_estimators
np.random.seed(0)
for i in range(10, 100, 10):
    print(f"Trying model with {i} estimators..")
    clf = RandomForestClassifier(n_estimators=i).fit(X_test, Y_test)
    print(f"Model accuracy on test set: {clf.score(X_test, Y_test)}%")
    print("")

Trying model with 10 estimators..
Model accuracy on test set: 0.9836065573770492%

Trying model with 20 estimators..
Model accuracy on test set: 1.0%

Trying model with 30 estimators..
Model accuracy on test set: 1.0%

Trying model with 40 estimators..
Model accuracy on test set: 1.0%

Trying model with 50 estimators..
Model accuracy on test set: 1.0%

Trying model with 60 estimators..
Model accuracy on test set: 1.0%

Trying model with 70 estimators..
Model accuracy on test set: 1.0%

Trying model with 80 estimators..
Model accuracy on test set: 1.0%

Trying model with 90 estimators..
Model accuracy on test set: 1.0%



In [40]:
# Save model and load it
import pickle
pickle.dump(clf, open("random_forest_model_1.pkl", "wb"))

In [41]:
loaded_model = pickle.load(open("random_forest_model_1.pkl", "rb"))
loaded_model.score(X_test, Y_test)

1.0

### Making All Data Numerical
## Extended Car Sales Data

In [42]:
Car_Sales = pd.read_csv("car-sales-extended.csv")

In [43]:
# Split
X = Car_Sales.drop("Price", axis=1)
Y = Car_Sales["Price"]

In [44]:
# to convert objects to number other way is to do it dummies = pd.get_dummies(Car_Sales[["Make", "Colour", "Doors"]])
# Turn Categories into numbers

#HotEncoder converts and ColTrans applies to col maybe i don't know yet
from sklearn.preprocessing import OneHotEncoder #One-hot encoding is a technique used to convert categorical data into a numerical format
from sklearn.compose import ColumnTransformer

categorical_features = ["Make", "Colour", "Doors"]
one_hot = OneHotEncoder()
transformer = ColumnTransformer([("one_hot", one_hot, categorical_features)], remainder = "passthrough") #The ColumnTransformer class takes a list of tuples as input. Each tuple in the list specifies the name of the transformer, the transformer object itself, and the list of columns to which the transformer should be applied.
transformed_X = transformer.fit_transform(X)
transformed_X

array([[0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 3.54310e+04],
       [1.00000e+00, 0.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        1.00000e+00, 1.92714e+05],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 8.47140e+04],
       ...,
       [0.00000e+00, 0.00000e+00, 1.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 6.66040e+04],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 2.15883e+05],
       [0.00000e+00, 0.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 2.48360e+05]])

In [45]:
#Building machine learning model
from sklearn.ensemble import RandomForestRegressor
#Split into training and test 
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(transformed_X, Y, train_size=0.8)

# Fit the model
np.random.seed(123)
model = RandomForestRegressor()
model.fit(X_train, Y_train)

In [46]:
model.score(X_test, Y_test)

0.1766125736090549

### Handling missing values

1 Fill them with value (imputation)
2 Remove samples with missing altogether

In [47]:
Car_Sales_Missing = pd.read_csv("car-sales-extended-missing-data.csv")
Car_Sales_Missing

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431.0,4.0,15323.0
1,BMW,Blue,192714.0,5.0,19943.0
2,Honda,White,84714.0,4.0,28343.0
3,Toyota,White,154365.0,4.0,13434.0
4,Nissan,Blue,181577.0,3.0,14043.0
...,...,...,...,...,...
995,Toyota,Black,35820.0,4.0,32042.0
996,,White,155144.0,3.0,5716.0
997,Nissan,Blue,66604.0,4.0,31570.0
998,Honda,White,215883.0,4.0,4001.0


In [48]:
Car_Sales_Missing.isna().sum()
X=Car_Sales_Missing.drop("Price", axis=1)
Y=Car_Sales_Missing["Price"]

In [49]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
categorical_features = ["Make", "Colour", "Doors"]
one_hot = OneHotEncoder()
transformer = ColumnTransformer([("one_hot", one_hot, categorical_features)], remainder = "passthrough")
transformed_X = transformer.fit_transform(X)
transformed_X;

## Chosing right estimator/algo/model fro your problem
    SkLearn refres to ml models, algos as estimators
    Classification problem predecting category(like heart disease)
    Regression problem - predecting a number (selling price of car)
    
    
    https://scikit-learn.org/stable/tutorial/machine_learning_map/index.html

# Picking ML model for regression

In [50]:
from sklearn.datasets import fetch_california_housing
housing = fetch_california_housing()
housing

{'data': array([[   8.3252    ,   41.        ,    6.98412698, ...,    2.55555556,
           37.88      , -122.23      ],
        [   8.3014    ,   21.        ,    6.23813708, ...,    2.10984183,
           37.86      , -122.22      ],
        [   7.2574    ,   52.        ,    8.28813559, ...,    2.80225989,
           37.85      , -122.24      ],
        ...,
        [   1.7       ,   17.        ,    5.20554273, ...,    2.3256351 ,
           39.43      , -121.22      ],
        [   1.8672    ,   18.        ,    5.32951289, ...,    2.12320917,
           39.43      , -121.32      ],
        [   2.3886    ,   16.        ,    5.25471698, ...,    2.61698113,
           39.37      , -121.24      ]]),
 'target': array([4.526, 3.585, 3.521, ..., 0.923, 0.847, 0.894]),
 'frame': None,
 'target_names': ['MedHouseVal'],
 'feature_names': ['MedInc',
  'HouseAge',
  'AveRooms',
  'AveBedrms',
  'Population',
  'AveOccup',
  'Latitude',
  'Longitude'],
 'DESCR': '.. _california_housing_dataset:\n

In [51]:
housing_df = pd.DataFrame(housing["data"], columns=housing["feature_names"])
housing_df["target"] = housing["target"]
housing_df

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,target
0,8.3252,41.0,6.984127,1.023810,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.971880,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.802260,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422
...,...,...,...,...,...,...,...,...,...
20635,1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09,0.781
20636,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21,0.771
20637,1.7000,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22,0.923
20638,1.8672,18.0,5.329513,1.171920,741.0,2.123209,39.43,-121.32,0.847


In [83]:
#Import estimator
#from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor

# Random Seed
np.random.seed(111)

# Create Data
X = housing_df.drop("target", axis=1)
Y= housing_df["target"]

#Split into test train
Xtrain, Xtest, Ytrain, Ytest = train_test_split(X,Y, test_size=0.8)

#Instaitate n fit
model = RandomForestRegressor()
model.fit(Xtrain, Ytrain)

#Check score
model.score(Xtest, Ytest)

#Tried Lasso accura 0.29464627967544255
# Ridge accuracy 0.605283077743094
# RandomForestRegressor accuracy 0.7667762725866109

0.7667762725866109

In [94]:
from sklearn.ensemble import RandomForestClassifier
# Random Seed
np.random.seed(111)

heart_disease = pd.read_csv("heart-disease.csv")

# Create Data
X = heart_disease.drop("target", axis=1)
Y= heart_disease["target"]

#Split into test train
Xtrain, Xtest, Ytrain, Ytest = train_test_split(X,Y, test_size=0.8)

#Instaitate n fit
clf = RandomForestClassifier()
clf.fit(Xtrain, Ytrain)

#Check score
clf.score(Xtest, Ytest)

# LinearSVC accuracy = 0.45267489711934156

0.7489711934156379