# 01. Import Libraries

In [22]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Train Test split data
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,LabelEncoder

# Models for regreesion
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.metrics import r2_score,mean_squared_error,mean_absolute_error
from scipy.stats import randint



#import grid search cv for cross validation
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV

# import preprocessors
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Don't show warnings
import warnings
warnings.filterwarnings('ignore')



***First, we import the libraries needed for the regression task.***

In [9]:
# Fetch seaborn's "tips" example dataset into a DataFrame
df = sns.load_dataset("tips")

In [10]:
df.head()  # preview the first five rows (head() defaults to n=5)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [45]:
# Split the data into features (x) and target (y)
x = df.drop('tip', axis=1)
y = df['tip']

# Label encode the categorical feature columns.
# The encoded values are written into `x` (the frame the models are trained on), not
# into `df`: `x` is an independent copy, so encoding `df` afterwards would leave `x`
# holding raw strings on a fresh kernel and leave `df` mutated for every re-run.
le = LabelEncoder()
for column in ['sex', 'smoker', 'day', 'time']:
    x[column] = le.fit_transform(x[column])
                              



# Regression Task
## 01. Mean Absolute error

In [46]:
%%time
# Split tha data into train and test
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=42)


# Create the list of models to be used for Regressor
models = { 
          'LinearRegression' : (LinearRegression(), {}),
          'SVR' : (SVR(), {'kernel': ['rbf', 'poly', 'sigmoid']}),
          'DecisionTreeRegressor' : (DecisionTreeRegressor(random_state=42), {'max_depth': [None, 5, 10]}),
          'RandomForestRegressor' : (RandomForestRegressor(random_state=42), {'n_estimators': [10, 100]}),
          'KNeighborsRegressor' : (KNeighborsRegressor(), {'n_neighbors': np.arange(3, 100, 2)}),
          'GradientBoostingRegressor' : (GradientBoostingRegressor(random_state=42),{'n_estimators': [10, 100]}),
          'XGBRegressor' : (XGBRegressor(), {'n_estimators': [10, 100]}),  
          'AdaBoostRegressor': (AdaBoostRegressor(random_state=42), {'n_estimators': [10, 100]}),        
          }

# Train and predict each model using for loop to itrate through the models
model_scores = []
for name, (model,params) in models.items():

    # Create Pipe line
    pipeline = RandomizedSearchCV(model, params, cv=5, n_iter=20, verbose=1, n_jobs=1)

    # fit each model on training data
    pipeline.fit(x_train,y_train)
    # Make prdictions on each model
    y_pred = pipeline.predict(x_test)

    # Metrics
    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    best_parameter = pipeline.best_params_
    
    # Append the model name and evaluation metrics in list
    model_scores.append((name,mae,mse,rmse,best_parameter))
    
    print(f"{name} is trained.")
# selecting the best model from all above models with evaluation metrics sorting method
sorted_models = sorted(model_scores, key=lambda x: x[1], reverse=False)

# printing Each model with evaluation metrics
for model in model_scores:
    print('mse for', f"{model[0]} is {model[3]: .2f}")
print('\n')
for model in model_scores:
    print('mae for', f"{model[0]} is {model[2]: .2f}")

for model in model_scores:
    print('R-2 score for', f"{model[0]} is {model[1]:.2f}")
print('\n')
# Selecting the best model
best_model = sorted_models[0][0]
print(f"The best model is {best_model}")




Fitting 5 folds for each of 1 candidates, totalling 5 fits
LinearRegression is trained.
Fitting 5 folds for each of 3 candidates, totalling 15 fits
SVR is trained.
Fitting 5 folds for each of 3 candidates, totalling 15 fits
DecisionTreeRegressor is trained.
Fitting 5 folds for each of 2 candidates, totalling 10 fits
RandomForestRegressor is trained.
Fitting 5 folds for each of 20 candidates, totalling 100 fits
KNeighborsRegressor is trained.
Fitting 5 folds for each of 2 candidates, totalling 10 fits
GradientBoostingRegressor is trained.
Fitting 5 folds for each of 2 candidates, totalling 10 fits
XGBRegressor is trained.
Fitting 5 folds for each of 2 candidates, totalling 10 fits
AdaBoostRegressor is trained.
mse for LinearRegression is  0.98
mse for SVR is  0.98
mse for DecisionTreeRegressor is  0.98
mse for RandomForestRegressor is  0.98
mse for KNeighborsRegressor is  0.98
mse for GradientBoostingRegressor is  0.98
mse for XGBRegressor is  0.98
mse for AdaBoostRegressor is  0.98


m

***The regression task is run on the tips data, and the models are compared by their mean absolute error (MAE). Among the models, the `Decision Tree Regressor` has an error of about `0.83`, and the best model is `SVR` because its error is the lowest, about `0.57`. Note that MAE is expressed in the units of the target (the tip amount), not as a percentage.***

In summary, for regression tasks:

- Smaller Mean Absolute Error (MAE): Indicates better model performance.
- Larger Mean Absolute Error (MAE): Indicates poorer model performance.
- Select the regressor that achieves the lowest mean absolute error on your validation or test set.

## ii. R_squared Score

In [18]:
%%time
# Split tha data into train and test
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=42)


# Create the list of models to be used for Regressor
models = {
    'Linear Regression': LinearRegression(),
    'Decision Tree Regressor': DecisionTreeRegressor(),
    'Random Forest Regressor': RandomForestRegressor(),
    'Support Vector Regressor': SVR(),
    'KNeighbors Regressor': KNeighborsRegressor(),
    'XGBoost Regressor': XGBRegressor(),
    'Gradient Boosting Regressor': GradientBoostingRegressor(),
    # 'Ada Boost Regressor': AdaBoostRegressor()
}

# Train and predict each model using for loop to itrate through the models
model_scores = []
for name, model in models.items():
    # fit each model on training data
    model.fit(x_train,y_train)
    # Make prdictions on each model
    y_pred = model.predict(x_test)
    metric = mean_absolute_error(y_test, y_pred)
    model_scores.append((name, metric))

 

# selecting the best model from all above models with evaluation metrics sorting method
sorted_models = sorted(model_scores, key=lambda x: x[1], reverse=False)
for model in sorted_models:
    print('R_squared Score', f"{model[0]} is {model[1]: .2f}") 



# Select the best model with R_squared Score
model_scores = pd.DataFrame(model_scores, columns=['Model', 'R_squared Score'])
model_scores.sort_values(by='R_squared Score', ascending=False)

R_squared Score Support Vector Regressor is  0.57
R_squared Score Linear Regression is  0.67
R_squared Score XGBoost Regressor is  0.67
R_squared Score Gradient Boosting Regressor is  0.73
R_squared Score KNeighbors Regressor is  0.73
R_squared Score Random Forest Regressor is  0.77
R_squared Score Decision Tree Regressor is  0.92
CPU times: total: 531 ms
Wall time: 428 ms


Unnamed: 0,Model,R_squared Score
1,Decision Tree Regressor,0.921837
2,Random Forest Regressor,0.768749
4,KNeighbors Regressor,0.726245
6,Gradient Boosting Regressor,0.725536
5,XGBoost Regressor,0.67217
0,Linear Regression,0.670381
3,Support Vector Regressor,0.57071


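- The R-squared score is at most 1, where 1 indicates a perfect fit and 0 indicates that the model explains none of the variability in the target variable (it is no better than always predicting the mean). It can also be negative when a model fits worse than that baseline.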

- In general:

 `Closer to 1`: Indicates a better fit of the model, suggesting that a larger proportion of the variance in the dependent variable is explained by the independent variables.
 `Closer to 0`: Indicates that the model does not explain much of the variance in the dependent variable.
 So, when evaluating regression models based on R-squared:

 - In summary, for regression tasks:

`Higher R-squared Score`: Indicates better model performance.
`Lower R-squared Score`: Indicates poorer model performance.






## iii. Mean Squared Error

In [20]:
%%time
# Split tha data into train and test
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=42)


# Create the list of models to be used for Regressor
models = {
    'Linear Regression': LinearRegression(),
    'Decision Tree Regressor': DecisionTreeRegressor(),
    'Random Forest Regressor': RandomForestRegressor(),
    'Support Vector Regressor': SVR(),
    'KNeighbors Regressor': KNeighborsRegressor(),
    'XGBoost Regressor': XGBRegressor(),
    'Gradient Boosting Regressor': GradientBoostingRegressor(),
    # 'Ada Boost Regressor': AdaBoostRegressor()
}

# Train and predict each model using for loop to itrate through the models
model_scores = []
for name, model in models.items():
    # fit each model on training data
    model.fit(x_train,y_train)
    # Make prdictions on each model
    y_pred = model.predict(x_test)
    metric = mean_absolute_error(y_test, y_pred)
    model_scores.append((name, metric))


# selecting the best model from all above models with evaluation metrics sorting method
sorted_models = sorted(model_scores, key=lambda x: x[1], reverse=False)
for model in sorted_models:
    print('Mean Squared error', f"{model[0]} is {model[1]: .2f}") 



# Select the best model with Mean Squared error
model_scores = pd.DataFrame(model_scores, columns=['Model', 'Mean Squared error'])
model_scores.sort_values(by='Mean Squared error', ascending=False)


Mean Squared error Support Vector Regressor is  0.57
Mean Squared error Linear Regression is  0.67
Mean Squared error XGBoost Regressor is  0.67
Mean Squared error KNeighbors Regressor is  0.73
Mean Squared error Gradient Boosting Regressor is  0.73
Mean Squared error Random Forest Regressor is  0.75
Mean Squared error Decision Tree Regressor is  0.90
CPU times: total: 484 ms
Wall time: 429 ms


Unnamed: 0,Model,Mean Squared error
1,Decision Tree Regressor,0.896122
2,Random Forest Regressor,0.754386
6,Gradient Boosting Regressor,0.7306
4,KNeighbors Regressor,0.726245
5,XGBoost Regressor,0.67217
0,Linear Regression,0.670381
3,Support Vector Regressor,0.57071


The goal is to select the regressor that achieves the smallest mean squared error on the validation or test set. Smaller values of MSE indicate that the predicted values are closer to the actual values, suggesting better accuracy and performance of the model.

- In summary, for regression tasks:

`Smaller Mean Squared Error (MSE)`: Indicates better model performance.

`Larger Mean Squared Error (MSE)`: Indicates poorer model performance.

# Hyperparameter Tuning

In [26]:
%%time
# Create a dictionaries of list of models to evaluate performance with all hyperparametes
models = { 
          'LinearRegression' : (LinearRegression(), {}),
          'SVR' : (SVR(), {'kernel': ['rbf', 'poly', 'sigmoid'] , 'epsilon': [0.1, 0.2, 0.3]}),
          'DecisionTreeRegressor' : (DecisionTreeRegressor(random_state=42), {'max_depth': [None, 5, 10]}),
          'RandomForestRegressor' : (RandomForestRegressor(random_state=42), {'n_estimators': [10, 100]}),
          'KNeighborsRegressor' : (KNeighborsRegressor(), {'n_neighbors': np.arange(3, 100, 2)}),
          'GradientBoostingRegressor' : (GradientBoostingRegressor(), {'n_estimators': [10, 100]}),
          'XGBRegressor' : (XGBRegressor(), {'n_estimators': [10, 100]}),          
          }

# train and predict each model with evaluation metrics as well making a for loop to iterate over the models
for name, (model, params) in models.items():
    # create a pipline
    pipeline = GridSearchCV(model, params, cv=3)

    # fit the pipeline
    pipeline.fit(x_train, y_train)

    # make predictions
    y_pred = pipeline.predict(x_test)

    # evaluate the model
    print(name)
    print('r2_score: ',r2_score(y_test,y_pred))
    print('mean_squared_error: ',mean_squared_error(y_test,y_pred))
    print('mean_absolute_error: ',mean_absolute_error(y_test,y_pred))
    print('root_mean_squared_error: ',np.sqrt(mean_squared_error(y_test,y_pred)))
    print('--------------------------------------\n')





LinearRegression
r2_score:  0.4441368826121931
mean_squared_error:  0.6948129686287711
mean_absolute_error:  0.6703807496461157
root_mean_squared_error:  0.8335544185167343
--------------------------------------

SVR
r2_score:  -0.1686013018011976
mean_squared_error:  1.460718141299992
mean_absolute_error:  0.8935334948775431
root_mean_squared_error:  1.2086017298101108
--------------------------------------

DecisionTreeRegressor
r2_score:  0.2980516670532911
mean_squared_error:  0.8774153020453991
mean_absolute_error:  0.7189481629481629
root_mean_squared_error:  0.9367044902451355
--------------------------------------

RandomForestRegressor
r2_score:  0.2299337514142753
mean_squared_error:  0.9625607446938791
mean_absolute_error:  0.7750510204081635
root_mean_squared_error:  0.9811018013916186
--------------------------------------

KNeighborsRegressor
r2_score:  0.4687117753876745
mean_squared_error:  0.6640950568462677
mean_absolute_error:  0.6203721488595437
root_mean_squared_er

To determine which model is the best among the Linear Regression, Support Vector Regressor (SVR), Decision Tree Regressor, and Random Forest Regressor, you can consider multiple metrics. In the output you provided, the key metrics are R² score, Mean Squared Error (MSE), Mean Absolute Error (MAE), and Root Mean Squared Error (RMSE). Let's briefly interpret these metrics:

1. **R² Score:**
   - **Higher R² Score:** Indicates a better fit of the model.
   - **Interpretation:** The percentage of the variance in the target variable that is explained by the model. Closer to 1 is better.

2. **Mean Squared Error (MSE):**
   - **Smaller MSE:** Indicates better accuracy and performance.
   - **Interpretation:** The average squared difference between the predicted and actual values. Smaller values are better.

3. **Mean Absolute Error (MAE):**
   - **Smaller MAE:** Indicates better accuracy and performance.
   - **Interpretation:** The average absolute difference between the predicted and actual values. Smaller values are better.

4. **Root Mean Squared Error (RMSE):**
   - **Smaller RMSE:** Indicates better accuracy and performance.
   - **Interpretation:** The square root of the average squared difference between the predicted and actual values. Smaller values are better.

Now, let's analyze the provided metrics:

- **Linear Regression:**
  - R² Score: 0.44
  - MSE: 0.69
  - MAE: 0.67
  - RMSE: 0.83

- **SVR:**
  - R² Score: -0.17
  - MSE: 1.46
  - MAE: 0.89
  - RMSE: 1.21

- **Decision Tree Regressor:**
  - R² Score: 0.30
  - MSE: 0.88
  - MAE: 0.72
  - RMSE: 0.94

- **Random Forest Regressor:**
  - R² Score: 0.23
  - MSE: 0.96
  - MAE: 0.78
  - RMSE: 0.98

Based on these metrics:

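- **Linear Regression** has the highest R² score (0.44) of the four models summarized here, indicating a relatively better fit; it also has the lowest MSE, MAE, and RMSE of the four.
- **Decision Tree Regressor** has a decent R² score (0.30); its MSE, MAE, and RMSE are lower than those of SVR and Random Forest, but higher than those of Linear Regression.
- **Random Forest Regressor** has a slightly lower R² score (0.23) than Linear Regression and Decision Tree, but it is still clearly better than SVR, whose negative R² means it does worse than always predicting the mean tip.

It's essential to consider the specific requirements of your problem and the trade-offs between metrics. In this case, Linear Regression might be considered the best among the four models summarized above based on R² score. Note, however, that the output above also includes `KNeighborsRegressor`, whose R² (0.47), MSE (0.66), and MAE (0.62) are all slightly better than those of Linear Regression, so it should be part of the comparison as well. Depending on your specific goals and the nature of your data, you might choose a different model. Consider the overall context and any business or problem-specific considerations when making your final decision.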

## Your code should also save the best model in a pickle file.

In [36]:
# Save the fitted model into a pickle file
import pickle

# Persist the refit estimator held by the search object rather than the bare loop
# variable: GridSearchCV fits clones of the estimator it is given, not the instance
# that was passed in.
# The context manager guarantees the file handle is flushed and closed.
with open('../Project/model.pkl', 'wb') as model_file:
    pickle.dump(pipeline.best_estimator_, model_file)


In [None]:
import joblib

# Save the fitted model using joblib.
# The refit estimator is taken from the search object because GridSearchCV fits clones
# of the estimator it is given, not the instance that was passed in.
# NOTE(review): the file name says "linear_regression", but the saved estimator is
# whichever one `pipeline` holds -- confirm the name is still appropriate.
joblib.dump(pipeline.best_estimator_, 'linear_regression_model.joblib')

# Load the model back using joblib
loaded_model = joblib.load('linear_regression_model.joblib')

# Use the loaded model to make predictions on the test split
# (the notebook names the split `x_test`; `X_test` was never defined)
predictions = loaded_model.predict(x_test)

print("Predictions:", predictions)


# 02. Classifier

In [19]:
# import classification metrics and libraries
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix, classification_report
import numpy as np

from sklearn.model_selection import cross_val_score, KFold
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
import seaborn as sns

# dn't show warnings
import warnings
warnings.filterwarnings('ignore')

In [23]:
# Fetch seaborn's "iris" example dataset into `df`
df = sns.load_dataset("iris")
df.head()  # preview the first five rows (head() defaults to n=5)


Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [24]:
# Features are every column except the label; the label is the species column
x = df.drop(columns='species')
y = df['species']

In [25]:
%%time
# Split tha data into train and test data set with 80% train and 20% test
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=42)

# Create a dictionaries of list of  classification models to evaluate performance
models = {
    'Logistic Regression': LogisticRegression(),
    'Decision Tree Classifier': DecisionTreeClassifier(),
    'Random Forest Classifier': RandomForestClassifier(),
    # 'Support Vector Classifier': SVC(),
    # 'KNeighbors Classifier': KNeighborsClassifier(),
}

# perform k-fold cross validation

kfold = KFold(n_splits=5, shuffle=True, random_state=42)

for name, model in models.items():
    scores = cross_val_score(model, x, y, cv=kfold)
    accuracy = np.mean(scores)
    print("model:", name)
    print("Mean Accuracy:", accuracy)
    print()



model: Logistic Regression
Mean Accuracy: 0.9733333333333334

model: Decision Tree Classifier
Mean Accuracy: 0.9533333333333335

model: Random Forest Classifier
Mean Accuracy: 0.9600000000000002

CPU times: total: 2.48 s
Wall time: 3.3 s


- Only three models were evaluated here, and on the iris dataset, because our laptop does not have enough capacity: running them on the diamonds dataset takes too long.

# Hyperparameter Tuning

In [28]:
%%time
# Create a dictionaries of list of  classification models to evaluate performance with all hyperparametes
models = {
    'Logistic Regression': (LogisticRegression(), {}),
    'Decision Tree Classifier': (DecisionTreeClassifier(random_state=42), {'max_depth': [None, 5, 10]}),
    'Random Forest Classifier': (RandomForestClassifier(random_state=42), {'n_estimators': [10, 100]}),
    'Support Vector Classifier': (SVC(),{}),
    'KNeighbors Classifier': (KNeighborsClassifier(), {'n_neighbors': np.arange(3, 100, 2)}),
}

# train and predict each model with evaluation metrics as well making a for loop to iterate over the models
for name, (model, params) in models.items():
    # create a pipline
    pipeline = GridSearchCV(model, params, cv=3)

    # fit the pipeline
    pipeline.fit(x_train, y_train)

    # make predictions
    y_pred = pipeline.predict(x_test)

    # evaluate the model
    print(name)
    print('accuracy_score: ',accuracy_score(y_test,y_pred))
    print('precision_score: ',precision_score(y_test,y_pred,average='micro'))
    print('recall_score: ',recall_score(y_test,y_pred,average='micro'))
    print('f1_score: ',f1_score(y_test,y_pred,average='micro'))
    print('--------------------------------------\n')

Logistic Regression
accuracy_score:  1.0
precision_score:  1.0
recall_score:  1.0
f1_score:  1.0
--------------------------------------

Decision Tree Classifier
accuracy_score:  1.0
precision_score:  1.0
recall_score:  1.0
f1_score:  1.0
--------------------------------------

Random Forest Classifier
accuracy_score:  1.0
precision_score:  1.0
recall_score:  1.0
f1_score:  1.0
--------------------------------------

Support Vector Classifier
accuracy_score:  1.0
precision_score:  1.0
recall_score:  1.0
f1_score:  1.0
--------------------------------------

KNeighbors Classifier
accuracy_score:  1.0
precision_score:  1.0
recall_score:  1.0
f1_score:  1.0
--------------------------------------

CPU times: total: 5.45 s
Wall time: 6.69 s


In [29]:
# Save the fitted classifier into a pickle file
import pickle

# Persist the refit estimator held by the search object rather than the bare loop
# variable: GridSearchCV fits clones of the estimator it is given, not the instance
# that was passed in.
# The context manager guarantees the file handle is flushed and closed.
with open('../Project/classification model.pkl', 'wb') as model_file:
    pickle.dump(pipeline.best_estimator_, model_file)