<a href="https://colab.research.google.com/github/ashasmalik/MLSP24/blob/main/fa21_bse_120_Assign2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 25th March 2024
# CSC354 – Assignment2 – ML – Decision Trees
# Malik Ashas Abbas
# FA21-BSE-120
# classification task using decision trees.
# regression task using decision trees.


** Classification task using decision trees.**

In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

data = pd.read_csv("/content/datasaurus.csv")

print(data.head())

features = data.drop('dataset', axis=1)
labels = data['dataset']

X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=42)

tree_classifier = DecisionTreeClassifier(random_state=42)
forest_classifier = RandomForestClassifier(random_state=42)

tree_classifier.fit(X_train, y_train)
forest_classifier.fit(X_train, y_train)

tree_predictions = tree_classifier.predict(X_test)
forest_predictions = forest_classifier.predict(X_test)

tree_accuracy = accuracy_score(y_test, tree_predictions)
forest_accuracy = accuracy_score(y_test, forest_predictions)

print("Decision Tree Accuracy:", tree_accuracy)
print("Random Forest Accuracy:", forest_accuracy)

tree_param_grid = {
    'criterion': ['gini'],
    'max_depth': [None, 20],
    'min_samples_split': [2],
    'min_samples_leaf': [1]
}

tree_grid_search = GridSearchCV(DecisionTreeClassifier(random_state=42), tree_param_grid, cv=5)
tree_grid_search.fit(X_train, y_train)

print("Best parameters for Decision Tree:", tree_grid_search.best_params_)
print("Best score for Decision Tree:", tree_grid_search.best_score_)

forest_param_dist = {
    'n_estimators': [50],
    'max_features': ['auto'],
    'max_depth': [80] + [None],
    'min_samples_split': [2],
    'min_samples_leaf': [2],
    'bootstrap': [True]
}
forest_random_search = RandomizedSearchCV(RandomForestClassifier(random_state=42), forest_param_dist, n_iter=1, cv=5, random_state=42)
forest_random_search.fit(X_train, y_train)

print("Best parameters for Random Forest:", forest_random_search.best_params_)
print("Best score for Random Forest:", forest_random_search.best_score_)

  dataset        x        y
0    dino  55.3846  97.1795
1    dino  51.5385  96.0256
2    dino  46.1538  94.4872
3    dino  42.8205  91.4103
4    dino  40.7692  88.3333
Decision Tree Accuracy: 0.3972972972972973
Random Forest Accuracy: 0.46216216216216216
Best parameters for Decision Tree: {'criterion': 'gini', 'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 2}
Best score for Decision Tree: 0.4376775080164911


  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


Best parameters for Random Forest: {'n_estimators': 50, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_features': 'auto', 'max_depth': None, 'bootstrap': True}
Best score for Random Forest: 0.4742395785616124


**Regression task using decision trees.**


In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error

car_data = pd.read_csv("cars-dataset.csv")

print(car_data.head())

car_encoded = pd.get_dummies(car_data, drop_first=True)

features = car_encoded.drop('selling_price', axis=1)
target = car_encoded['selling_price']

X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

regressor = DecisionTreeRegressor(random_state=42)

regressor.fit(X_train, y_train)

y_pred = regressor.predict(X_test)

rmse = mean_squared_error(y_test, y_pred, squared=False)
print("RMSE:", rmse)

params = {
    'max_depth': [None, 10, 20, 30, 40, 50],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

search = GridSearchCV(DecisionTreeRegressor(random_state=42), params, cv=5)
search.fit(X_train, y_train)

print("Best parameters:", search.best_params_)
print("Best score:", search.best_score_)



   year  km_driven    fuel seller_type transmission         owner  \
0  2007      70000  Petrol  Individual       Manual   First Owner   
1  2007      50000  Petrol  Individual       Manual   First Owner   
2  2012     100000  Diesel  Individual       Manual   First Owner   
3  2017      46000  Petrol  Individual       Manual   First Owner   
4  2014     141000  Diesel  Individual       Manual  Second Owner   

   selling_price  
0          60000  
1         135000  
2         600000  
3         250000  
4         450000  
RMSE: 452514.74534208747
Best parameters: {'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 10}
Best score: 0.6641756916269681
