<a href="https://colab.research.google.com/github/Yogi-Puvvala/Machine_Learning/blob/main/Decision_Trees.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Decision Trees (Classification)**

In [159]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import classification_report

In [160]:
df = pd.read_csv("drive/MyDrive/Colab_Projects/mushrooms.csv")

In [161]:
df.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [162]:
df.shape

(8124, 23)

In [163]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8124 entries, 0 to 8123
Data columns (total 23 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   class                     8124 non-null   object
 1   cap-shape                 8124 non-null   object
 2   cap-surface               8124 non-null   object
 3   cap-color                 8124 non-null   object
 4   bruises                   8124 non-null   object
 5   odor                      8124 non-null   object
 6   gill-attachment           8124 non-null   object
 7   gill-spacing              8124 non-null   object
 8   gill-size                 8124 non-null   object
 9   gill-color                8124 non-null   object
 10  stalk-shape               8124 non-null   object
 11  stalk-root                8124 non-null   object
 12  stalk-surface-above-ring  8124 non-null   object
 13  stalk-surface-below-ring  8124 non-null   object
 14  stalk-color-above-ring  

In [164]:
df.isna().sum()

Unnamed: 0,0
class,0
cap-shape,0
cap-surface,0
cap-color,0
bruises,0
odor,0
gill-attachment,0
gill-spacing,0
gill-size,0
gill-color,0


In [165]:
target = [col for col in df.columns if any(df[col].str.contains("p")) or any(df[col].str.contains("e"))]
target

['class',
 'cap-color',
 'odor',
 'gill-color',
 'stalk-shape',
 'stalk-root',
 'stalk-color-above-ring',
 'stalk-color-below-ring',
 'veil-type',
 'ring-type',
 'habitat']

In [166]:
top_target = [col for col in target if any(df[col].str.contains("p")) and any(df[col].str.contains("e"))]
top_target

['class',
 'cap-color',
 'gill-color',
 'stalk-color-above-ring',
 'stalk-color-below-ring',
 'ring-type']

In [167]:
pipeline = Pipeline([
    ("Nominal", OneHotEncoder()),
    ("model", DecisionTreeClassifier())
])

In [168]:
X = df.drop("class", axis = 1)
y = df["class"]

In [169]:
X_train, X_test, y_train, y_test = train_test_split(X, y ,test_size=0.2, random_state=42)

In [170]:
pipeline.fit(X_train, y_train)

In [171]:
print("Training Score:", pipeline.score(X_train, y_train))
print("Testing Score:", pipeline.score(X_test, y_test))

Training Score: 1.0
Testing Score: 1.0


In [172]:
print(classification_report(y_test, pipeline.predict(X_test)))

              precision    recall  f1-score   support

           e       1.00      1.00      1.00       843
           p       1.00      1.00      1.00       782

    accuracy                           1.00      1625
   macro avg       1.00      1.00      1.00      1625
weighted avg       1.00      1.00      1.00      1625



**♦ Model Trained Perfetly**

# **Decision Tree (Regression)**

In [173]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import classification_report

In [174]:
df = pd.read_csv("drive/MyDrive/Colab_Projects/houses.csv")
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [175]:
df.shape

(1460, 81)

In [176]:
col_na = [col for col in df.columns if df[col].isna().sum() > 0]
col_na

['LotFrontage',
 'Alley',
 'MasVnrType',
 'MasVnrArea',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinType2',
 'Electrical',
 'FireplaceQu',
 'GarageType',
 'GarageYrBlt',
 'GarageFinish',
 'GarageQual',
 'GarageCond',
 'PoolQC',
 'Fence',
 'MiscFeature']

In [177]:
for col in col_na:
    df[col] = df[col].fillna(df[col].mode()[0])

In [178]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1460 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          1460 non-null   object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [179]:
X = df.drop("SalePrice", axis=1)
y = df["SalePrice"]

In [180]:
numerical_cols = [col for col in X.columns if (df[col].dtype == "int64" or df[col].dtype == "float64")]
categorical_cols = [col for col in X.columns if df[col].dtype == "O"]

In [181]:
preprocessor = ColumnTransformer([
    ("num", "passthrough", numerical_cols),
    ("cat", OneHotEncoder(sparse_output=False, handle_unknown="ignore"), categorical_cols)
])

In [182]:
pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("model", DecisionTreeRegressor())
])

In [183]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 42)

In [184]:
pipeline.fit(X_train, y_train)

In [185]:
print("Training Score:", pipeline.score(X_train, y_train))
print("Testing Score:", pipeline.score(X_test, y_test))

Training Score: 1.0
Testing Score: 0.7714522261178229


In [186]:
param_grid = {
    'model__max_depth': [5, 10, 20, None],
    'model__min_samples_split': [2, 5, 10],
    'model__min_samples_leaf': [1, 2, 5],
    'model__max_features': [None, 'sqrt', 'log2']
}

gscv_dtr = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=5,
    scoring='r2',
    n_jobs=-1,
    verbose=2
)

gscv_dtr.fit(X_train, y_train)

Fitting 5 folds for each of 108 candidates, totalling 540 fits


In [187]:
print("Training Score:", gscv_dtr.score(X_train, y_train))
print("Testing Score:", gscv_dtr.score(X_test, y_test))
gscv_dtr.best_params_

Training Score: 0.9274173551134525
Testing Score: 0.740350527498252


{'model__max_depth': 10,
 'model__max_features': None,
 'model__min_samples_leaf': 5,
 'model__min_samples_split': 5}