## Problem1

1.A cloth manufacturing company is interested to know about the different attributes contributing to high sales. Build a decision tree & random forest model with Sales as target variable (first convert it into categorical variable).

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

data = pd.read_csv('Company_Data.csv')
data.isnull().sum()

Sales          0
CompPrice      0
Income         0
Advertising    0
Population     0
Price          0
ShelveLoc      0
Age            0
Education      0
Urban          0
US             0
dtype: int64

In [2]:
data.head()

Unnamed: 0,Sales,CompPrice,Income,Advertising,Population,Price,ShelveLoc,Age,Education,Urban,US
0,9.5,138,73,11,276,120,Bad,42,17,Yes,Yes
1,11.22,111,48,16,260,83,Good,65,10,Yes,Yes
2,10.06,113,35,10,269,80,Medium,59,12,Yes,Yes
3,7.4,117,100,4,466,97,Medium,55,14,Yes,Yes
4,4.15,141,64,3,340,128,Bad,38,13,Yes,No


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Sales        400 non-null    float64
 1   CompPrice    400 non-null    int64  
 2   Income       400 non-null    int64  
 3   Advertising  400 non-null    int64  
 4   Population   400 non-null    int64  
 5   Price        400 non-null    int64  
 6   ShelveLoc    400 non-null    object 
 7   Age          400 non-null    int64  
 8   Education    400 non-null    int64  
 9   Urban        400 non-null    object 
 10  US           400 non-null    object 
dtypes: float64(1), int64(7), object(3)
memory usage: 34.5+ KB


In [12]:
data.shape

(400, 11)

In [9]:
# We need to discretize target Sales
pd.cut(data.Sales, bins=3, labels=False).value_counts()

1    247
0    102
2     51
Name: Sales, dtype: int64

In [13]:
data['Sales'] = pd.cut(data.Sales, bins=3, labels=False)

In [14]:
data.head()

Unnamed: 0,Sales,CompPrice,Income,Advertising,Population,Price,ShelveLoc,Age,Education,Urban,US
0,1,138,73,11,276,120,Bad,42,17,Yes,Yes
1,2,111,48,16,260,83,Good,65,10,Yes,Yes
2,1,113,35,10,269,80,Medium,59,12,Yes,Yes
3,1,117,100,4,466,97,Medium,55,14,Yes,Yes
4,0,141,64,3,340,128,Bad,38,13,Yes,No


In [15]:
# We need to encode Urban, US and ShelveLoc
lb = LabelEncoder()
data["Urban"] = lb.fit_transform(data["Urban"])
data["US"] = lb.fit_transform(data["US"])

In [18]:
data["ShelveLoc"] = lb.fit_transform(data["ShelveLoc"])

In [19]:
data.head()

Unnamed: 0,Sales,CompPrice,Income,Advertising,Population,Price,ShelveLoc,Age,Education,Urban,US
0,1,138,73,11,276,120,0,42,17,1,1
1,2,111,48,16,260,83,1,65,10,1,1
2,1,113,35,10,269,80,2,59,12,1,1
3,1,117,100,4,466,97,2,55,14,1,1
4,0,141,64,3,340,128,0,38,13,1,0


In [20]:
data.dtypes

Sales          int64
CompPrice      int64
Income         int64
Advertising    int64
Population     int64
Price          int64
ShelveLoc      int32
Age            int64
Education      int64
Urban          int32
US             int32
dtype: object

In [24]:
# no zero variance
data.var()

Sales              0.367162
CompPrice        235.147243
Income           783.218239
Advertising       44.227343
Population     21719.813935
Price            560.584436
ShelveLoc          0.694680
Age              262.449618
Education          6.867168
Urban              0.208496
US                 0.229549
dtype: float64

In [25]:
colnames = list(data.columns)
predictors = colnames[1:11]
target = colnames[0]

In [26]:
predictors

['CompPrice',
 'Income',
 'Advertising',
 'Population',
 'Price',
 'ShelveLoc',
 'Age',
 'Education',
 'Urban',
 'US']

In [27]:
target

'Sales'

In [28]:
# Splitting data into training and testing data set
from sklearn.model_selection import train_test_split
train, test = train_test_split(data, test_size = 0.3)

In [51]:
from sklearn.tree import DecisionTreeClassifier as DT
model = DT(criterion = 'entropy')
model.fit(train[predictors], train[target])

DecisionTreeClassifier(criterion='entropy')

In [52]:
# Prediction on Test Data
preds = model.predict(test[predictors])
pd.crosstab(test[target], preds, rownames=['Actual'], colnames=['Predictions'])


Predictions,0,1,2
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,19,21,0
1,10,50,7
2,0,9,4


In [53]:
np.mean(preds == test[target]) # Test Data Accuracy 

0.6083333333333333

In [54]:
# Print the confusion matrix (alternate way)
from sklearn import metrics
metrics.confusion_matrix(test[target], preds)

array([[19, 21,  0],
       [10, 50,  7],
       [ 0,  9,  4]], dtype=int64)

In [55]:
# Print the precision and recall, for all 3 classes
print(metrics.classification_report(test[target], preds, digits=3))

              precision    recall  f1-score   support

           0      0.655     0.475     0.551        40
           1      0.625     0.746     0.680        67
           2      0.364     0.308     0.333        13

    accuracy                          0.608       120
   macro avg      0.548     0.510     0.521       120
weighted avg      0.607     0.608     0.600       120



In [56]:
# Prediction on Train Data
preds = model.predict(train[predictors])
pd.crosstab(train[target], preds, rownames = ['Actual'], colnames = ['Predictions'])

Predictions,0,1,2
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,62,0,0
1,0,180,0
2,0,0,38


In [57]:
np.mean(preds == train[target]) # Train Data Accuracy

1.0

In [58]:
# Print the confusion matrix (alternate way)
from sklearn import metrics
metrics.confusion_matrix(train[target], preds)

array([[ 62,   0,   0],
       [  0, 180,   0],
       [  0,   0,  38]], dtype=int64)

In [59]:
# Print the precision and recall, for all 3 classes
print(metrics.classification_report(train[target], preds, digits=3))

              precision    recall  f1-score   support

           0      1.000     1.000     1.000        62
           1      1.000     1.000     1.000       180
           2      1.000     1.000     1.000        38

    accuracy                          1.000       280
   macro avg      1.000     1.000     1.000       280
weighted avg      1.000     1.000     1.000       280



In [60]:
# Building  Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(criterion = 'entropy', random_state = 42)
rfc.fit(train[predictors], train[target])

RandomForestClassifier(criterion='entropy', random_state=42)

In [62]:
# Evaluating on Training set
rfc_pred_train = rfc.predict(train[predictors])
metrics.confusion_matrix(train[target], rfc_pred_train)

array([[ 62,   0,   0],
       [  0, 180,   0],
       [  0,   0,  38]], dtype=int64)

In [63]:
# Print the precision and recall and f1 score, for all 3 classes
print(metrics.classification_report(train[target], rfc_pred_train, digits=3))

              precision    recall  f1-score   support

           0      1.000     1.000     1.000        62
           1      1.000     1.000     1.000       180
           2      1.000     1.000     1.000        38

    accuracy                          1.000       280
   macro avg      1.000     1.000     1.000       280
weighted avg      1.000     1.000     1.000       280



In [64]:
# Evaluating on Test set
rfc_pred_test = rfc.predict(test[predictors])
metrics.confusion_matrix(test[target], rfc_pred_test)

array([[16, 24,  0],
       [ 7, 58,  2],
       [ 0, 11,  2]], dtype=int64)

In [65]:
# Print the precision and recall and f1 score, for all 3 classes
print(metrics.classification_report(test[target], rfc_pred_test, digits=3))

              precision    recall  f1-score   support

           0      0.696     0.400     0.508        40
           1      0.624     0.866     0.725        67
           2      0.500     0.154     0.235        13

    accuracy                          0.633       120
   macro avg      0.606     0.473     0.489       120
weighted avg      0.634     0.633     0.600       120

