# Importing the libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Importing the dataset

In [2]:
dataset = pd.read_csv('customers.csv')
Features = dataset.iloc[:, :-1].values
Segmentations = dataset.iloc[:, -1].values

## Taking care of missing data

In [3]:
from sklearn.impute import SimpleImputer

# nan for string variables
nan_imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
Features[:, 0:2] = nan_imputer.fit_transform(Features[:, 0:2])
Features[:, 3:5] = nan_imputer.fit_transform(Features[:, 3:5])
Features[:, 6:7] = nan_imputer.fit_transform(Features[:, 6:7])

# nan for a numerical variables
nan_num_imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
Features[:, 2:3] = nan_num_imputer.fit_transform(Features[:, 2:3])
Features[:, 5:6] = nan_num_imputer.fit_transform(Features[:, 5:6])
Features[:, 7:8] = nan_num_imputer.fit_transform(Features[:, 7:8])

## Encoding categorical data

### Encoding the Dependent Variable (Gender)

In [4]:
from sklearn.preprocessing import LabelEncoder
labelEncoder = LabelEncoder()
Features[:,0]= labelEncoder.fit_transform(Features[:,0])

### Encoding the Dependent Variable (Ever_Married)

In [5]:
from sklearn.preprocessing import LabelEncoder
labelEncoder = LabelEncoder()
Features[:,1]= labelEncoder.fit_transform(Features[:,1])

### Encoding the Dependent Variable (Graduated)

In [6]:
labelEncoder = LabelEncoder()
Features[:,3]= labelEncoder.fit_transform(Features[:,3])

### Encoding the Dependent Variable(Spending_Score)

In [7]:
Spending_Score_Map = {'Low': 0, 'Average': 1, 'High': 2}
for observation in Features:
    observation[6] = Spending_Score_Map[observation[6]]


### Encoding the Dependent Variable(Segmentation)

In [77]:
# labelEncoder = LabelEncoder()
# Segmentations= labelEncoder.fit_transform(Segmentations)

a=[Segmentations]
print(xa[0])
columnTransformer = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [0])], remainder='passthrough')
xa = np.array(columnTransformer.fit_transform(a))
print(xa[0])

[1.0 'A' 'B' ... 'D' 'B' 'B']


### Encoding the Independent Variable (Profession)

In [9]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
columnTransformer = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [4])], remainder='passthrough')
Features = np.array(columnTransformer.fit_transform(Features))

## Feature Scaling (Age,Work_Experience)

In [10]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
Features[:, 11:12] = sc.fit_transform(Features[:, 11:12])
Features[:, 14:15] = sc.fit_transform(Features[:, 14:15])
Features[:, 15:16] = sc.fit_transform(Features[:, 15:16])

## Splitting the dataset into the Training set and Test set

In [11]:
from sklearn.model_selection import train_test_split
Features_train, Features_test, Segmentations_train, Segmentations_test = train_test_split(Features, Segmentations, test_size = 0.3, random_state = 1)

## Training the Decision Tree Classification model on the Training set

In [12]:
from sklearn.tree import DecisionTreeClassifier
decisionTreeClassifier = DecisionTreeClassifier( random_state = 2,max_depth=5)
decisionTreeClassifier.fit(Features_train, Segmentations_train)
importance = decisionTreeClassifier.feature_importances_
max=-1
maxIndex=-1
print(importance)
for i,v in enumerate(importance):
    if(max<v):
        max=v
        maxIndex=i
print(maxIndex)


[0.16070249 0.         0.         0.         0.         0.05078912
 0.         0.         0.02044964 0.         0.         0.52876895
 0.02662457 0.00072039 0.16821208 0.04373276]
11


## Predicting the Test set results

In [13]:
Segmentations_pred_train = decisionTreeClassifier.predict(Features_train)
Segmentations_pred_test = decisionTreeClassifier.predict(Features_test)

## Accuracy Score for Decision Tree Training 

In [14]:
from sklearn.metrics import accuracy_score
accuracy_score(Segmentations_train,Segmentations_pred_train)

0.5280680007083407

## Features train classification data report

In [15]:
from sklearn.metrics import classification_report
print("features train classification data report\n")
r1 = pd.DataFrame(classification_report(Segmentations_train,Segmentations_pred_train, output_dict=True))
print(r1.to_markdown(tablefmt='grid', floatfmt='.2f'))

features train classification data report

+-----------+---------+---------+---------+---------+------------+-------------+----------------+
|           |       A |       B |       C |       D |   accuracy |   macro avg |   weighted avg |
| precision |    0.43 |    0.44 |    0.56 |    0.66 |       0.53 |        0.52 |           0.53 |
+-----------+---------+---------+---------+---------+------------+-------------+----------------+
| recall    |    0.51 |    0.37 |    0.48 |    0.71 |       0.53 |        0.52 |           0.53 |
+-----------+---------+---------+---------+---------+------------+-------------+----------------+
| f1-score  |    0.47 |    0.40 |    0.52 |    0.69 |       0.53 |        0.52 |           0.53 |
+-----------+---------+---------+---------+---------+------------+-------------+----------------+
| support   | 1378.00 | 1314.00 | 1363.00 | 1592.00 |       0.53 |     5647.00 |        5647.00 |
+-----------+---------+---------+---------+---------+------------+---------

## Accuracy Score for Test 

In [16]:
from sklearn.metrics import accuracy_score
accuracy_score(Segmentations_test,Segmentations_pred_test)

0.5229244114002478

## Features test classification data report

In [17]:
print("features test classification data report\n")
r1 = pd.DataFrame(classification_report(Segmentations_test,Segmentations_pred_test, output_dict=True))
print(r1.to_markdown(tablefmt='grid', floatfmt='.2f'))

features test classification data report

+-----------+--------+--------+--------+--------+------------+-------------+----------------+
|           |      A |      B |      C |      D |   accuracy |   macro avg |   weighted avg |
| precision |   0.45 |   0.38 |   0.59 |   0.64 |       0.52 |        0.51 |           0.52 |
+-----------+--------+--------+--------+--------+------------+-------------+----------------+
| recall    |   0.53 |   0.34 |   0.50 |   0.69 |       0.52 |        0.51 |           0.52 |
+-----------+--------+--------+--------+--------+------------+-------------+----------------+
| f1-score  |   0.49 |   0.36 |   0.54 |   0.66 |       0.52 |        0.51 |           0.52 |
+-----------+--------+--------+--------+--------+------------+-------------+----------------+
| support   | 594.00 | 544.00 | 607.00 | 676.00 |       0.52 |     2421.00 |        2421.00 |
+-----------+--------+--------+--------+--------+------------+-------------+----------------+


## Training the Random Forest Classification model on the Training set

In [18]:
from sklearn.ensemble import RandomForestClassifier
randomForestClassifier = RandomForestClassifier(random_state=5,max_depth=7,n_estimators=9)
randomForestClassifier.fit(Features_train, Segmentations_train)
importance = randomForestClassifier.feature_importances_
max=-1
maxIndex=-1
for i,v in enumerate(importance):
    if(max<v):
        max=v
        maxIndex=i
print(maxIndex)

11


## Predicting the Decision Tree Test set results

In [19]:
Segmentations_pred_train = randomForestClassifier.predict(Features_train)
Segmentations_pred_test = randomForestClassifier.predict(Features_test)

## Making the Confusion Matrix

In [20]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(Segmentations_test, Segmentations_pred_test)

## Accuracy Score for Random Forest Train 

In [21]:
from sklearn.metrics import accuracy_score
accuracy_score(Segmentations_train,Segmentations_pred_train)

0.5705684434212857

## Features train classification data report

In [22]:
print("features train classification data report\n")
r1 = pd.DataFrame(classification_report(Segmentations_train,Segmentations_pred_train, output_dict=True))
print(r1.to_markdown(tablefmt='grid', floatfmt='.2f'))

features train classification data report

+-----------+---------+---------+---------+---------+------------+-------------+----------------+
|           |       A |       B |       C |       D |   accuracy |   macro avg |   weighted avg |
| precision |    0.49 |    0.48 |    0.59 |    0.68 |       0.57 |        0.56 |           0.57 |
+-----------+---------+---------+---------+---------+------------+-------------+----------------+
| recall    |    0.53 |    0.43 |    0.55 |    0.74 |       0.57 |        0.56 |           0.57 |
+-----------+---------+---------+---------+---------+------------+-------------+----------------+
| f1-score  |    0.51 |    0.46 |    0.57 |    0.71 |       0.57 |        0.56 |           0.57 |
+-----------+---------+---------+---------+---------+------------+-------------+----------------+
| support   | 1378.00 | 1314.00 | 1363.00 | 1592.00 |       0.57 |     5647.00 |        5647.00 |
+-----------+---------+---------+---------+---------+------------+---------

## Accuracy Score for Random Forest Test 

In [23]:
accuracy_score(Segmentations_test,Segmentations_pred_test)

0.5245766212308963

## Features test calssification data report

In [24]:
print("features test classification data report\n")
r1 = pd.DataFrame(classification_report(Segmentations_test,Segmentations_pred_test, output_dict=True))
print(r1.to_markdown(tablefmt='grid', floatfmt='.2f'))

features test classification data report

+-----------+--------+--------+--------+--------+------------+-------------+----------------+
|           |      A |      B |      C |      D |   accuracy |   macro avg |   weighted avg |
| precision |   0.45 |   0.40 |   0.58 |   0.63 |       0.52 |        0.51 |           0.52 |
+-----------+--------+--------+--------+--------+------------+-------------+----------------+
| recall    |   0.48 |   0.35 |   0.55 |   0.69 |       0.52 |        0.52 |           0.52 |
+-----------+--------+--------+--------+--------+------------+-------------+----------------+
| f1-score  |   0.46 |   0.37 |   0.56 |   0.66 |       0.52 |        0.51 |           0.52 |
+-----------+--------+--------+--------+--------+------------+-------------+----------------+
| support   | 594.00 | 544.00 | 607.00 | 676.00 |       0.52 |     2421.00 |        2421.00 |
+-----------+--------+--------+--------+--------+------------+-------------+----------------+


## Importing the new customers

In [25]:
dataset = pd.read_csv('new_customers.csv')
newFeatures = dataset.iloc[:, :].values

## Taking care of missing data

In [26]:
# nan for string variables
nan_imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
newFeatures[:, 0:2] = nan_imputer.fit_transform(newFeatures[:, 0:2])
newFeatures[:, 3:5] = nan_imputer.fit_transform(newFeatures[:, 3:5])
newFeatures[:, 6:7] = nan_imputer.fit_transform(newFeatures[:, 6:7])

# nan for a numerical variables
nan_num_imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
newFeatures[:, 2:3] = nan_num_imputer.fit_transform(newFeatures[:, 2:3])
newFeatures[:, 5:6] = nan_num_imputer.fit_transform(newFeatures[:, 5:6])
newFeatures[:, 7:8] = nan_num_imputer.fit_transform(newFeatures[:, 7:8])

## Encoding categorical data

### Encoding the Dependent Variable (Gender)

In [27]:
labelEncoder = LabelEncoder()
newFeatures[:,0]= labelEncoder.fit_transform(newFeatures[:,0])

### Encoding the Dependent Variable (Ever_Married)

In [28]:
labelEncoder = LabelEncoder()
newFeatures[:,1]= labelEncoder.fit_transform(newFeatures[:,1])


### Encoding the Dependent Variable (Graduated)

In [29]:
labelEncoder = LabelEncoder()
newFeatures[:,3]= labelEncoder.fit_transform(newFeatures[:,3])


### Encoding the Dependent Variable(Spending_Score)

In [30]:
for observation in newFeatures:
    observation[6] = Spending_Score_Map[observation[6]]
labelEncoder = LabelEncoder()
newFeatures[:,6]= labelEncoder.fit_transform(newFeatures[:,6])

### Encoding the Independent Variable (Profession)

In [31]:
columnTransformer = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [4])], remainder='passthrough')
newFeatures = np.array(columnTransformer.fit_transform(newFeatures))

<h3>Feature Scaling</h3>

In [32]:
sc = StandardScaler()
newFeatures[:, 11:12] = sc.fit_transform(newFeatures[:, 11:12])
newFeatures[:, 14:15] = sc.fit_transform(newFeatures[:, 14:15])
newFeatures[:, 15:16] = sc.fit_transform(newFeatures[:, 15:16])

## Predicting the newCustomer set results Using Decision Tree Classifiers

In [33]:
newSegmentations_pred = decisionTreeClassifier.predict(newFeatures)

data_distributed = {'A':0,'B':0,'C':0,'D':0} 
for i in newSegmentations_pred: 
    if(i==0): 
        data_distributed['A'] +=1 
    if(i==1): 
        data_distributed['B'] +=1  
    if(i==2): 
        data_distributed['C'] +=1  
    if(i==3): 
        data_distributed['D'] +=1  
print(data_distributed)

{'A': 0, 'B': 0, 'C': 0, 'D': 0}


## Predicting the newCustomer set results Using Random Forest classifiers

In [34]:
newSegmentations_pred = randomForestClassifier.predict(newFeatures)
data_distributed = {'A':0,'B':0,'C':0,'D':0} 
for i in newSegmentations_pred: 
    if(i==0): 
        data_distributed['A'] +=1 
    if(i==1): 
        data_distributed['B'] +=1  
    if(i==2): 
        data_distributed['C'] +=1  
    if(i==3): 
        data_distributed['D'] +=1  
print(data_distributed)

{'A': 0, 'B': 0, 'C': 0, 'D': 0}
