# Creating a decision tree 

In [10]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn import tree

In [11]:
# Read the customer/order/product CSV file into a DataFrame.
# The path uses forward slashes: the original backslash form contained the
# invalid escape sequences "\P" and "\c" and only resolved on Windows.
sales = pd.read_csv("../Python/customer_order_product.csv")
sales

Unnamed: 0,customer_id_x,customer_name,gender,age,home_address,zip_code,city,state,country,order_id,...,product_type,product_name,size,colour,price,quantity,sales,year_order,month_order,day_order
0,1,Leanna Busson,Female,30,8606 Victoria TerraceSuite 560,5464,Johnstonhaven,Northern Territory,Australia,1,...,Shirt,Oxford Cloth,XS,red,114,66,7524,2021,8,30
1,2,Zabrina Harrowsmith,Genderfluid,69,8327 Kirlin SummitApt. 461,8223,New Zacharyfort,South Australia,Australia,2,...,Shirt,Oxford Cloth,S,red,114,53,6042,2021,2,3
2,3,Shina Dullaghan,Polygender,59,269 Gemma SummitSuite 109,5661,Aliburgh,Australian Capital Territory,Australia,3,...,Shirt,Oxford Cloth,M,red,114,54,6156,2021,10,8
3,4,Hewet McVitie,Bigender,67,743 Bailey GroveSuite 141,1729,South Justinhaven,Queensland,Australia,4,...,Shirt,Oxford Cloth,L,red,114,69,7866,2021,5,6
4,5,Rubia Ashleigh,Polygender,30,48 Hyatt ManorSuite 375,4032,Griffithsshire,Queensland,Australia,5,...,Shirt,Oxford Cloth,XL,red,114,47,5358,2021,3,23
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,996,Elvira Sarfati,Agender,59,0433 Armstrong HillSuite 974,7613,Lake Danielland,Tasmania,Australia,996,...,Trousers,Wool,XS,green,111,73,8103,2021,1,13
996,997,Dickie Grushin,Non-binary,30,04 Howell PassSuite 209,6950,Ellaborough,Tasmania,Australia,997,...,Trousers,Wool,S,green,111,43,4773,2021,1,18
997,998,Rebecka Fabler,Polygender,32,72 Annabelle PassApt. 446,52,Kohlerberg,Queensland,Australia,998,...,Trousers,Wool,M,green,111,41,4551,2021,5,5
998,999,Carita Vynarde,Polygender,30,170 Wilson AvenueApt. 577,7849,East Oscarfurt,Western Australia,Australia,999,...,Trousers,Wool,L,green,111,42,4662,2021,6,15


In [12]:

# Gender labels that are collapsed into the single category 'Other';
# labels that are not listed here are left unchanged by replace()
replace_dict = dict.fromkeys(
    ['Non-binary', 'Polygender', 'Agender', 'Bigender', 'Genderfluid', 'Genderqueer'],
    'Other',
)
# Apply the mapping to the 'gender' column
sales['gender'] = sales['gender'].replace(to_replace=replace_dict)
# Show the resulting DataFrame
print("Updated DataFrame:")
print(sales)

Updated DataFrame:
     customer_id_x        customer_name  gender  age  \
0                1        Leanna Busson  Female   30   
1                2  Zabrina Harrowsmith   Other   69   
2                3      Shina Dullaghan   Other   59   
3                4        Hewet McVitie   Other   67   
4                5       Rubia Ashleigh   Other   30   
..             ...                  ...     ...  ...   
995            996       Elvira Sarfati   Other   59   
996            997       Dickie Grushin   Other   30   
997            998       Rebecka Fabler   Other   32   
998            999       Carita Vynarde   Other   30   
999           1000     Mandel Fairbanks    Male   71   

                       home_address  zip_code                city  \
0    8606 Victoria TerraceSuite 560      5464       Johnstonhaven   
1        8327 Kirlin SummitApt. 461      8223     New Zacharyfort   
2         269 Gemma SummitSuite 109      5661            Aliburgh   
3         743 Bailey GroveSuite 

In [13]:

# Encode the 'gender' column as integers: factorize() assigns one integer code
# per distinct value; only the codes (element 0 of its result) are kept, the
# list of original labels is discarded
gender_codes = pd.factorize(sales['gender'])[0]
sales['gender'] = gender_codes

print(sales)



     customer_id_x        customer_name  gender  age  \
0                1        Leanna Busson       0   30   
1                2  Zabrina Harrowsmith       1   69   
2                3      Shina Dullaghan       1   59   
3                4        Hewet McVitie       1   67   
4                5       Rubia Ashleigh       1   30   
..             ...                  ...     ...  ...   
995            996       Elvira Sarfati       1   59   
996            997       Dickie Grushin       1   30   
997            998       Rebecka Fabler       1   32   
998            999       Carita Vynarde       1   30   
999           1000     Mandel Fairbanks       2   71   

                       home_address  zip_code                city  \
0    8606 Victoria TerraceSuite 560      5464       Johnstonhaven   
1        8327 Kirlin SummitApt. 461      8223     New Zacharyfort   
2         269 Gemma SummitSuite 109      5661            Aliburgh   
3         743 Bailey GroveSuite 141      1729   Sou

In [14]:
# Encode the 'state' column as integers: factorize() assigns one integer code
# per distinct value; only the codes (element 0 of its result) are kept, the
# list of original labels is discarded
state_codes = pd.factorize(sales['state'])[0]
sales['state'] = state_codes

print(sales)


     customer_id_x        customer_name  gender  age  \
0                1        Leanna Busson       0   30   
1                2  Zabrina Harrowsmith       1   69   
2                3      Shina Dullaghan       1   59   
3                4        Hewet McVitie       1   67   
4                5       Rubia Ashleigh       1   30   
..             ...                  ...     ...  ...   
995            996       Elvira Sarfati       1   59   
996            997       Dickie Grushin       1   30   
997            998       Rebecka Fabler       1   32   
998            999       Carita Vynarde       1   30   
999           1000     Mandel Fairbanks       2   71   

                       home_address  zip_code                city  state  \
0    8606 Victoria TerraceSuite 560      5464       Johnstonhaven      0   
1        8327 Kirlin SummitApt. 461      8223     New Zacharyfort      1   
2         269 Gemma SummitSuite 109      5661            Aliburgh      2   
3         743 Bailey Gr

In [15]:
# Create the input (feature) set.
# The three feature columns are selected explicitly instead of dropping every
# other column by name: the feature order is then fixed and visible here, and
# a column added to the CSV later cannot silently end up among the features.
X = sales[['gender', 'age', 'state']]
X


Unnamed: 0,gender,age,state
0,0,30,0
1,1,69,1
2,1,59,2
3,1,67,3
4,1,30,3
...,...,...,...
995,1,59,7
996,1,30,7
997,1,32,3
998,1,30,5


In [16]:
# Create the output (target) set: the 'product_type' value of every row
y = sales.loc[:, 'product_type']
y

0         Shirt
1         Shirt
2         Shirt
3         Shirt
4         Shirt
         ...   
995    Trousers
996    Trousers
997    Trousers
998    Trousers
999    Trousers
Name: product_type, Length: 1000, dtype: object

In [17]:
# Fixed seed so that the split, the fitted tree and the reported accuracy are
# identical on every fresh run of the notebook
RANDOM_STATE = 42

# Split dataset into training (80%) and testing (20%)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE
)

# Create and train the model, then predict the held-out test rows
model = DecisionTreeClassifier(random_state=RANDOM_STATE)
model.fit(X_train, y_train)
predictions = model.predict(X_test)

# Measure accuracy of the model on the test rows
score = accuracy_score(y_test, predictions)
score

0.455

In [19]:
# Export the fitted tree to a Graphviz .dot file.
# feature_names must follow the column order the model was trained on, so it
# is read from X itself (gender, age, state) instead of being typed by hand;
# the hand-written list ['age', 'gender', 'state'] had the first two labels
# swapped, which mislabelled the splits in the exported tree.
# class_names is taken from model.classes_, the class order the fitted model
# itself uses.
tree.export_graphviz(model, out_file='product-recommender1.dot',
                    feature_names=list(X.columns),
                    class_names=[str(label) for label in model.classes_],
                    label='all',
                    rounded=True,
                    filled=True)

In [9]:
# Open the decision tree .dot file with VS Code
# The accuracy would be higher if we had more data