In [4]:
import numpy as np 
import pandas as pd 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier

In [5]:
# Load the SUV purchase dataset directly from GitHub (needs network access on a fresh run).
data = pd.read_csv("https://raw.githubusercontent.com/benvictoria17/MachineLearning/master/dataset/SUV%20Purchase%20Decision/SUV_Purchase.csv")

In [6]:
# Preview the loaded frame; the rendered output abbreviates it to the first and last rows.
data

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,Male,19,19000,0
1,15810944,Male,35,20000,0
2,15668575,Female,26,43000,0
3,15603246,Female,27,57000,0
4,15804002,Male,19,76000,0
...,...,...,...,...,...
395,15691863,Female,46,41000,1
396,15706071,Male,51,23000,1
397,15654296,Female,50,20000,1
398,15755018,Male,36,33000,0


In [7]:
# Summarise column dtypes and non-null counts to spot missing values before modelling.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   User ID          400 non-null    int64 
 1   Gender           400 non-null    object
 2   Age              400 non-null    int64 
 3   EstimatedSalary  400 non-null    int64 
 4   Purchased        400 non-null    int64 
dtypes: int64(4), object(1)
memory usage: 15.8+ KB


In [8]:
def _encode_gender(frame):
    """Return `frame` with 'Female' mapped to 0 and 'Male' mapped to 1 in every column.

    ``Series.map`` is used instead of ``DataFrame.replace`` so the result has a
    numeric dtype without relying on the implicit object-to-int downcasting
    that recent pandas versions deprecate. Labels outside the mapping become NaN.
    """
    codes = {'Female': 0, 'Male': 1}
    return frame.apply(lambda column: column.map(codes))


def predict_on_raw_data(df, target, task, random_state=1):
    """Fit a decision tree that predicts `target` from the remaining columns and score it.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data containing a 'User ID' column (dropped before modelling),
        the `target` column, and the feature columns.
    target : str
        Name of the column to predict.
    task : str
        'regression' selects ``DecisionTreeRegressor``; any other value selects
        ``DecisionTreeClassifier``.
    random_state : int, default 1
        Seed used for both the train/test split and the tree, so repeated calls
        return the same score.

    Returns
    -------
    float
        ``model.score`` on the held-out 30% of rows (accuracy for the
        classifier, R^2 for the regressor).
    """
    # `drop` returns a new frame, so the caller's DataFrame is never modified.
    y = df[target]
    X = df.drop(columns=['User ID', target])

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=0.7, shuffle=True, random_state=random_state
    )

    tree_cls = DecisionTreeRegressor if task == 'regression' else DecisionTreeClassifier
    # Seed the tree: scikit-learn permutes features at every split, so an
    # unseeded tree can give different scores on identical data.
    estimator = tree_cls(random_state=random_state)

    if 'Gender' in X.columns:
        # Gender is the only non-numeric feature; encode it and pass the rest through.
        preprocessor = ColumnTransformer(transformers=[
            ('binary', FunctionTransformer(_encode_gender), ['Gender'])
        ], remainder='passthrough')
        model = Pipeline(steps=[
            ('preprocessor', preprocessor),
            ('mod', estimator)
        ])
    else:
        # Gender is the target (or absent), so every remaining feature is already numeric.
        model = estimator

    model.fit(X_train, y_train)

    return model.score(X_test, y_test)

In [9]:
# Classification targets: the returned score is reported as an accuracy.
gender_acc = predict_on_raw_data(data, 'Gender', 'classification')
purchased_acc = predict_on_raw_data(data, 'Purchased', 'classification')

# Regression targets: the returned score is reported as R^2.
age_r2 = predict_on_raw_data(data, 'Age', 'regression')
salary_r2 = predict_on_raw_data(data, 'EstimatedSalary', 'regression')

# Emit the four-line report in one call; accuracies are shown as percentages.
print(
    "Target: Gender (Accuracy): {:.2f}%".format(gender_acc * 100),
    "Target: Purchased (Accuracy): {:.2f}%".format(purchased_acc * 100),
    "Target: Age (R^2): {:.4f}".format(age_r2),
    "Target: EstimatedSalary (R^2): {:.4f}".format(salary_r2),
    sep="\n",
)

Target: Gender (Accuracy): 47.50%
Target: Purchased (Accuracy): 83.33%
Target: Age (R^2): 0.3104
Target: EstimatedSalary (R^2): -0.3353
