# The Dollar Tree Website

## Multi-class classification

The data set is about the sales of items on the Dollar Tree Website. 
The task is to train one (1) classification algorithm (K-Nearest Neighbours, Stochastic Gradient Descent, Naïve Bayes, etc.) to predict the product category based on the other features. 

You may drop whatever columns you feel are not important for this training. Note that you will have to justify this during your presentation.

- The project focuses on the data preparation, modeling, and evaluation phases of CRISP-DM.

In [None]:
# Import the necessary libraries
import numpy as np
import pandas as pd
import datetime
import matplotlib.pyplot as plt
import seaborn as sns

# Import warnings
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

In [None]:
# Load the data: read the Dollar Tree sales CSV into a DataFrame.
# NOTE(review): the path is absolute and tied to one Windows user profile, so
# it will not resolve on another machine until it is adjusted.
data = pd.read_csv(r"C:\Users\nanay\Downloads\dollar_tree-231012-200322.csv")
# Bare expression so the notebook renders the frame as the cell output.
data

## 1. Data Understanding and Preparation

In [None]:
# Check the info: print the column names, non-null counts and dtypes
data.info()

In [None]:
# Convert date to datetime type: replace the 'Date' column with its parsed version
data['Date'] = pd.to_datetime(data['Date'])

# Check the dtype: re-run info() to confirm the conversion of 'Date'
data.info()

In [None]:
# Print the distinct values of every column, one numbered section per column
for position, name in enumerate(data.columns, start=1):
    distinct_values = data[name].unique()
    print(f'Column-{position}: {name}\n', distinct_values)
    print('=' * 70)

In [None]:
# Check for missing values: count the missing entries in each column
data.isna().sum()

In [None]:
# Remove the two columns that are not needed for the analysis
data = data.drop(columns=['index', 'Column1'])

data

In [None]:
# Discard every row that contains at least one missing value
data = data.dropna(axis=0, how='any')
data

In [None]:
# Check for duplicated values: count the rows that repeat an earlier row
data.duplicated().sum()

In [None]:
# Display the rows that are flagged as duplicates
data[data.duplicated()]

In [None]:
# Keep only the first occurrence of each duplicated row
data = data.drop_duplicates()


In [None]:
# Summary statistics of the numerical columns, one row per column
data.describe().transpose()

In [None]:
# Summary statistics of the object-dtype columns, one row per column
data.describe(include='object').transpose()

In [None]:
# Collect the names of the numeric columns
numeric_columns = data.select_dtypes(include='number').columns
numeric_columns

In [None]:
# Pairwise Pearson correlation between the numeric columns
correlation = data[numeric_columns].corr(method='pearson')
correlation

In [None]:
# Draw the correlation matrix as an annotated heatmap
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(correlation, annot=True, fmt='.2f', cmap='coolwarm', ax=ax)
ax.set_title('Heatmap of Numeric Features', fontsize=17)
plt.show()

In [None]:
# Horizontal box plots of the frame's features on a single axis
fig, ax = plt.subplots(figsize=(14, 5))
sns.boxplot(data, orient='h', ax=ax)
ax.set_title('Boxplot of Features', fontsize=18)
plt.show()

In [None]:
# Box plot for just age
plt.figure(figsize=(14, 5))
# Pass the series as `x` so the values are explicitly mapped to the horizontal axis
sns.boxplot(x=data['Customer Age'], orient='h')
# The title names the single column shown; the previous text was copied from
# the all-features plot.
plt.title('Boxplot of Customer Age', fontsize=18)
plt.show()

In [None]:
# Collect the names of the object-dtype (categorical) columns
categorical_columns = data.select_dtypes(include='object').columns
categorical_columns

In [None]:
# Spell out the gender codes: 'M' becomes 'Male' and 'F' becomes 'Female'
gender_names = {'M': 'Male', 'F': 'Female'}
data['Customer Gender'] = data['Customer Gender'].replace(gender_names)
data

#### **Answering Analytical Questions**

**1. Contribution to Revenue by Product Category**

In [None]:
# Distribution of revenue within each product category
fig, ax = plt.subplots(figsize=(10, 6))
sns.violinplot(x='Product Category', y='Revenue', data=data, color='skyblue', ax=ax)
ax.set_title('Violinplot using Product Category and Revenue', fontsize=15)
plt.show()

**2. Which Product Category and Sub Category have the highest counts?**

In [None]:
# Count the rows for every (Product Category, Sub Category) pair and
# expose the count as a 'Breakdown' column
pro_sub = (
    data.groupby('Product Category')['Sub Category']
    .value_counts()
    .rename('Breakdown')
    .reset_index()
)
pro_sub

In [None]:
# Row counts per product category, with one bar per sub category
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(x='Product Category', hue='Sub Category', data=data, palette="Set2", ax=ax)
ax.legend(loc='best')
plt.show()

**3. What are the percentages for Product Category?**

In [None]:
# Pie chart of the share of each product category.
# `explode` and `colors` each list three entries, so this assumes exactly
# three product categories.
plt.figure(figsize=(10, 8))
category_counts = data['Product Category'].value_counts()
category_counts.plot.pie(
    autopct='%.1f%%',
    colors=['magenta', 'brown', 'yellow'],
    explode=(0.01, 0.03, 0.05),
    startangle=90,
)
plt.title('Product Category Percentages', fontsize=15)
plt.show()

**4. What are the percentages for Males and Females?**

In [None]:
# Pie chart of the share of each customer gender.
# `explode` and `colors` each list two entries, so this assumes exactly two
# gender values.
fig, ax = plt.subplots(figsize=(10, 8))
gender_counts = data['Customer Gender'].value_counts()
gender_counts.plot.pie(
    ax=ax,
    autopct='%.1f%%',
    colors=['pink', 'green'],
    explode=(0.01, 0.05),
    startangle=90,
)
ax.set_title('Customer Gender Percentages', fontsize=15)
plt.show()

**5. How many countries do we have?**

In [None]:
# Bar chart of the number of rows per country
plt.figure(figsize=(10, 8))
# value_counts() gives one count per distinct country; the bars are drawn from those counts
data['Country'].value_counts().plot.bar(cmap=plt.get_cmap('gist_earth'))
plt.title('Country', fontsize=15)
plt.legend()
plt.show()

#### **Split Dataset into Training and Evaluation**

- X = input_data
- y = label

This is a `multi-class classification` problem (the number of classes is more than two).

In [None]:
from sklearn.model_selection import train_test_split

# Feature matrix: every column except the target 'Product Category'
# NOTE(review): 'Sub Category' stays among the features; if each sub category
# belongs to a single product category it may leak the target — confirm this
# is intended.
X = data.drop(columns='Product Category')
X

In [None]:
# Select y: the 'Product Category' column is the label to predict
y = data['Product Category']
y

In [None]:
# Hold out 20% of the rows for evaluation; stratifying on y keeps the class
# proportions the same in both splits, and the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    random_state=42,
    stratify=y,
)

## Terms to take note.

SimpleImputer -- used for imputing missing values in datasets. 
It replaces missing values using a specified strategy, such as the mean, median, most frequent value, or a constant value.

OneHotEncoder -- used for converting categorical data into numerical data. It transforms categorical variables into binary vectors, 
where each column corresponds to a unique category and is marked as 1 if the sample belongs to that category, and 0 otherwise.


RobustScaler --  used for scaling features according to the quantile range, making it robust to outliers.
(useful when dealing with datasets that have outliers.) 

FunctionTransformer: This class allows you to apply a custom function to transform data within a scikit-learn pipeline. 
It’s useful for applying custom transformations that are not directly available in scikit-learn.

Pipeline -- used to chain multiple transformers and estimators together into a single unit when preprocessing data and building machine learning models. 


ColumnTransformer -- used to apply different transformations to different columns
(a mix of numerical and categorical features that require different preprocessing steps)

classification_report -- computes a classification report, which includes precision, recall, F1-score, and support for each class.
It is used for evaluating the performance of a classification model.


In [None]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, RobustScaler, FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import SGDClassifier

# Label encoder -- turns categorical labels into numerical labels: each unique
# category in the column is assigned an integer value
# (changes the product category from class names to numbers).
# The mapping is learned from the training labels only and then applied to both splits.
l_encoder = LabelEncoder().fit(y_train)
y_train_encoded = l_encoder.transform(y_train)
y_test_encoded = l_encoder.transform(y_test)

In [None]:
# Data preprocessing pipeline for the numeric columns: fill missing values with
# the column median, apply log1p, then scale with RobustScaler.
# NOTE(review): log1p is undefined for values <= -1 — assumes the numeric
# features are non-negative; confirm against the data.
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('log_tranformation', FunctionTransformer(np.log1p)),
    ('scaler', RobustScaler()),
])

# Data preprocessing pipeline for the categorical columns: fill missing values
# with the most frequent value, then one-hot encode.
# handle_unknown='ignore' keeps transform() from raising when the evaluation
# data contains a category that never appeared in the training data; such a
# value is encoded as all zeros (the same encoding as the dropped first category).
cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('mapping', OneHotEncoder(sparse_output=False, drop='first', handle_unknown='ignore')),
])

In [None]:
# Names of the numeric features in the training data
num_attri = X_train.select_dtypes(include='number').columns

# Names of the categorical (object dtype) features in the training data
cat_attri = X_train.select_dtypes(include='object').columns

# Send each group of columns through its own preprocessing pipeline.
# Columns in neither group are left out, because ColumnTransformer drops the
# remainder by default.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_pipeline, num_attri),
        ('cat', cat_pipeline, cat_attri),
    ]
)

preprocessor

In [None]:
# Count the missing values of each feature column in the training split
X_train.isna().sum()

## Model 1

In [None]:

# import
from sklearn.naive_bayes import GaussianNB

# Gaussian Naive Bayes on top of the shared preprocessing step,
# fitted on the training split with the encoded labels
naive_bayes_pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', GaussianNB()),
    ]
).fit(X_train, y_train_encoded)

# Predict the encoded labels of the test split
y_pred = naive_bayes_pipeline.predict(X_test)

# Build the per-class precision / recall / F1 report and print it
report = classification_report(y_true=y_test_encoded, y_pred=y_pred)
print(report)

In [None]:
# View pipeline: bare expression so the notebook displays the Naive Bayes pipeline
naive_bayes_pipeline

## Model 2

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# K-nearest-neighbours classifier (library defaults) behind the shared preprocessor
knn_steps = [
    ('preprocessor', preprocessor),
    ('classifier', KNeighborsClassifier()),
]
knn_pipeline = Pipeline(steps=knn_steps)

# Train on the training split with the encoded labels
knn_pipeline.fit(X_train, y_train_encoded)

# Predict the encoded labels of the test split
y_pred = knn_pipeline.predict(X_test)

# Per-class precision / recall / F1 report
report = classification_report(y_true=y_test_encoded, y_pred=y_pred)
print(report)

In [None]:
# View pipeline: bare expression so the notebook displays the KNN pipeline
knn_pipeline

## Model 3

In [None]:
# Linear classifier trained with stochastic gradient descent behind the shared preprocessor.
# random_state is fixed because SGD shuffles the training data; without a seed
# the fitted model and the printed report change from run to run.
sgd_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier',  SGDClassifier(random_state=42))
])

# Fit model to training data
sgd_pipeline.fit(X_train, y_train_encoded)

# Predict the encoded labels of the test split
y_pred = sgd_pipeline.predict(X_test)

# Per-class precision / recall / F1 report
report = classification_report(y_test_encoded, y_pred)

print(report)

In [None]:
# View pipeline: bare expression so the notebook displays the SGD pipeline
sgd_pipeline

In [None]:
# Show the label classes: the original category names held by the fitted
# encoder; a name's position in this array is its encoded integer
l_encoder.classes_

## Model 4

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 20 trees (seeded for repeatability) behind the shared
# preprocessor, fitted on the training split with the encoded labels
forest_pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', RandomForestClassifier(n_estimators=20, random_state=0)),
    ]
).fit(X_train, y_train_encoded)

# Predict the encoded labels of the test split
y_pred = forest_pipeline.predict(X_test)

# Per-class precision / recall / F1 report
report = classification_report(y_true=y_test_encoded, y_pred=y_pred)
print(report)

In [None]:
# View pipeline: bare expression so the notebook displays the random forest pipeline
forest_pipeline

In [None]:
# import
from sklearn.metrics import confusion_matrix

# Predict with the random forest explicitly instead of reusing the global
# `y_pred`, which holds the output of whichever model cell happened to run last.
forest_predictions = forest_pipeline.predict(X_test)

# Generate the confusion matrix (rows: true labels, columns: predicted labels).
# Passing every encoded label keeps the matrix aligned with the class names
# used for the tick labels below.
class_names = l_encoder.classes_
conf_matrix = confusion_matrix(
    y_test_encoded,
    forest_predictions,
    labels=np.arange(len(class_names)),
)

# Plot the confusion matrix with the original category names on both axes
plt.figure(figsize=(12, 6))
sns.heatmap(
    conf_matrix,
    annot=True,
    cmap='plasma',
    fmt='d',
    cbar=True,
    xticklabels=class_names,
    yticklabels=class_names,
)
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.title('Confusion Matrix')
plt.show()