# Post Grad Independent Project - Sales Data Analyzing & Predicting

#### Part I: Importing the libraries & pulling in the dataset:

In [54]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import ConfusionMatrixDisplay

In [55]:
# Load the retail sales transactions from the project-relative data folder
df = pd.read_csv(filepath_or_buffer="data/retail_sales_dataset.csv")

In [56]:
# Project Goal: Predict whether the buyer is male or female based on factors like category, age, price per unit, and total order amount.

#### Part II: Cleaning The Dataset:

In [57]:
# Remove pandas' cap on displayed columns so every field is visible in previews
pd.set_option('display.max_columns', None)

# Preview the first five transactions
df.head(n=5)

Unnamed: 0,Transaction ID,Date,Customer ID,Gender,Age,Product Category,Quantity,Price per Unit,Total Amount
0,1,2023-11-24,CUST001,Male,34,Beauty,3,50,150
1,2,2023-02-27,CUST002,Female,26,Clothing,2,500,1000
2,3,2023-01-13,CUST003,Male,50,Electronics,1,30,30
3,4,2023-05-21,CUST004,Male,37,Clothing,1,500,500
4,5,2023-05-06,CUST005,Male,30,Beauty,2,50,100


In [58]:
# Dimensions of the loaded frame as (rows, columns)
df.shape

(1000, 9)

In [59]:
# Class balance of the prediction target
gender_counts = df['Gender'].value_counts()
gender_counts

Gender
Female    510
Male      490
Name: count, dtype: int64

In [60]:
# Value counts on the target lets me know that there are 510 females and 490 males in the dataset.

In [61]:
# Per-column dtypes, used to spot columns that need converting before modelling
df.dtypes

Transaction ID       int64
Date                object
Customer ID         object
Gender              object
Age                  int64
Product Category    object
Quantity             int64
Price per Unit       int64
Total Amount         int64
dtype: object

In [62]:
# The dtypes show me that the target (Gender) is stored as an object, so we either need to map it to 1s and 2s for the matrix or convert it to a boolean.
# Category we may want to change to numerical for the model (numbers rather than the text options).
# Date we may want to change to datetime.

In [63]:
# How many transactions fall into each product category
category_counts = df['Product Category'].value_counts()
category_counts

Product Category
Clothing       351
Electronics    342
Beauty         307
Name: count, dtype: int64

In [64]:
# Define the mapping from category label to integer code.
# NOTE(review): these codes are arbitrary labels, not an ordering. A distance-based
# model such as KNN will treat Clothing < Electronics < Beauty -- consider one-hot
# encoding if that matters for the results.
category_mapping = {'Clothing': 1, 'Electronics': 2, 'Beauty': 3}

# Let codes map to themselves as well, so re-running this cell after the column
# has already been encoded is a no-op instead of turning every value into NaN.
category_lookup = {**category_mapping, **{code: code for code in category_mapping.values()}}

# Map the 'Product Category' column to numbers (values missing from the lookup become NaN)
encoded_category = df['Product Category'].map(category_lookup)

# Fail with a readable message, before touching df, if any value could not be
# encoded; otherwise the integer cast below raises an opaque NaN-casting error.
unmapped_rows = encoded_category.isna()
if unmapped_rows.any():
    unknown_values = df.loc[unmapped_rows, 'Product Category'].unique().tolist()
    raise ValueError(f"Unmapped 'Product Category' values: {unknown_values}")

# Store the encoded column as integers
df['Product Category'] = encoded_category.astype(int)

In [65]:
# ^ Changed to int, with the codes 1, 2, 3 in place of the category names

In [66]:
# Parse the date strings (stored as object dtype) into datetime values
parsed_dates = pd.to_datetime(df['Date'])
df['Date'] = parsed_dates

In [None]:
# ^ Changed to date / time rather than object

In [67]:
# Count missing values in each column
df.isna().sum()

Transaction ID      0
Date                0
Customer ID         0
Gender              0
Age                 0
Product Category    0
Quantity            0
Price per Unit      0
Total Amount        0
dtype: int64

In [None]:
# ^ This tells me there are no nulls to have to take care of