In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score

In [2]:
# Load data from CSV files
customers = pd.read_csv('../Data/customers.csv')
orders = pd.read_csv('../Data/orders.csv')
products = pd.read_csv('../Data/products.csv')
sales = pd.read_csv('../Data/sales.csv')

In [3]:
# Join dataframes to create a combined dataset
sales = sales.merge(orders, on='order_id').merge(customers, on='customer_id').merge(products, on='product_id')

In [7]:
# Perform feature engineering
sales['days_to_delivery'] = (pd.to_datetime(sales['delivery_date']) - pd.to_datetime(sales['order_date'])).dt.days
sales['total_spent'] = sales['total_price'] / sales['quantity_x']
customer_features = sales.groupby('customer_id').agg({
    'total_spent': 'mean',
    'quantity_x': 'sum',
    'days_to_delivery': 'mean',
    'gender': lambda x: x.mode()[0],
    'age': lambda x: x.mode()[0],
    'home_address': lambda x: x.mode()[0],
    'zip_code': lambda x: x.mode()[0],
    'city': lambda x: x.mode()[0],
    'state': lambda x: x.mode()[0],
    'country': lambda x: x.mode()[0]
}).reset_index()


In [8]:
sales.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5000 entries, 0 to 4999
Data columns (total 26 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   sales_id          5000 non-null   int64  
 1   order_id          5000 non-null   int64  
 2   product_id        5000 non-null   int64  
 3   price_per_unit    5000 non-null   int64  
 4   quantity_x        5000 non-null   int64  
 5   total_price       5000 non-null   int64  
 6   customer_id       5000 non-null   int64  
 7   payment           5000 non-null   int64  
 8   order_date        5000 non-null   object 
 9   delivery_date     5000 non-null   object 
 10  customer_name     5000 non-null   object 
 11  gender            5000 non-null   object 
 12  age               5000 non-null   int64  
 13  home_address      5000 non-null   object 
 14  zip_code          5000 non-null   int64  
 15  city              5000 non-null   object 
 16  state             5000 non-null   object 


In [9]:
# Merge customer features back into sales dataframe
sales = sales.merge(customer_features, on='customer_id')

In [10]:
# Split dataset into training and test sets
X = sales.drop(['product_id', 'product_type', 'product_name', 'size', 'colour', 'price', 'sales_id', 'order_id', 'customer_id', 'payment', 'order_date', 'delivery_date', 'total_price'], axis=1)
y = sales['product_name']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [11]:
# Train logistic regression model
clf = LogisticRegression(random_state=42)
clf.fit(X_train, y_train)

ValueError: could not convert string to float: 'Lars Stading'