# Analysis of Motorcycles Sales Data

## Data Transformation

In [282]:
import pandas as pd

In [283]:
sales = pd.read_csv("data/sales_data.csv")
sales.head()

Unnamed: 0,date,warehouse,client_type,product_line,quantity,unit_price,total,payment
0,1/6/2021,Central,Retail,Miscellaneous,8,16.85,134.83,Credit card
1,1/6/2021,North,Retail,Breaking system,9,19.29,173.61,Cash
2,1/6/2021,North,Retail,Suspension & traction,8,32.93,263.45,Credit card
3,1/6/2021,North,Wholesale,Frame & body,16,37.84,605.44,Transfer
4,1/6/2021,Central,Retail,Engine,2,60.48,120.96,Credit card


In [284]:
# Convert the 'date' column to a datetime object
sales['date'] = pd.to_datetime(sales['date'])

# Set the 'date' column as the index of the DataFrame
#sales = sales.set_index('date')


Parsing dates in DD/MM/YYYY format when dayfirst=False (the default) was specified. This may lead to inconsistently parsed dates! Specify a format to ensure consistent parsing.



In [285]:
from pandas.api.types import CategoricalDtype

cats = [ 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

cat_type = CategoricalDtype(categories=cats, ordered=True)

# Create new columns for the weekday and month
sales['Weekday'] = sales['date'].dt.day_name().astype(cat_type)
sales['Month'] = sales['date'].dt.month_name()

## Exploratory Data Analysis

### Payment Method

In [286]:
import plotly.express as px

# Plot the DataFrame
payment_method = px.bar(sales.groupby('payment').count().reset_index().
                        assign(percentage=lambda x: (x['date'] / x['date'].sum()) * 100).
                        loc[:,["payment","percentage"]].sort_values(by='percentage', ascending=True), 
                        color="payment", x='payment', y='percentage', title='Number of Transactions by Payment Method', 
                        labels={'payment': 'Payment','percentage': 'Number of Transactions(%)'})

# Sort the bars from lowest to highest
payment_method.show()

### Daily Revenue

In [287]:
# Plot the DataFrame
daily_revenue = px.line(sales.groupby('date').sum().reset_index().loc[:,["date","total"]], 
                        x='date', y='total', title='Daily Revenue',
                        labels={'date': 'Date','total': 'Daily Revenue'})

# Show the plot
daily_revenue.show()


The default value of numeric_only in DataFrameGroupBy.sum is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.



### Days of the week with higher revenue

In [288]:
import plotly.express as px

# Plot the DataFrame
weekday_trans = px.bar(sales.groupby('Weekday').count().reset_index().
                        assign(percentage=lambda x: (x['date'] / x['date'].sum()) * 100).
                        loc[:,["Weekday","percentage"]].sort_values(by="Weekday").reset_index(drop=True),
                        color="Weekday", x='Weekday', y='percentage', title='Number of Transactions In each Day of the Week', 
                        labels={'Weekday': 'Day of the Week','percentage': 'Number of Transactions(%)'})

# Sort the bars from lowest to highest
weekday_trans.show()

### Product Line with most Number of Transactions

In [289]:
import plotly.express as px

# Plot the DataFrame
product_line_trans = px.bar(sales.groupby('product_line').count().reset_index().
                        assign(percentage=lambda x: (x['date'] / x['date'].sum()) * 100).
                        loc[:,["product_line","percentage"]].sort_values(by="percentage").reset_index(drop=True),
                        color="product_line", x='product_line', y='percentage', title='Number of Transactions by Product Line', 
                        labels={'product_line': 'Products Category','percentage': 'Number of Transactions(%)'})

# Sort the bars from lowest to highest
product_line_trans.show()

### Product Category with most sales

In [290]:
import plotly.express as px

# Plot the DataFrame
cat_most_sales = px.bar(sales.groupby('product_line').sum().reset_index().
                        loc[:,["product_line","total"]].sort_values(by="total", ascending=False),
                        color="product_line", y='product_line', x='total', title='Revenue by Product Line', 
                        labels={'product_line': 'Products Category','total': 'Revenue($)'})

cat_most_sales.update_layout(xaxis_tickprefix = '$', xaxis_tickformat = ',.')

# Sort the bars from lowest to highest
cat_most_sales.show()


The default value of numeric_only in DataFrameGroupBy.sum is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.



In [291]:
sales.head()

Unnamed: 0,date,warehouse,client_type,product_line,quantity,unit_price,total,payment,Weekday,Month
0,2021-01-06,Central,Retail,Miscellaneous,8,16.85,134.83,Credit card,Wednesday,January
1,2021-01-06,North,Retail,Breaking system,9,19.29,173.61,Cash,Wednesday,January
2,2021-01-06,North,Retail,Suspension & traction,8,32.93,263.45,Credit card,Wednesday,January
3,2021-01-06,North,Wholesale,Frame & body,16,37.84,605.44,Transfer,Wednesday,January
4,2021-01-06,Central,Retail,Engine,2,60.48,120.96,Credit card,Wednesday,January


### Warehouse With the Highest Revenue

In [292]:
# Plot the DataFrame
wh_most_sales = px.bar(sales.groupby('warehouse').sum().reset_index().
                        loc[:,["warehouse","total"]].sort_values(by="total", ascending=False),
                        color="warehouse", x='warehouse', y='total', title='Warehouse With the Highest Revenue', 
                        labels={'warehouse': 'Warehouse','total': 'Revenue($)'})

wh_most_sales.update_layout(yaxis_tickprefix = '$', yaxis_tickformat = ',.')

# Sort the bars from lowest to highest
wh_most_sales.show()


The default value of numeric_only in DataFrameGroupBy.sum is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.



In [293]:
sales.head()

Unnamed: 0,date,warehouse,client_type,product_line,quantity,unit_price,total,payment,Weekday,Month
0,2021-01-06,Central,Retail,Miscellaneous,8,16.85,134.83,Credit card,Wednesday,January
1,2021-01-06,North,Retail,Breaking system,9,19.29,173.61,Cash,Wednesday,January
2,2021-01-06,North,Retail,Suspension & traction,8,32.93,263.45,Credit card,Wednesday,January
3,2021-01-06,North,Wholesale,Frame & body,16,37.84,605.44,Transfer,Wednesday,January
4,2021-01-06,Central,Retail,Engine,2,60.48,120.96,Credit card,Wednesday,January


## Classfication Model

In [294]:
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import LabelEncoder, PolynomialFeatures, OneHotEncoder
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score
from catboost import CatBoostClassifier

In [295]:


# Split the data into predictors (X) and target variable (y)
X = sales[['warehouse', 'client_type', 'product_line', 'Weekday', 'Month', 'quantity', 'unit_price']]
y = sales['payment']

# Perform one-hot encoding on categorical variables
cat_features = ['warehouse', 'client_type', 'product_line', 'Weekday', 'Month']
for feature in cat_features:
    encoder = OneHotEncoder(sparse=False)
    encoded_features = encoder.fit_transform(X[[feature]])
    # Create new column names for the encoded features
    feature_names = [feature + "_" + str(i) for i in range(encoded_features.shape[1])]
    # Create a DataFrame from the encoded features
    encoded_df = pd.DataFrame(encoded_features, columns=feature_names)
    # Concatenate the encoded features to the original data
    X = pd.concat([X, encoded_df], axis=1)
    # Drop the original categorical feature
    X.drop(feature, axis=1, inplace=True)

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Perform oversampling using SMOTE
smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X_train, y_train)

# Initialize the PolynomialFeatures transformer
poly_features = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)

# Fit and transform the training data
X_train_poly = poly_features.fit_transform(X_train)

# Transform the test data
X_test_poly = poly_features.transform(X_test)

# Initialize the CatBoost classifier
model = CatBoostClassifier(iterations=100, learning_rate=0.1, random_seed=42)

# Fit the model to the training data
model.fit(X_train_poly, y_train, verbose=False)

# Make predictions on the test data
y_pred = model.predict(X_test_poly)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Cross-validation
cv_scores = cross_val_score(model, X_train_poly, y_train, cv=5)
print("Cross-validation scores:", cv_scores)



`sparse` was renamed to `sparse_output` in version 1.2 and will be removed in 1.4. `sparse_output` is ignored unless you leave `sparse` to its default value.


`sparse` was renamed to `sparse_output` in version 1.2 and will be removed in 1.4. `sparse_output` is ignored unless you leave `sparse` to its default value.


`sparse` was renamed to `sparse_output` in version 1.2 and will be removed in 1.4. `sparse_output` is ignored unless you leave `sparse` to its default value.


`sparse` was renamed to `sparse_output` in version 1.2 and will be removed in 1.4. `sparse_output` is ignored unless you leave `sparse` to its default value.


`sparse` was renamed to `sparse_output` in version 1.2 and will be removed in 1.4. `sparse_output` is ignored unless you leave `sparse` to its default value.



Accuracy: 0.88
0:	learn: 0.9819056	total: 49.5ms	remaining: 4.9s
1:	learn: 0.8961151	total: 90.3ms	remaining: 4.42s
2:	learn: 0.8314819	total: 133ms	remaining: 4.29s
3:	learn: 0.7664150	total: 176ms	remaining: 4.23s
4:	learn: 0.7131015	total: 218ms	remaining: 4.15s
5:	learn: 0.6711457	total: 260ms	remaining: 4.07s
6:	learn: 0.6297582	total: 274ms	remaining: 3.65s
7:	learn: 0.5870408	total: 316ms	remaining: 3.63s
8:	learn: 0.5528230	total: 358ms	remaining: 3.62s
9:	learn: 0.5170212	total: 398ms	remaining: 3.58s
10:	learn: 0.4901701	total: 438ms	remaining: 3.54s
11:	learn: 0.4649309	total: 479ms	remaining: 3.51s
12:	learn: 0.4425427	total: 521ms	remaining: 3.49s
13:	learn: 0.4210410	total: 563ms	remaining: 3.46s
14:	learn: 0.4010670	total: 605ms	remaining: 3.43s
15:	learn: 0.3862955	total: 646ms	remaining: 3.39s
16:	learn: 0.3692293	total: 690ms	remaining: 3.37s
17:	learn: 0.3526452	total: 730ms	remaining: 3.32s
18:	learn: 0.3358182	total: 771ms	remaining: 3.29s
19:	learn: 0.3185112	tota

In [296]:
y_pred

array([['Transfer'],
       ['Transfer'],
       ['Credit card'],
       ['Transfer'],
       ['Transfer'],
       ['Transfer'],
       ['Credit card'],
       ['Credit card'],
       ['Credit card'],
       ['Transfer'],
       ['Credit card'],
       ['Transfer'],
       ['Transfer'],
       ['Credit card'],
       ['Credit card'],
       ['Credit card'],
       ['Transfer'],
       ['Credit card'],
       ['Transfer'],
       ['Credit card'],
       ['Credit card'],
       ['Transfer'],
       ['Credit card'],
       ['Credit card'],
       ['Credit card'],
       ['Credit card'],
       ['Transfer'],
       ['Credit card'],
       ['Cash'],
       ['Credit card'],
       ['Credit card'],
       ['Credit card'],
       ['Credit card'],
       ['Transfer'],
       ['Transfer'],
       ['Credit card'],
       ['Credit card'],
       ['Credit card'],
       ['Transfer'],
       ['Credit card'],
       ['Credit card'],
       ['Credit card'],
       ['Credit card'],
       ['Credit card

In [300]:
sales.head()

Unnamed: 0,date,warehouse,client_type,product_line,quantity,unit_price,total,payment,Weekday,Month
0,2021-01-06,Central,Retail,Miscellaneous,8,16.85,134.83,Credit card,Wednesday,January
1,2021-01-06,North,Retail,Breaking system,9,19.29,173.61,Cash,Wednesday,January
2,2021-01-06,North,Retail,Suspension & traction,8,32.93,263.45,Credit card,Wednesday,January
3,2021-01-06,North,Wholesale,Frame & body,16,37.84,605.44,Transfer,Wednesday,January
4,2021-01-06,Central,Retail,Engine,2,60.48,120.96,Credit card,Wednesday,January


In [307]:
# Example usage:
sample_data = pd.DataFrame({
    'warehouse': ["North"],
    'client_type': ["Retail"],
    'product_line': ["Breaking system"],
    'Weekday': ['Wednesday'],
    'Month': ['January'],
    'quantity': [9],
    'unit_price': [19.29]
})

prediction = transform_and_predict(sample_data)
print("Prediction:", prediction[0][0])


Parsing dates in DD/MM/YYYY format when dayfirst=False (the default) was specified. This may lead to inconsistently parsed dates! Specify a format to ensure consistent parsing.


`sparse` was renamed to `sparse_output` in version 1.2 and will be removed in 1.4. `sparse_output` is ignored unless you leave `sparse` to its default value.


`sparse` was renamed to `sparse_output` in version 1.2 and will be removed in 1.4. `sparse_output` is ignored unless you leave `sparse` to its default value.


`sparse` was renamed to `sparse_output` in version 1.2 and will be removed in 1.4. `sparse_output` is ignored unless you leave `sparse` to its default value.


`sparse` was renamed to `sparse_output` in version 1.2 and will be removed in 1.4. `sparse_output` is ignored unless you leave `sparse` to its default value.


`sparse` was renamed to `sparse_output` in version 1.2 and will be removed in 1.4. `sparse_output` is ignored unless you leave `sparse` to its default value.



Prediction: Cash


In [303]:
def transform_and_predict(sample_data):
    # Load the original dataset
    sales = pd.read_csv("data/sales_data.csv")  # Replace 'your_dataset.csv' with the actual filename
    
    # Convert 'date' column to datetime type
    sales['date'] = pd.to_datetime(sales['date'])
    
    from pandas.api.types import CategoricalDtype

    cats = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

    cat_type = CategoricalDtype(categories=cats, ordered=True)

    # Create new columns for the weekday and month
    sales['Weekday'] = sales['date'].dt.day_name().astype(cat_type)
    sales['Month'] = sales['date'].dt.month_name()

    # Split the data into predictors (X) and target variable (y)
    X = sales[['warehouse', 'client_type', 'product_line', 'Weekday', 'Month', 'quantity', 'unit_price']]
    y = sales['payment']
    
    # Perform one-hot encoding on categorical variables
    cat_features = ['warehouse', 'client_type', 'product_line', 'Weekday', 'Month']
    encoders = {}  # Dictionary to store encoders for each feature
    
    for feature in cat_features:
        encoder = OneHotEncoder(sparse=False)
        encoded_features = encoder.fit_transform(X[[feature]])
        # Create new column names for the encoded features
        feature_names = [feature + "_" + str(i) for i in range(encoded_features.shape[1])]
        # Create a DataFrame from the encoded features
        encoded_df = pd.DataFrame(encoded_features, columns=feature_names)
        # Concatenate the encoded features to the original data
        X = pd.concat([X, encoded_df], axis=1)
        # Drop the original categorical feature
        X.drop(feature, axis=1, inplace=True)
        
        # Store the encoder for future use
        encoders[feature] = encoder
    
    # Perform oversampling using SMOTE
    smote = SMOTE(random_state=42)
    X_train, y_train = smote.fit_resample(X, y)
    
    # Initialize the PolynomialFeatures transformer
    poly_features = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
    
    # Fit and transform the training data
    X_train_poly = poly_features.fit_transform(X_train)

    # Initialize the CatBoost classifier
    model = CatBoostClassifier(iterations=100, learning_rate=0.1, random_seed=42)
    
    # Fit the model to the training data
    model.fit(X_train_poly, y_train, verbose=False)
    
    # Transform the sample data
    sample_data_encoded = sample_data.copy()
    for feature in cat_features:
        encoder = encoders[feature]  # Retrieve the corresponding encoder
        encoded_features = encoder.transform(sample_data_encoded[[feature]])
        feature_names = [feature + "_" + str(i) for i in range(encoded_features.shape[1])]
        encoded_df = pd.DataFrame(encoded_features, columns=feature_names)
        sample_data_encoded = pd.concat([sample_data_encoded, encoded_df], axis=1)
        sample_data_encoded.drop(feature, axis=1, inplace=True)
        
    sample_data_poly = poly_features.transform(sample_data_encoded)
    
    
    # Make predictions on the sample data
    y_pred = model.predict(sample_data_poly)
    
    return y_pred

# Example usage:
sample_data = pd.DataFrame({
    'warehouse': ["North"],
    'client_type': ["Retail"],
    'product_line': ["Engine"],
    'Weekday': ['Monday'],
    'Month': ['February'],
    'quantity': [10],
    'unit_price': [5.0]
})

prediction = transform_and_predict(sample_data)
print("Prediction:", prediction)



Parsing dates in DD/MM/YYYY format when dayfirst=False (the default) was specified. This may lead to inconsistently parsed dates! Specify a format to ensure consistent parsing.


`sparse` was renamed to `sparse_output` in version 1.2 and will be removed in 1.4. `sparse_output` is ignored unless you leave `sparse` to its default value.


`sparse` was renamed to `sparse_output` in version 1.2 and will be removed in 1.4. `sparse_output` is ignored unless you leave `sparse` to its default value.


`sparse` was renamed to `sparse_output` in version 1.2 and will be removed in 1.4. `sparse_output` is ignored unless you leave `sparse` to its default value.


`sparse` was renamed to `sparse_output` in version 1.2 and will be removed in 1.4. `sparse_output` is ignored unless you leave `sparse` to its default value.


`sparse` was renamed to `sparse_output` in version 1.2 and will be removed in 1.4. `sparse_output` is ignored unless you leave `sparse` to its default value.



Prediction: [['Credit card']]
