## Product Sales Prediction

Given data about products being sold, let's try to predict if a given product will have been **sold in the last six months.**

We will use a random forest classification model to make our predictions. 

Data source: https://www.kaggle.com/datasets/flenderson/sales-analysis?resource=download

### Getting Started 

In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import KFold
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import RandomOverSampler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score

In [2]:
data = pd.read_csv("SalesKaggle3.csv")
data

Unnamed: 0,Order,File_Type,SKU_number,SoldFlag,SoldCount,MarketingType,ReleaseNumber,New_Release_Flag,StrengthFactor,PriceReg,ReleaseYear,ItemCount,LowUserPrice,LowNetPrice
0,2,Historical,1737127,0.0,0.0,D,15,1,6.827430e+05,44.99,2015,8,28.97,31.84
1,3,Historical,3255963,0.0,0.0,D,7,1,1.016014e+06,24.81,2005,39,0.00,15.54
2,4,Historical,612701,0.0,0.0,D,0,0,3.404640e+05,46.00,2013,34,30.19,27.97
3,6,Historical,115883,1.0,1.0,D,4,1,3.340110e+05,100.00,2006,20,133.93,83.15
4,7,Historical,863939,1.0,1.0,D,2,1,1.287938e+06,121.95,2010,28,4.00,23.99
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
198912,208023,Active,109683,,,D,7,1,2.101869e+05,72.87,2006,54,8.46,60.59
198913,208024,Active,416462,,,D,8,1,4.555041e+05,247.00,2009,65,8.40,74.85
198914,208025,Active,658242,,,S,2,1,1.692746e+05,50.00,2012,23,23.98,32.62
198915,208026,Active,2538340,,,S,2,1,3.775266e+05,46.95,2001,23,27.42,37.89


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 198917 entries, 0 to 198916
Data columns (total 14 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   Order             198917 non-null  int64  
 1   File_Type         198917 non-null  object 
 2   SKU_number        198917 non-null  int64  
 3   SoldFlag          75996 non-null   float64
 4   SoldCount         75996 non-null   float64
 5   MarketingType     198917 non-null  object 
 6   ReleaseNumber     198917 non-null  int64  
 7   New_Release_Flag  198917 non-null  int64  
 8   StrengthFactor    198917 non-null  float64
 9   PriceReg          198917 non-null  float64
 10  ReleaseYear       198917 non-null  int64  
 11  ItemCount         198917 non-null  int64  
 12  LowUserPrice      198917 non-null  float64
 13  LowNetPrice       198917 non-null  float64
dtypes: float64(6), int64(6), object(2)
memory usage: 21.2+ MB


### Preprocessing

In [26]:
def preprocess_inputs(df):
    df = df.copy()

    # Only use historical data
    df = df.query("File_Type == 'Historical'")

    # Drop unused columns
    df = df.drop(['File_Type', 'Order', 'SKU_number', 'SoldCount'], axis=1)

    # Shuffle Data
    df = df.sample(frac=1.0, random_state=1)

    # Split the df into X and y
    y = df['SoldFlag']
    X = df.drop('SoldFlag', axis=1)
    
    return X, y

In [27]:
X, y = preprocess_inputs(data)

In [28]:
X

Unnamed: 0,MarketingType,ReleaseNumber,New_Release_Flag,StrengthFactor,PriceReg,ReleaseYear,ItemCount,LowUserPrice,LowNetPrice
37862,S,12,1,545082.0,96.67,2011,12,73.74,101.33
35304,S,2,1,4273940.0,58.00,2002,32,85.60,23.98
26138,D,9,1,165834.0,76.95,2011,48,75.57,42.67
52327,S,22,1,79220.0,54.25,2012,31,36.47,22.49
6038,D,8,1,80014.0,38.99,2008,62,153.24,69.43
...,...,...,...,...,...,...,...,...,...
20609,D,8,1,40841.0,103.24,2010,48,99.50,115.55
21440,D,0,0,1611172.0,86.64,2011,19,55.19,78.38
73349,S,2,1,1628317.0,69.99,2004,43,4.02,30.43
50057,S,2,1,1660915.0,44.00,2004,32,34.51,10.12


In [40]:
y.value_counts(), y.mean()

(SoldFlag
 0.0    63000
 1.0    12996
 Name: count, dtype: int64,
 0.17100900047370915)

### Building Pipeline

In [59]:
def build_pipeline():
    binary_transformer = Pipeline(steps=[
        ('onehot', OneHotEncoder(sparse_output=False, drop='if_binary'))
    ])

    nominal_transformer = Pipeline(steps=[
        ('onehot', OneHotEncoder(sparse_output=False, handle_unknown='ignore'))
    ])
    
    preprocessor = ColumnTransformer(transformers=[
        ('binary', binary_transformer, ['MarketingType']),
        ('nominal', nominal_transformer, ['ReleaseNumber'])
    ], remainder='passthrough')

    model = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('regressor', RandomForestClassifier(random_state=1))
    ])

    return model

### Training/Validation

In [60]:
y[y==0.0].index

Index([37862, 35304, 26138, 52327,  6038, 12076,  8933, 36633,  2956, 47029,
       ...
       51668, 45413, 21758, 31228, 43757,  7751, 49100, 20609, 73349, 50057],
      dtype='int64', length=63000)

In [61]:
accs = []
f1s = []

kf = KFold(n_splits=5)
for train_idx, test_idx in kf.split(X):
    X_train = X.iloc[train_idx, :]
    X_test = X.iloc[test_idx, :]
    y_train = y.iloc[train_idx]
    y_test = y.iloc[test_idx]

    # Address class imbalance 
    num_samples = int(y_train.value_counts().mean())
    majority_indices = y_train[y_train==0.0].index
    # Random under sampling
    samples_to_drop = y_train[majority_indices].sample(len(y_train) - num_samples, random_state=1).index
    X_train = X_train.drop(samples_to_drop, axis=0)
    y_train = y_train.drop(samples_to_drop, axis=0)
    # Random over sampling
    oversampler = RandomOverSampler(random_state=1)
    X_train, y_train = oversampler.fit_resample(X_train, y_train)
    
    model = build_pipeline()
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)

    accs.append(accuracy_score(y_test, y_pred))
    f1s.append(f1_score(y_test, y_pred, pos_label=1.0))

acc = np.mean(accs)
f1 = np.mean(f1s)
print("Accuracy: {:.2f}%".format(acc*100))
print("F1-Score: {:.5f}".format(f1))

Accuracy: 76.87%
F1-Score: 0.43494


In [55]:
# without over and under sampling, the results were accuracy = 83% and f1-score = 0.25

In [45]:
from sklearn.metrics import confusion_matrix

In [47]:
confusion_matrix(y_test, y_pred)

array([[12261,   325],
       [ 2181,   432]])