In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression, Perceptron, RidgeClassifier, PassiveAggressiveClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import precision_score, precision_recall_curve, recall_score, confusion_matrix, roc_curve
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
import joblib


**TAKEAWAYS**

- Data is loaded and checked for missing columns, incorrect data types and missing values
- Data is preprocessed to filter it, encode some variables and scale
- A pipeline is built to join the steps
- GridsearchCV is used to tune hiperparameters and select the best model
- The best model is saved and stored

LOADING AND VALIDATION

In [4]:
path = '/home/alvaro/groceries/boxbuilder.csv'

required_columns = ['variant_id', 'product_type', 'order_id', 'user_id', 'created_at',
       'order_date', 'user_order_seq', 'outcome', 'ordered_before',
       'abandoned_before', 'active_snoozed', 'set_as_regular',
       'normalised_price', 'discount_pct', 'vendor', 'global_popularity',
       'count_adults', 'count_children', 'count_babies', 'count_pets',
       'people_ex_baby', 'days_since_purchase_variant_id',
       'avg_days_to_buy_variant_id', 'std_days_to_buy_variant_id',
       'days_since_purchase_product_type', 'avg_days_to_buy_product_type',
       'std_days_to_buy_product_type']

required_datatypes = {'variant_id': 'int64',
 'product_type': 'O',
 'order_id': 'int64',
 'user_id': 'int64',
 'created_at': 'O',
 'order_date': 'O',
 'user_order_seq': 'int64',
 'outcome': 'float64',
 'ordered_before': 'float64',
 'abandoned_before': 'float64',
 'active_snoozed': 'float64',
 'set_as_regular': 'float64',
 'normalised_price': 'float64',
 'discount_pct': 'float64',
 'vendor': 'O',
 'global_popularity': 'float64',
 'count_adults': 'float64',
 'count_children': 'float64',
 'count_babies': 'float64',
 'count_pets': 'float64',
 'people_ex_baby': 'float64',
 'days_since_purchase_variant_id': 'float64',
 'avg_days_to_buy_variant_id': 'float64',
 'std_days_to_buy_variant_id': 'float64',
 'days_since_purchase_product_type': 'float64',
 'avg_days_to_buy_product_type': 'float64',
 'std_days_to_buy_product_type': 'float64'}

In [5]:
def load_validate(path):
    df = pd.read_csv(path)

    # Check columns
    if not all(col in df.columns for col in required_columns):
        raise ValueError(f"Missing required columns: {set(required_columns) - set(df.columns)}")

    # Check nan values
    if df.isnull().any().any():
        print('Missing values found')

    # Check data types
    for col,required_type in required_datatypes.items():
        datatype = df.dtypes[col]
        if required_type != datatype:
            raise TypeError(f"Data type mismatch for column '{col}': Expected '{required_type}', but got '{datatype}'")

    else:
        print('Data loaded correctly')
    
    return df

In [7]:
df = load_validate(path)

Data loaded correctly


PREPROCESSING

In [9]:
class OrderFilter(BaseEstimator, TransformerMixin):
    ''' filter dataset to only orders with more than 4 products'''
    def fit(self, df):
        return self 
    
    def transform(self, df):
        ids = df[df.outcome == 1].groupby('order_id').variant_id.count() > 4
        df_filtered = df[df.order_id.isin(ids[ids == True].index)]
        
        return df_filtered

class FrequencyEncoder(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.freq_dict = {}

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X_encoded = X.copy()
        categorical_cols = ['variant_id','product_type','vendor']
        for col in categorical_cols:
            self.freq_dict[col] = X[col].value_counts().to_dict()
            X_encoded[col] = X[col].map(self.freq_dict[col]).fillna(0)
        return X_encoded

In [10]:
# filter orders with more than 4 products
df = OrderFilter().fit_transform(df)        

# features and target
X = df.drop(["order_id","user_id","created_at","order_date",'outcome'],axis=1)
y = df.outcome

# train 0,7 and test 0,3
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)


MODEL OPTIMIZATION AND SAVING

In [11]:
pipe = Pipeline(steps=[
        ('freq_encoder', FrequencyEncoder()),
        ('scaling', StandardScaler()),
        ('svc', LinearSVC())
        ])

params = {
    'svc__C': [0.0000001, 0.00001, 0.001, 1, 10]
}

grid = GridSearchCV(estimator=pipe, param_grid=params, scoring=['precision','recall'],refit='precision', verbose=True)
grid.fit(X_train,y_train)

score = grid.score(X_test,y_test)
best_model = grid.best_estimator_
best_params = grid.best_params_


Fitting 5 folds for each of 5 candidates, totalling 25 fits


In [14]:
score

0.7794117647058824

In [12]:
best_params

{'svc__C': 1e-05}

In [13]:
joblib.dump(best_model, 'svc_v1.joblib')

['svc_v1.joblib']