In [2]:
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import PolynomialFeatures, LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Lasso, Ridge
import joblib
from joblib import dump

In [3]:
df_data=pd.read_csv("./data/all_tables_combined.csv")

In [4]:
df_s_parent=df_data

#length of campaign. one of the parameters that can possibly used as a feature.
df_s_parent['NUM_DAYS']=df_s_parent['END_DAY'] - df_s_parent['START_DAY'] + 1

#dropping a few extra columns
df_s_parent.drop(['START_DAY','END_DAY','COUPON_UPC','household_key','PRODUCT_ID'], axis=1, inplace=True)

#selecting rows where the product is some type of liquid milk
df_s_parent[df_s_parent['SUB_COMMODITY_DESC'].str.contains('MILK')]['SUB_COMMODITY_DESC'].unique()
df_s_parent=df_s_parent[(df_s_parent['SUB_COMMODITY_DESC']=='FLUID MILK WHITE ONLY') | (df_s_parent['SUB_COMMODITY_DESC']=='MISCELLANEOUS MILK') | (df_s_parent['SUB_COMMODITY_DESC']=='CHOCOLATE MILK')]

#dropping more extra columns
df_s_parent.drop(['TRANS_TIME','AGE_DESC', 'MARITAL_STATUS_CODE', 'INCOME_DESC', 'HOMEOWNER_DESC', 'HH_COMP_DESC', 'HOUSEHOLD_SIZE_DESC','KID_CATEGORY_DESC', 'DEPARTMENT', 'COMMODITY_DESC'], axis=1, inplace=True)

#function to encode container size
def label_to_volume(label):
    volume=int()
    if label=='1 GA':
        volume=128
    elif label=='1 PT':
        volume=16
    elif label=='16 OZ':
        volume=16
    elif label=='1 QT':
        volume=32
    elif label=='PT':
        volume=16
        
    return volume

#we need to vectorize function to use it with pandas dataframe efficiently.
vect_label_to_volume=np.vectorize(label_to_volume)

df_s_parent['VOLUME']=vect_label_to_volume(df_s_parent['CURR_SIZE_OF_PRODUCT'])

df_s_parent=df_s_parent[df_s_parent['SALES_VALUE']<=10]

sales=df_s_parent['SALES_VALUE']

df_s_parent_features=df_s_parent.drop('SALES_VALUE', axis=1)

dummy=pd.get_dummies(df_s_parent_features['BRAND'],drop_first=True)
dummy_milk_type=pd.get_dummies(df_s_parent_features['SUB_COMMODITY_DESC'],drop_first=True)
dummy_manufacturer=pd.get_dummies(df_s_parent_features['MANUFACTURER'],drop_first=True)
df_s_parent_features=pd.concat([df_s_parent_features,dummy], axis=1)
df_s_parent_features=pd.concat([df_s_parent_features,dummy_milk_type], axis=1)
df_s_parent_features=pd.concat([df_s_parent_features,dummy_manufacturer], axis=1)

#removing original columns that have been dummified.
df_s_parent_features.drop(['STORE_ID','QUANTITY','BRAND','description','MANUFACTURER','CURR_SIZE_OF_PRODUCT','SUB_COMMODITY_DESC', 'CAMPAIGN', 'NUM_DAYS'], axis=1, inplace=True)

#define X and y
X=df_s_parent_features
y=sales

#test train split
X_train,X_test,y_train,y_test=train_test_split(X, y, random_state = 22)

#initiate instance of StandardScaler
sscaler=StandardScaler()

#fit and transform X_train
X_train_scaled=sscaler.fit_transform(X_train)

#Using fit from train to scale test
X_test_scaled=sscaler.transform(X_test)

#initiating and fitting lasso
lasso=Lasso(alpha=0.00001)
lasso.fit(X_train_scaled,y_train)

#create dataframe with coffecients for all features used in modeling
pd.DataFrame({'features':X.columns, 'coefs':lasso.coef_}).sort_values(by='coefs', ascending=False)

#initiating and fitting Ridge
ridge=Ridge()
ridge.fit(X_train_scaled,y_train)

#create models for 
dump(lasso, 'sales_value_lasso_model.joblib')
dump(ridge, 'sales_value_ridge_model.joblib')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_s_parent['VOLUME']=vect_label_to_volume(df_s_parent['CURR_SIZE_OF_PRODUCT'])
  model = cd_fast.enet_coordinate_descent(


['sales_value_ridge_model.joblib']