In [170]:
%reset -f

In [179]:
import pandas as pd
import numpy as np

%matplotlib inline 
import warnings
warnings.filterwarnings('ignore')

df = pd.read_csv('./sales.csv')
df.dropna(subset=['price'], inplace=True)
df.head()


Unnamed: 0,cost,price,weight,purchase_date,product_type,product_level,maker,ingredient,height,width,depth
0,$333k,"$300,492",3 Ton 90 Kg,Dec 19 2008,"Q,B",advanced,M14122,"IN732052,IN732053",2.76 meters,97 cm,26 cm
1,,"$430,570",3 Ton 30 Kg,Sep 10 1997,"J,D",basic,,"IN732054,IN732055,IN732056,IN732057,IN732058",2.67 meters,98 cm,26 cm
2,$270k,"$213,070",3 Ton 40 Kg,Sep 05 2001,"J,D",basic,,"IN732054,IN732059,IN732060",3.0 meters,93 cm,24 cm
3,,"$229,174",3 Ton 50 Kg,Dec 23 2016,U,advanced,M14123,"IN732061,IN732062,IN732063",2.5 meters,102 cm,27 cm
4,$97k,"$122,659",2 Ton 970 Kg,Jan 12 2000,"D,R",advanced,,"IN732064,IN732065,IN732066",2.47 meters,101 cm,26 cm


In [180]:
df['year'] = pd.to_datetime(df.purchase_date).dt.year

In [181]:
import pandas as pd
import numpy as np
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
#from sklearn.impute import KNNImputer
#from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler, MultiLabelBinarizer,FunctionTransformer
from sklearn.linear_model import Lasso
#from sklearn.feature_extraction.text import CountVectorizer
#from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
#from sklearn.model_selection import train_test_split
#from sklearn.ensemble import RandomForestRegressor
#from sklearn.ensemble import GradientBoostingRegressor
#from xgboost.sklearn import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
#import math
from sklearn.linear_model import Lasso
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import MinMaxScaler,RobustScaler,StandardScaler,MaxAbsScaler

#from sklearn.decomposition import PCA

class DataTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass

    def fit(self, X, y=None):
        df = pd.DataFrame()
        df['cost'] = X['cost'].map(self.cost2num)
        df['weight'] = X['weight'].map(self.weight2num)
        df['height'] = X['height'].map(self.height2num)
        df['width'] = X['width'].map(self.width2num)        
        df['depth'] = X['depth'].map(self.depth2num)
        df['volume'] = 100 * df['height'] * df['width'] * df['depth']
        self.median = df.median(skipna=True)
    
    def transform(self, X):
        # Extract and convert numerical features
        df = pd.DataFrame()
        df['cost'] = X['cost'].map(self.cost2num)
        df['cost'] =df['cost'].fillna(85000)
        df['weight'] = X['weight'].map(self.weight2num)
        df['height'] = X['height'].map(self.height2num)
        df['width'] = X['width'].map(self.width2num)        
        df['depth'] = X['depth'].map(self.depth2num)
        df['volume'] = 100 * df['height'] * df['width'] * df['depth']
        df.drop(['height', 'width', 'depth'], axis=1, inplace=True)
        df = df.fillna(self.median)
        df['year'] = pd.to_datetime(X['purchase_date']).dt.year
        df['month'] = pd.to_datetime(X['purchase_date']).dt.month
        df['weekday'] = pd.to_datetime(X['purchase_date']).dt.weekday
        X['ingredient']=X['ingredient'].fillna('unknown')
        df['IngredientCounts'] = X['ingredient'].apply(self.count_ingredients)
        return df

    def fit_transform(self, X, y=None):
        self.fit(X)
        return self.transform(X)

    def weight2num(self, x):
        if pd.notnull(x):
            if 'Ton' in x:
                try:
                    return float(x.split(' Ton')[0]) * 1000
                except ValueError:
                    return np.nan  # Handle the case where the conversion to float fails
            else:
                try:
                    return float(x.strip('Kg'))
                except ValueError:
                    return np.nan  # Handle the case where the conversion to float fails
        return x
    
    def height2num(self, x):
        if type(x) == str:
            x = x.strip('meters').strip()
            return float(x) * 100
        else:
            return x        
    
    def width2num(self, x):
        if type(x) == str:
            x = x.strip('cm').strip()
            return float(x)
        else:
            return x   

    def depth2num(self, x):
        if type(x) == str:
            x = x.strip('cm').strip()
            return float(x)
        else:
            return x  
    def cost2num(self, x):
        if type(x) == str:
            x = x.strip('$').strip('k')
            return float(x)*1000
        else:
            return x
        
    def count_ingredients(self, x):
        if x:
            ingredients_list = x.split(',')
            return len(ingredients_list)
        else:
            return 0

In [182]:
def price2num(x):
    if type(x) == str: 
        x = x.strip('$').replace(',', '')
    return float(x)

df['price']=df['price'].map(price2num)

In [183]:
train_raw = df[df.year < 2015].reset_index(drop=True)
test_raw = df[df.year >= 2015].reset_index(drop=True)

numeric_features_lr = ['cost','weight', 'height', 'width', 'depth', 'purchase_date','ingredient']
categorical_lr = ['product_type']

target=['price']

X_train = train_raw[numeric_features_lr + categorical_lr ]
y_train = train_raw[target]

X_test = test_raw[numeric_features_lr + categorical_lr]
y_test = test_raw[target]

ctf=DataTransformer()
test=ctf.fit_transform(X_train)
ctf.fit_transform(df).cost.median()

85000.0

In [184]:
steps=[
    ('preprocessor', ColumnTransformer(
        transformers=[
            ('numeric', DataTransformer(), numeric_features_lr),  # Include 'cost' in the numeric features
            ('categorical_PT', OneHotEncoder(handle_unknown='ignore', sparse=True), categorical_lr)  # Produce dense matrix
        ],
        remainder='drop'
    )),
    ('astype', FunctionTransformer(lambda x: x.astype('float32'))),
    ('scaler', MaxAbsScaler()),
    ('polynomial', PolynomialFeatures(degree=2)),
#    ('linear_regression', LinearRegression())
    ('lasso', Lasso(100))
]

model = Pipeline(steps)

model.fit(X_train, y_train)

In [185]:
model_lr = Pipeline(steps)
model_lr = model_lr.fit(X_train, y_train)

In [186]:
y_train_pred = model_lr.predict(X_train)
print('train MAE: {0:.2e}'.format(mean_absolute_error(y_train, y_train_pred)))
print('train MSE: {0:.2e}'.format(mean_squared_error(y_train, y_train_pred)))
print('train R2: {0:.3f}'.format(r2_score(y_train, y_train_pred)))

y_test_pred = model_lr.predict(X_test)
print('test MAE: {0:.2e}'.format(mean_absolute_error(y_test, y_test_pred)))
print('test MSE: {0:.2e}'.format(mean_squared_error(y_test, y_test_pred)))
print('test R2: {0:.3f}'.format(r2_score(y_test, y_test_pred)))

train MAE: 9.52e+04
train MSE: 2.05e+10
train R2: 0.518
test MAE: 1.18e+05
test MSE: 4.29e+10
test R2: 0.502


In [187]:
#%load_ext memory_profiler
#%memit -r 1 your_function()

In [188]:
features = list(train_raw.columns)
target = 'price'
features.remove(target)

In [189]:
def price2num(x):
    if type(x) == str: 
        x = x.strip('$').replace(',', '')
    return float(x)

In [None]:
X = train_raw[features]
y = train_raw[target].map(price2num)

X_test = test_raw[features]
y_test = test_raw[target].map(price2num)
len(X)

In [None]:
X_train, X_valid, y_train, y_valid = train_test_split(X, 
                                                      y, 
                                                      test_size=0.2, 
                                                      random_state=2019)


from sklearn.model_selection import KFold

kf=KFold(n_splits = 5)

i =1
for train_index, valid_index in kf.split(X):
    print('Split{}'.format(i), train_index, valid_index, sep='\n')
    i+=1

In [None]:
alphas = range(1,30)
err_alphas = []

for alpha in alphas:
    steps_cv=[
    ('preprocessor', ColumnTransformer(
        transformers=[
            ('numeric', DataTransformer(), numeric_features_lr),  # Include 'cost' in the numeric features
            ('categorical', OneHotEncoder(handle_unknown='ignore', sparse=False), categorical_features_lr)  # Produce dense matrix
        ],
        remainder='drop'
    )),
        ('poly',PolynomialFeatures(degree=30)),
        ('rescale',MinMaxScaler()),
        ('lr',Lasso(alpha=alpha,max_iter = 100000))
          ]
    mse_kf = []
    for train_index,valid_index in kf.split(X):
        X_train,X_valid = X.iloc[train_index], X.iloc[valid_index]
        y_train,y_valid = y.iloc[train_index], y.iloc[valid_index]
        model_kf = Pipeline(steps_cv)
        model_kf = model_kf.fit(X_train, y_train)
        mse_kf.append(mean_squared_error(y_valid,model_kf.predict(X_valid)))
    err_alphas.append(np.mean(mse_kf))