### Get all imports

In [4]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold, StratifiedKFold
from lightgbm import LGBMRegressor
from imblearn.over_sampling import SMOTE
from collections import Counter
from reg_resampler import resampler
from sklearn.metrics import mean_squared_log_error
import warnings
warnings.filterwarnings("ignore")

### Read and transform data

In [2]:
df_train = pd.read_csv("Train.csv")

lone = LabelEncoder()
df_train["Product"] = lone.fit_transform(df_train["Product"])
df_train["Product_Brand"] = lone.fit_transform(df_train["Product_Brand"])
df_train["Item_Category"] = lone.fit_transform(df_train["Item_Category"])
df_train["Subcategory_1"] = lone.fit_transform(df_train["Subcategory_1"])
df_train["Subcategory_2"] = lone.fit_transform(df_train["Subcategory_2"])
df_train = df_train.drop("Date", axis=1)

df_train.head()

Unnamed: 0,Product,Product_Brand,Item_Category,Subcategory_1,Subcategory_2,Item_Rating,Selling_Price
0,791,862,7,10,137,4.3,291
1,760,670,10,127,329,3.1,897
2,1746,280,29,112,101,3.5,792
3,1214,670,8,37,134,4.0,837
4,2104,670,10,80,296,4.3,470


### Perform K-Fold

In [3]:
# Initialize the resampler object
rs = resampler()

# Generate classes
Y_classes = rs.fit(df_train.values, target=-1, bins=5, verbose=2)
# Create the actual target variable
Y = df_train["Selling_Price"]

# Perform K-Fold
kfold, scores = KFold(n_splits=5, shuffle=True, random_state=27), list()
for train, test in kfold.split(df_train.values):
    # Split into train and test
    x_train, y_train = df_train.values[train], Y_classes[train]
    x_test, y_test = df_train.values[test], Y.values[test]
    
    # Remove the target variable from x_test
    x_test = x_test[:,:-1]
    
    # Get the class distriubtion for perfoming relative sampling in the next line
    xp = Counter(y_train)
    # Your favourite oversampler
    smote = SMOTE(random_state=27, sampling_strategy={0:int(xp[0]*1.2), 1: int(xp[1]*2.0)})
    # Generate the over-sampled data
    x_train, y_train = rs.resample(smote, x_train, y_train)
    
    # Fit the model
    model = LGBMRegressor(random_state=27)
    model.fit(x_train, np.log(y_train))
    preds = np.exp(model.predict(x_test))
    
    # Check the score
    score = np.sqrt(mean_squared_log_error(y_test, preds))
    print(score)
    scores.append(score)
print("\nAverage: ", sum(scores)/len(scores))

INFO: Class 2 has been merged into Class 1 due to low number of samples
INFO: Class 3 has been merged into Class 1 due to low number of samples
INFO: Class 4 has been merged into Class 1 due to low number of samples

Class Distribution:
-------------------
0: 2350
1: 102

0.7399142441762613
0.7027917932595997
0.7526839471836929
0.6559476353228568
0.728284943818841

Average:  0.7159245127522503


### Perform Stratified K-Fold

In [5]:
# Initialize the resampler object
rs = resampler()

# Generate classes
Y_classes = rs.fit(df_train.values, target=-1, bins=5, verbose=2)
# Create the actual target variable
Y = df_train["Selling_Price"]

# Perform K-Fold
kfold, scores = StratifiedKFold(n_splits=5, shuffle=True, random_state=27), list()
for train, test in kfold.split(df_train.values, Y_classes):
    # Split into train and test
    x_train, y_train = df_train.values[train], Y_classes[train]
    x_test, y_test = df_train.values[test], Y.values[test]
    
    # Remove the target variable from x_test
    x_test = x_test[:,:-1]
    
    # Get the class distriubtion for perfoming relative sampling in the next line
    xp = Counter(y_train)
    # Your favourite oversampler
    smote = SMOTE(random_state=27, sampling_strategy={0:int(xp[0]*1.2), 1: int(xp[1]*2.0)})
    # Generate the over-sampled data
    x_train, y_train = rs.resample(smote, x_train, y_train)
    
    # Fit the model
    model = LGBMRegressor(random_state=27)
    model.fit(x_train, np.log(y_train))
    preds = np.exp(model.predict(x_test))
    
    # Check the score
    score = np.sqrt(mean_squared_log_error(y_test, preds))
    print(score)
    scores.append(score)
print("\nAverage: ", sum(scores)/len(scores))

INFO: Class 2 has been merged into Class 1 due to low number of samples
INFO: Class 3 has been merged into Class 1 due to low number of samples
INFO: Class 4 has been merged into Class 1 due to low number of samples

Class Distribution:
-------------------
0: 2350
1: 102

0.681783831084692
0.6997998885546327
0.7386469827713933
0.6996689571871663
0.7698032886366912

Average:  0.7179405896469151
