### Get all imports and silence those pesky warninigs

In [1]:
import math
import pickle
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, StratifiedKFold
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.neighbors import KNeighborsRegressor
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_log_error
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
import warnings
warnings.filterwarnings("ignore")

### Read Data

In [2]:
df_train = pd.read_csv("Train.csv")
df_test = pd.read_csv("Test.csv")

# This column is totally useless
df_train.drop(["Additional_Description"], axis=1, inplace=True)
df_test.drop(["Additional_Description"], axis=1, inplace=True)

# All cleaning was done on the csv file itself
# The model_info column contains a white-space at the beginning
df_train['Model_Info'] = df_train['Model_Info'].str.strip()
df_test['Model_Info'] = df_test['Model_Info'].str.strip()

df_train.head()

Unnamed: 0,Brand,Model_Info,Locality,City,State,Price
0,1,name0 name234 64gb space grey,878,8,2,15000
1,1,phone 7 name42 name453 new condition box acces...,1081,4,0,18800
2,1,name0 x 256gb leess used good condition,495,11,4,50000
3,1,name0 6s plus 64 gb space grey,287,10,7,16500
4,1,phone 7 sealed pack brand new factory outet price,342,4,0,26499


### Functions to create new features

In [3]:
# 0 honor 
# 1 iphone and iwatch
# 2 lenovo
# 3 lg


# Load the un-normalized sum of idf weighted vectors
# for both train and test sentences of the model_info column
def load_sent_vecs(path):
    fp = open(path, "rb")
    sent_vecs = pickle.load(fp)
    fp.close()
    return sent_vecs


# Check if phone is not in a good condition
# The words were extracted from what I observed in the dataset
def get_bad_condition(model_info):
    cond = list()
    for index, text in enumerate(model_info):
        if "dead" in text or "crack" in text:
            cond.append(1)
        else:
            cond.append(0)
    return cond


# Helper function to extract number for usage in ram and rom features
# Either the current word contains ram/rom data
# Or the previous word, e.g. 23 gb (previous word) or 23gb (current word)
def extract_num(text1, text2):
    # So rom is either current or previous
    try:
        space = int(''.join(filter(str.isdigit, text2)))
    except:
        space = int(''.join(filter(str.isdigit, text1)))
    return space


# Creates the rom feature
def get_rom(model_info):
    rom = list()
    for index, text in enumerate(model_info):
        space, spaces = "", list()
        for i in range(len(text)):
            # If either of these two keywords are present
            # then it means that it is either ram or rom
            if "gb" in text[i] or "gig" in text[i]:
                # Make sure "ram" is not to the right
                if (i < len(text)-1 and "ram" not in text[i+1]) or i == len(text)-1:
                    space = extract_num(text[i-1], text[i])
                # The current word could also be the first word
                # In that case, no number could be present before the current word
                elif i == 0:
                    space = extract_num(None, text[i])
                # Using a list as there could be two numbers
                # each representing ram and rom
                if type(space) == int:
                    spaces.append(space)
        # Only the largest number is rom.
        if len(spaces) > 0:
            rom.append(max(spaces))
        else:
            rom.append(0)
    return rom


# Creates the ram feature
def get_ram(model_info):
    ram = list()
    for index, text in enumerate(model_info):
        space, spaces, its_ram = "", list(), False
        for i in range(len(text)):
            # If either of these two keywords are present
            # then it means that it is either ram or rom
            if "gb" in text[i] or "gig" in text[i]:
                # If "ram" is to the right then term is definitely ram
                if i < len(text)-1 and "ram" in text[i+1]:
                    space = extract_num(text[i-1], text[i])
                    its_ram = True
                # In some cases, "ram" is not written
                elif i < len(text)-1 or i == len(text)-1:
                    space = extract_num(text[i-1], text[i])
                # The current word could also be the first word
                # In that case, no number could be present before the current word
                elif i == 0:
                    space = extract_num(None, text[i])
                if type(space) == int:
                    spaces.append(space)
        # If there is only one number and its_ram is true then it is definitely ram
        if its_ram == True and len(spaces) <= 1:
            ram.append(space)
        # If there are multiple numbers, then the minimum is ram
        elif len(spaces) > 1:
            ram.append(min(spaces))
        else:
            ram.append(0)
    return ram


# Check if phone has warranty
# The word were extracted from what I observed in the dataset
def get_warranty(model_info):
    warranty = list()
    for text in model_info:
        if "war" in text:
            warranty.append(1)
        else:
            warranty.append(0)
    return warranty


# Check if payment needs to be made in terms of cash or online
# The words were extracted from what I observed in the dataset
def get_cash(model_info):
    cash = list()
    for text in model_info:
        if "cas" in text:
            cash.append(1)
        else:
            cash.append(0)
    return cash


# This function gives two features -> iphone and iwatch
# The iphone features is the most important out of all my engineered features
# Higher price (in the real world) iphone/iwatch will have higher number
# The order of the if-else statements is crucial
# as it tells you about the priority of the products
def get_apple_product(brand_type, model_info):
    iphone_type, iwatch_type = list(), list()
    for brand, text in zip(brand_type, model_info):
        if brand == 1:
            if "watch" in text or "iwatch" in text:
                iphone_type.append(0)
                if "5" in text:
                    iwatch_type.append(5)
                elif "4" in text:
                    iwatch_type.append(4)
                elif "3" in text:
                    iwatch_type.append(3)
                elif "2" in text:
                    iwatch_type.append(2)
                elif "1" in text:
                    iwatch_type.append(1)
                else:
                    iwatch_type.append(1)
            else:
                iwatch_type.append(0)
                if ("11" in text or "eleven" in text or "elven" in text) and "pro" in text and "max" in text:
                    iphone_type.append(29)
                elif ("11" in text or "eleven" in text or "elven" in text) and "pro" in text:
                    iphone_type.append(28)
                elif "11" in text or "eleven" in text or "elven" in text:
                    iphone_type.append(27)
                elif "xs" in text and "max" in text:
                    iphone_type.append(26)
                elif "xs" in text:
                    iphone_type.append(25)
                elif "x" in text:
                    iphone_type.append(24)
                elif ("8s" in text and "plus" in text) or ("8" in text and "s" in text and "plus" in text) or ("8" in text and "splus" in text) or "8splus" in text:
                    iphone_type.append(23)
                elif ("8" in text and "plus" in text) or "8plus" in text:
                    iphone_type.append(22)
                elif ("8" in text and "s" in text) or "8s" in text:
                    iphone_type.append(21)
                elif "8" in text:
                    try:
                        next_string = text[text.index("8")+1]
                        if "month" not in next_string or "year" not in next_string or "time" not in next_string:
                            iphone_type.append(20)
                    except:
                        iphone_type.append(20)
                elif ("7s" in text and "plus" in text) or ("7" in text and "s" in text and "plus" in text) or ("7" in text and "splus" in text) or "7splus" in text:
                    iphone_type.append(19)
                elif ("7" in text and "plus" in text) or "7plus" in text:
                    iphone_type.append(18)
                elif ("7" in text and "s" in text) or "7s" in text:
                    iphone_type.append(17)
                elif "7" in text:
                    try:
                        next_string = text[text.index("7")+1]
                        if "month" not in next_string or "year" not in next_string or "time" not in next_string:
                            iphone_type.append(16)
                    except:
                        iphone_type.append(16)
                elif ("6s" in text and "plus" in text) or ("6" in text and "s" in text and "plus" in text) or ("6" in text and "splus" in text) or "6splus" in text:
                    iphone_type.append(15)
                elif ("6" in text and "plus" in text) or "6plus" in text:
                    iphone_type.append(14)
                elif ("6" in text and "s" in text) or "6s" in text:
                    iphone_type.append(13)
                elif "6" in text:
                    try:
                        next_string = text[text.index("6")+1]
                        if "month" not in next_string or "year" not in next_string or "time" not in next_string:
                            iphone_type.append(12)
                    except:
                        iphone_type.append(12)
                elif ("5s" in text and "plus" in text) or ("5" in text and "s" in text and "plus" in text) or ("5" in text and "splus" in text) or "5splus" in text:
                    iphone_type.append(11)
                elif ("5" in text and "plus" in text) or "5plus" in text:
                    iphone_type.append(10)
                elif ("5" in text and "s" in text) or "5s" in text:
                    iphone_type.append(9)
                elif ("5" in text and "c" in text) or "5c" in text:
                    iphone_type.append(8)
                elif "5" in text:
                    try:
                        next_string = text[text.index("5")+1]
                        if "month" not in next_string or "year" not in next_string or "time" not in next_string:
                            iphone_type.append(7)
                    except:
                        iphone_type.append(7)
                elif ("4s" in text and "plus" in text) or ("4" in text and "s" in text and "plus" in text) or ("4" in text and "splus" in text) or "4splus" in text:
                    iphone_type.append(6)
                elif ("4" in text and "plus" in text) or "4plus" in text:
                    iphone_type.append(5)
                elif ("4" in text and "s" in text) or "4s" in text:
                    iphone_type.append(4)
                elif ("4" in text and "c" in text) or "4c" in text:
                    iphone_type.append(3)
                elif "4" in text:
                    try:
                        next_string = text[text.index("4")+1]
                        if "month" not in next_string or "year" not in next_string or "time" not in next_string:
                            iphone_type.append(2)
                    except:
                        iphone_type.append(2)
                else:
                    iphone_type.append(1)
        else:
            iphone_type.append(0)
            iwatch_type.append(0)
    return iphone_type, iwatch_type


# Gives the same feature as iphone above but for LG
def get_lg_product(brand_type, model_info):
    lg_type= list()
    for brand, text in zip(brand_type, model_info):
        if brand == 3:
            if ("v" in text and "50" in text) or "v50" in text:
                lg_type.append(25)
            elif ("v" in text and "40" in text) or "v40" in text:
                lg_type.append(24)
            elif ("g" in text and "7" in text) or "g7" in text or "7" in text:
                lg_type.append(23)
            elif ("v" in text and "30" in text) or "v30" in text:
                lg_type.append(22)
            elif ("v" in text and "20" in text) or "v20" in text:
                lg_type.append(21)
            elif ("nexus" in text and "5" in text and "x" in text) or ("nexus" in text and "5x" in text) or ("nexus5" in text and "x" in text) or "nexus5x" in text:
                lg_type.append(20)
            elif ("g" in text and "6" in text) or "g6" in text or "6" in text:
                lg_type.append(19)  
            elif ("thin" in text and "q" in text) or "thinq" in text:
                lg_type.append(18)
            elif "plus" in text:
                lg_type.append(17)
            elif "nexus" in text:
                lg_type.append(16)
            elif ("stylush" in text and "2" in text) or "stylush2" in text:
                lg_type.append(15)
            elif "stylush" in text:
                lg_type.append(14)
            elif ("g" in text and "4" in text) or "g4" in text or "4" in text:
                lg_type.append(13)
            elif ("q" in text and "7" in text and "plus" in text) or ("q7" in text and "plus" in text) or ("q" in text and "7plus" in text) or "q7plus" in text:
                lg_type.append(12)
            elif ("g" in text and "5" in text) or "g5" in text or "5" in text:
                lg_type.append(11)
            elif ("q" in text and "6" in text) or "q6" in text:
                lg_type.append(10)
            elif "2017" in text:
                lg_type.append(9)
            elif ("q" in text and "7" in text) or "q7" in text:
                lg_type.append(8)
            elif "4g" in text and "volte" in text and "dual" in text:
                lg_type.append(7)
            elif ("g" in text and "3" in text) or "g3" in text:
                lg_type.append(6)
            elif ("w" in text and "30" in text) or "w30" in text:
                lg_type.append(5)
            elif ("k" in text and "10" in text) or "k10" in text:
                lg_type.append(4)
            elif "2" in text or "g2" in text or ("g" in text and "2" in text):
                lg_type.append(3)
            elif "ph2" in text:
                lg_type.append(2)
            else:
                lg_type.append(1)
        else:
            lg_type.append(0)
    return lg_type


# Gives the same feature as iphone above but for Huawei Honor
def get_honor_product(brand_type, model_info):
    honor_type= list()
    for brand, text in zip(brand_type, model_info):
        if brand == 0:
            if "9" in text:
                honor_type.append(29)
            elif "porsche" in text:
                honor_type.append(28)
            elif ("view" in text and "20" in text) or "view20" in text:
                honor_type.append(27)
            elif "10" in text:
                honor_type.append(26)
            elif ("view" in text and "10" in text) or "view10" in text:
                honor_type.append(25)
            elif ("8" in text and "pro" in text) or "8pro" in text:
                honor_type.append(24)
            elif ("nova" in text and "2" in text and "plus" in text) or ("nova" in text and "2plus" in text) or ("nova2" in text and "plus" in text) or "nova2plus" in text:
                honor_type.append(23)
            elif "20" in text:
                honor_type.append(22)
            elif ("nova" in text and "3" in text and "i" in text) or ("nova" in text and "3i" in text) or ("nova3" in text and "i" in text) or "nova3i" in text:
                honor_type.append(21)
            elif "8" in text:
                honor_type.append(20)
            elif ("970" in text and "i" in text) or "970i" in text:
                honor_type.append(19)
            elif ("970" in text and "i" in text) or "970i" in text:
                honor_type.append(18)
            elif ("p" in text and "20" in text and "lite" in text) or ("p" in text and "20lite" in text) or ("p20" in text and "lite" in text) or "p20lite" in text:
                honor_type.append(17)
            elif ("8" in text and "x" in text) or "8x" in text:
                honor_type.append(16)
            elif ("7" in text and "x" in text) or "7x" in text:
                honor_type.append(15)
            elif ("10" in text and "lite" in text) or "10lite" in text:
                honor_type.append(14)
            elif ("6" in text and "x" in text) or "6x" in text:
                honor_type.append(13)
            elif ("8" in text and "lite" in text) or "8lite" in text:
                honor_type.append(12)
            elif ("9" in text and "lite" in text) or "9lite" in text:
                honor_type.append(11)
            elif ("9" in text and "n" in text) or "9n" in text:
                honor_type.append(10)
            elif ("7" in text and "c" in text) or "7c" in text:
                honor_type.append(9)
            elif ("8" in text and "c" in text) or "8c" in text:
                honor_type.append(8)
            elif ("5" in text and "x" in text) or "5x" in text:
                honor_type.append(7)
            elif ("7" in text and "a" in text) or "7a" in text:
                honor_type.append(6)
            elif ("holly" in text and "4" in text and "plus" in text) or ("holly" in text and "4plus" in text) or ("holly4" in text and "plus" in text) or "holly4plus" in text:
                honor_type.append(5)
            elif ("4" in text and "x" in text) or "4x" in text:
                honor_type.append(4)
            elif "3" in text:
                honor_type.append(3)
            elif "g520" in text:
                honor_type.append(2)
            else:
                honor_type.append(1)
        else:
            honor_type.append(0)
    return honor_type


# Gives the same feature as iphone above but for Lenovo
def get_lenovo_product(brand_type, model_info):
    lenovo_type= list()
    for brand, text in zip(brand_type, model_info):
        if brand == 2:
            if "k20" in text:
                lenovo_type.append(30)
            elif "x2 lenovo" in text or "x2" in text:
                lenovo_type.append(29)
            elif "lenovop2a42" in text:
                lenovo_type.append(28)
            elif " p1 " in text:
                lg_type.append(27)
            elif "zuk" in text:
                lenovo_type.append(26)
            elif "k6 note" in text or ("k6" in text and "note" in text) or ("k" in text and "6" in text and "note" in text) or 'k6note' in text or 'notek6' in text:
                lenovo_type.append(25)
            elif "a50" in text:
                lenovo_type.append(24)  
            elif "k4 note" in text or ("k4" in text and "note" in text) or ("k" in text and "4" in text and "note" in text) or 'k4note' in text or 'notek4' in text:
                lenovo_type.append(23)
            elif "vibe k5 note" in text or ("vibe" in text and "note" in text and 'k5' in text):
                lenovo_type.append(22)
            elif "k8 note" in text or ("k8" in text and "note" in text) or ("k" in text and "8" in text and "note" in text) or 'k8note' in text or 'notek8' in text:
                lenovo_type.append(21)
            elif "a20" in text :
                lenovo_type.append(20)
            elif "z2 plus" in text or ("z2" in text and "plus" in text):
                lenovo_type.append(19)
            elif "k5 note" in text or ("k5" in text and "note" in text) or ("k" in text and "5" in text and "note" in text) or 'k5note' in text or 'notek5' in text:
                lenovo_type.append(18)
            elif "k6 power" in text or ("k6" in text and "power" in text) or ("k" in text and "6" in text and "power" in text) or 'k6power' in text:
                lenovo_type.append(17)
            elif "name42tuxedo" in text:
                lenovo_type.append(16)
            elif "phab2plus" in text or ("phab" in text and '2' in text and 'plus' in text) or ("phab2" in text and "plus" in text) or ('phab' in text and "2plus" in text):
                lenovo_type.append(15)
            elif "vibeshot" in text or ("vibe" in text and 'shot' in text):
                lenovo_type.append(14)
            elif "k8plus" in text or ("k8" in text and "plus" in text) or 'plusk8' in text or ("k" in text and "8" in text and "plus" in text) or ("k" in text and "8plus" in text):
                lenovo_type.append(13)
            elif "k8" in text or ("k" in text and "8" in text):
                lenovo_type.append(12)
            elif "vibek5plus" in text or "k5plus" in text or ("k5" in text and "plus" in text) or ("vibe" in text and "k5" in text and "plus" in text):
                lenovo_type.append(11)
            elif "13 mpl back camera 8 mpl front cam" in text:
                lenovo_type.append(10)
            elif "a6600plus" in text:
                lenovo_type.append(9)
            elif "z1" in text:
                lenovo_type.append(8)
            elif "k3 note" in text or ("k3" in text and "note" in text) or 'k3note' in text or 'notek3' in text:
                lenovo_type.append(7)
            elif "p1m40" in text:
                lenovo_type.append(6)
            elif "p1m40" in text or "vibe p1m" in text:
                lenovo_type.append(5)
            elif "k5" in text or "five" in text:
                lenovo_type.append(4)
            elif "plus" in text:
                lenovo_type.append(3)
            elif "a6000" in text:
                lenovo_type.append(2)
            else:
                lenovo_type.append(1)
        else:
            lenovo_type.append(0)
    return lenovo_type

### Create new features

In [4]:
df_train["Rom"] = get_rom([i.lower().split() for i in df_train["Model_Info"].values])
df_test["Rom"] = get_rom([i.lower().split() for i in df_test["Model_Info"].values])

df_train["Ram"] = get_ram([i.lower().split() for i in df_train["Model_Info"].values])
df_test["Ram"] = get_ram([i.lower().split() for i in df_test["Model_Info"].values])

df_train["Warranty"] = get_warranty(df_train["Model_Info"].values)
df_test["Warranty"] = get_warranty(df_test["Model_Info"].values)

df_train["Cash"] = get_cash(df_train["Model_Info"].values)
df_test["Cash"] = get_cash(df_test["Model_Info"].values)

df_train["iphone_type"], df_train["iwatch_type"] = get_apple_product(df_train["Brand"].values, [i.lower().split() for i in df_train["Model_Info"].values])
df_test["iphone_type"], df_test["iwatch_type"] = get_apple_product(df_test["Brand"].values, [i.lower().split() for i in df_test["Model_Info"].values])

df_train["LG_type"] = get_lg_product(df_train["Brand"].values, [i.lower().split() for i in df_train["Model_Info"].values])
df_test["LG_type"] = get_lg_product(df_test["Brand"].values, [i.lower().split() for i in df_test["Model_Info"].values])

df_train["Honor_type"] = get_honor_product(df_train["Brand"].values, [i.lower().split() for i in df_train["Model_Info"].values])
df_test["Honor_type"] = get_honor_product(df_test["Brand"].values, [i.lower().split() for i in df_test["Model_Info"].values])

df_train["Lenovo_type"] = get_lenovo_product(df_train["Brand"].values, [i.lower().split() for i in df_train["Model_Info"].values])
df_test["Lenovo_type"] = get_lenovo_product(df_test["Brand"].values, [i.lower().split() for i in df_test["Model_Info"].values])

df_train["Bad_condition"] = get_bad_condition(df_train["Model_Info"].values)
df_test["Bad_condition"] = get_bad_condition(df_test["Model_Info"].values)

df_train.head()

Unnamed: 0,Brand,Model_Info,Locality,City,State,Price,Rom,Ram,Warranty,Cash,iphone_type,iwatch_type,LG_type,Honor_type,Lenovo_type,Bad_condition
0,1,name0 name234 64gb space grey,878,8,2,15000,64,0,0,0,1,0,0,0,0,0
1,1,phone 7 name42 name453 new condition box acces...,1081,4,0,18800,0,0,0,0,16,0,0,0,0,0
2,1,name0 x 256gb leess used good condition,495,11,4,50000,256,0,0,0,24,0,0,0,0,0
3,1,name0 6s plus 64 gb space grey,287,10,7,16500,64,0,0,0,15,0,0,0,0,0
4,1,phone 7 sealed pack brand new factory outet price,342,4,0,26499,0,0,0,0,16,0,0,0,0,0


### Convert to numpy arrays and build the training and testing sets

In [5]:
# For sparse, count based vectorization of model_info column
vectorizer = CountVectorizer(ngram_range=(1, 2), min_df=6)
# One-hot-encoder for the brand column
brand_ohe = OneHotEncoder(sparse=False, drop="first")

# Each feature is extracted individually from the pandas dataframe
# and converted into a matrix form for easier concatenation

# ------------------------------------- THIS IS TRAIN -------------------------------------
brand_train = brand_ohe.fit_transform(np.reshape(df_train["Brand"].values, (-1, 1)))
rom_train = np.reshape(df_train["Rom"].values, (-1, 1))
ram_train = np.reshape(df_train["Ram"].values, (-1, 1))
warranty_train = np.reshape(df_train["Warranty"].values, (-1, 1))
cash_train = np.reshape(df_train["Cash"].values, (-1, 1))
iphone_type_train = np.reshape(df_train["iphone_type"].values, (-1, 1))
lg_type_train = np.reshape(df_train["LG_type"].values, (-1, 1))
honor_type_train = np.reshape(df_train["Honor_type"].values, (-1, 1))
lenovo_type_train = np.reshape(df_train["Lenovo_type"].values, (-1, 1))
bad_cond_train = np.reshape(df_train["Bad_condition"].values, (-1, 1))
product_train = df_train["Model_Info"].values
l_train = np.reshape(df_train["Locality"].values, (-1, 1))
sent_vecs_train = load_sent_vecs("train_sents.bin")
X_train = np.concatenate((brand_train, rom_train, warranty_train, cash_train, iphone_type_train, lg_type_train, honor_type_train, lenovo_type_train, bad_cond_train, vectorizer.fit_transform(product_train).toarray(), l_train, sent_vecs_train), axis=1)

# ------------------------------------- THIS IS TEST -------------------------------------
brand_test = brand_ohe.transform(np.reshape(df_test["Brand"].values, (-1, 1)))
rom_test = np.reshape(df_test["Rom"].values, (-1, 1))
ram_test = np.reshape(df_test["Ram"].values, (-1, 1))
warranty_test = np.reshape(df_test["Warranty"].values, (-1, 1))
cash_test = np.reshape(df_test["Cash"].values, (-1, 1))
iphone_type_test = np.reshape(df_test["iphone_type"].values, (-1, 1))
lg_type_test = np.reshape(df_test["LG_type"].values, (-1, 1))
honor_type_test = np.reshape(df_test["Honor_type"].values, (-1, 1))
lenovo_type_test = np.reshape(df_test["Lenovo_type"].values, (-1, 1))
bad_cond_test = np.reshape(df_test["Bad_condition"].values, (-1, 1))
product_test = df_test["Model_Info"].values
l_test = np.reshape(df_test["Locality"].values, (-1, 1))
sent_vecs_test = load_sent_vecs("test_sents.bin")
X_test = np.concatenate((brand_test, rom_test, warranty_test, cash_test, iphone_type_test, lg_type_test, honor_type_test, lenovo_type_test, bad_cond_test, vectorizer.transform(product_test).toarray(), l_test, sent_vecs_test), axis=1)

# Target Variable
Y = df_train["Price"].values
# This performs binning of the target variable
# which defines a set of "classes" for the dataset
Y_classes = [math.ceil(i/20000)-1 for i in df_train["Price"].values]

print(X_train.shape, X_test.shape, Y.shape)

(2326, 595) (997, 595) (2326,)


### Cross-Validation 1

In [6]:
# I have created two sets of cross-validations in order to make sure that I am not overfitting
# The classes I generated above is used to create a stratified sampling based cross-validation
# This has been done in order to make both cross-validations as different as possible
kfold, scores = StratifiedKFold(n_splits=3, shuffle=True, random_state=27), list()
for train, test in kfold.split(X_train, Y_classes):
    x_train, x_test = X_train[train], X_train[test]
    y_train, y_test = np.log(Y[train]), Y[test]
    
    model = CatBoostRegressor(random_state=27, verbose=500)
    model.fit(x_train, y_train)
    preds1 = np.exp(model.predict(x_test))
    
    model = XGBRegressor(random_state=27, n_jobs=-1, objective="reg:squarederror", max_depth=6, n_estimators=100)
    model.fit(x_train, y_train)
    preds2 = np.exp(model.predict(x_test))
    
    # Perform weighted average
    preds = list()
    catb, xgb = 0.7, 0.3
    for a, b in zip(preds1, preds2):
        preds.append(a*catb + b*xgb)
    
    score = np.sqrt(mean_squared_log_error(y_test, preds))
    print(score)
    scores.append(score)
print("Average: ", sum(scores)/len(scores))

Learning rate set to 0.042197
0:	learn: 0.8197366	total: 64.8ms	remaining: 1m 4s
500:	learn: 0.2305250	total: 6.44s	remaining: 6.41s
999:	learn: 0.1359496	total: 13.2s	remaining: 0us
0.41487648263683274
Learning rate set to 0.042202
0:	learn: 0.8300776	total: 21.2ms	remaining: 21.1s
500:	learn: 0.2227492	total: 6.98s	remaining: 6.95s
999:	learn: 0.1341088	total: 14.1s	remaining: 0us
0.4334459475821377
Learning rate set to 0.042202
0:	learn: 0.8074907	total: 16ms	remaining: 16s
500:	learn: 0.2286893	total: 6.64s	remaining: 6.62s
999:	learn: 0.1362247	total: 13.1s	remaining: 0us
0.4626448421603061
Average:  0.4369890907930922


### Cross-Validation 2

In [7]:
# This is simple validation
kfold, scores = KFold(n_splits=10, shuffle=True, random_state=0), list()
for train, test in kfold.split(X_train):
    x_train, x_test = X_train[train], X_train[test]
    y_train, y_test = np.log(Y[train]), Y[test]
    
    model = CatBoostRegressor(random_state=27, verbose=500)
    model.fit(x_train, y_train)
    preds1 = np.exp(model.predict(x_test))
    
    model = XGBRegressor(random_state=27, n_jobs=-1, objective="reg:squarederror", max_depth=6, n_estimators=100)
    model.fit(x_train, y_train)
    preds2 = np.exp(model.predict(x_test))
    
    # Perform weighted average
    preds = list()
    catb, xgb = 0.7, 0.3
    for a, b in zip(preds1, preds2):
        preds.append(a*catb + b*xgb)
    
    score = np.sqrt(mean_squared_log_error(y_test, preds))
    print(score)
    scores.append(score)
print("Average: ", sum(scores)/len(scores))

Learning rate set to 0.044514
0:	learn: 0.8153081	total: 16.4ms	remaining: 16.4s
500:	learn: 0.2487873	total: 6.78s	remaining: 6.75s
999:	learn: 0.1593901	total: 13.6s	remaining: 0us
0.41733904165247115
Learning rate set to 0.044514
0:	learn: 0.8169781	total: 16.9ms	remaining: 16.9s
500:	learn: 0.2357512	total: 6.97s	remaining: 6.94s
999:	learn: 0.1420599	total: 14.3s	remaining: 0us
0.47533243654977547
Learning rate set to 0.044514
0:	learn: 0.8232443	total: 16.2ms	remaining: 16.2s
500:	learn: 0.2552304	total: 6.85s	remaining: 6.82s
999:	learn: 0.1628011	total: 13.6s	remaining: 0us
0.3928996830988315
Learning rate set to 0.044514
0:	learn: 0.8175939	total: 16.4ms	remaining: 16.4s
500:	learn: 0.2448536	total: 6.92s	remaining: 6.89s
999:	learn: 0.1552102	total: 13.9s	remaining: 0us
0.4184548692175226
Learning rate set to 0.044514
0:	learn: 0.8174288	total: 16.1ms	remaining: 16.1s
500:	learn: 0.2465476	total: 6.79s	remaining: 6.76s
999:	learn: 0.1547253	total: 13.6s	remaining: 0us
0.47030

In [8]:
# Catboost() CV1: 0.44047614405448604  CV2: 0.4300283443371219  LB: 0.42132
# Catboost() CV1: 0.4374455744964137  CV2: 0.42790331067680276  LB: 0.42286
# XGBoost() CV1: 0.4487763139480631  CV2: 0.43902423538318763  LB: Didn't submit

# ------------------------------------- FINAL -------------------------------------

# Catboost() (0.7) XGBoost(max_depth=6) (0.3) -> CV1: 0.43460516637991997  CV2: 0.4259029100829695  LB: 0.41392

### Train Final model

In [9]:
model = CatBoostRegressor(random_state=27, verbose=500)
model.fit(X_train, np.log(Y))
preds1 = np.exp(model.predict(X_test))

model = XGBRegressor(random_state=27, n_jobs=-1, objective="reg:squarederror", max_depth=6, n_estimators=100)
model.fit(X_train, np.log(Y))
preds2 = np.exp(model.predict(X_test))

preds = list()
catb, xgb = 0.7, 0.3
for a, b in zip(preds1, preds2):
    preds.append(a*catb + b*xgb)

Learning rate set to 0.045358
0:	learn: 0.8184066	total: 16.3ms	remaining: 16.3s
500:	learn: 0.2533859	total: 6.97s	remaining: 6.95s
999:	learn: 0.1609349	total: 14s	remaining: 0us


### Make final submission

In [10]:
df_submit = pd.DataFrame({'Price': preds})
df_submit.to_excel("submit.xlsx", index=False)