<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#loading-data" data-toc-modified-id="loading-data-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>loading data</a></span></li><li><span><a href="#feature-engineering" data-toc-modified-id="feature-engineering-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>feature engineering</a></span></li><li><span><a href="#cross-validation" data-toc-modified-id="cross-validation-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>cross validation</a></span></li><li><span><a href="#submission" data-toc-modified-id="submission-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>submission</a></span></li></ul></div>

In [1]:
%matplotlib inline

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings("ignore")

def scoring(y_true, y_pred):
    """Return the root-mean-squared error between targets and predictions.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted values; must describe the same number of samples (and
        outputs) as ``y_true``.

    Returns
    -------
    numpy.float64
        ``sqrt(mean((y_true - y_pred) ** 2))`` taken over every element.

    Raises
    ------
    ValueError
        If the inputs are empty, differ in shape, or contain NaN/inf.
    """
    def _as_float_array(values):
        # np.asarray drops any pandas index, so the subtraction below is
        # purely positional. 1-D input becomes a single column so that an
        # (n,) vector and an (n, 1) column are treated as the same shape.
        arr = np.asarray(values, dtype=float)
        return arr.reshape(-1, 1) if arr.ndim == 1 else arr

    actual = _as_float_array(y_true)
    predicted = _as_float_array(y_pred)

    if actual.shape != predicted.shape:
        raise ValueError(
            "y_true and y_pred have inconsistent shapes: {} vs {}".format(
                actual.shape, predicted.shape))
    if actual.size == 0:
        raise ValueError("scoring requires at least one sample")
    if not (np.isfinite(actual).all() and np.isfinite(predicted).all()):
        raise ValueError("y_true and y_pred must not contain NaN or infinity")

    return np.sqrt(np.mean((actual - predicted) ** 2))

### loading data

In [2]:
# Read the training data from the working directory. index_col=None means no
# CSV column is used as the index, so the frame gets a default integer index.
train = pd.read_csv("Train.csv", index_col=None)
# Print (rows, columns) as a quick sanity check on what was read.
print(train.shape)
# Last expression of the cell: shows the first rows as the cell's output.
train.head()

(2326, 7)


Unnamed: 0,Brand,Model_Info,Additional_Description,Locality,City,State,Price
0,1,name0 name234 64gb space grey,1yesr old mobile number 999two905two99 bill c...,878,8,2,15000
1,1,phone 7 name42 name453 new condition box acce...,101004800 1010065900 7000,1081,4,0,18800
2,1,name0 x 256gb leess used good condition,1010010000 seperate screen guard 3 back cover...,495,11,4,50000
3,1,name0 6s plus 64 gb space grey,without 1010020100 id 1010010300 colour 10100...,287,10,7,16500
4,1,phone 7 sealed pack brand new factory outet p...,101008700 10100000 xs max 64 gb made 10100850...,342,4,0,26499


In [3]:
# Read the data to predict on, with the same read_csv options as the training
# file (index_col=None: no CSV column is used as the index).
test = pd.read_csv("Test.csv", index_col=None)
# Print (rows, columns) as a quick sanity check on what was read.
print(test.shape)
# Last expression of the cell: shows the first rows as the cell's output.
test.head()

(997, 6)


Unnamed: 0,Brand,Model_Info,Additional_Description,Locality,City,State
0,1,name0 55s66s66s778xxsxsmax etc,good condition 11months old single scratch we...,570,11,4
1,1,slightly used excellent condition name0 5 sale,101008700 1010030600 1010034300 10100192200 1...,762,8,2
2,1,name0 sx ios12 top letast model bill call,1010017300 delivery,60,13,5
3,1,name87 name0 x 64gb going lowest 41900,phone 1010023400 64 gb excellent condition sale,640,15,5
4,1,name0 5s proper condition one handedly used,full kit available 10100248300 condition 4gb ...,816,2,6


In [4]:
# Column the models are trained to predict.
target = 'Price'

# Names of the input columns. In this cell the list is only printed.
cat_cols = [
    'Brand',
    'Model_Info',
    'Additional_Description',
    'Locality',
    'City',
    'State',
]
print("cat cols {}".format(cat_cols))

cat cols ['Brand', 'Model_Info', 'Additional_Description', 'Locality', 'City', 'State']


In [5]:
# Replace the target column with its natural log, so everything downstream
# that reads train[target] works in log space; the submission step maps the
# averaged predictions back with np.exp.
# NOTE: this overwrites the column in place, so running this cell a second
# time without reloading `train` applies the log twice.
train[target] = np.log(train[target])

### feature engineering

In [6]:
def feature_engineering(df):
    """Build a sparse feature matrix from a listings frame.

    The matrix is the horizontal concatenation of:

    * bag-of-words token counts of the ``Model_Info`` text column, and
    * one-hot indicator columns for ``City``, ``State`` and ``Brand``.

    The vocabulary and the indicator columns are both derived from ``df``
    itself, so rows that must share a column layout (e.g. train and test)
    have to be passed in together in a single frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``Model_Info``, ``City``, ``State`` and
        ``Brand``. Other columns are ignored.

    Returns
    -------
    scipy.sparse matrix
        One row per row of ``df``, in the same order; token-count columns
        first, indicator columns after.
    """
    from scipy import sparse
    from sklearn.feature_extraction.text import CountVectorizer

    # Raw term counts. Tokens are runs of one or more word characters, so
    # single-character tokens are kept (the library default pattern needs
    # at least two characters).
    count_vect = CountVectorizer(
        analyzer='word',
        token_pattern=r'\w{1,}',
        strip_accents='unicode',
        stop_words='english'
    )

    # A missing description is treated as empty text; the vectorizer rejects
    # NaN documents outright.
    model_info = df['Model_Info'].fillna('')
    # With no document-frequency pruning configured, fitting on all rows
    # learns the same vocabulary as fitting on the unique rows, so a single
    # fit_transform pass replaces the separate fit + transform.
    token_counts = count_vect.fit_transform(model_info.values)

    dummies = pd.get_dummies(df[['City', 'State', 'Brand']],
                             columns=['City', 'State', 'Brand'])
    # Convert the indicator frame to a sparse matrix explicitly rather than
    # relying on hstack to coerce a DataFrame.
    indicator_block = sparse.csr_matrix(dummies.values)

    return sparse.hstack((token_counts, indicator_block))

In [7]:
# Stack train and test so both are encoded against one shared vocabulary and
# one shared set of indicator columns. pd.concat is used because
# DataFrame.append was removed in pandas 2.0.
combined = pd.concat([train, test], ignore_index=True, sort=False)
y = train[target]

overall = feature_engineering(combined)

# Densify once, then split the rows back apart: the first len(train) rows
# came from `train`, the remainder from `test`.
dense_features = overall.toarray()
train_data, test_data = dense_features[:len(train)], dense_features[len(train):]
train_data.shape, test_data.shape

((2326, 1638), (997, 1638))

### cross validation

In [8]:
%%time
import lightgbm as lgb
from sklearn.model_selection import KFold

preds = []
valid_score_list = []
cv = KFold(n_splits=10)
for i, (train_ind, test_ind) in enumerate(cv.split(train_data, y)):
    X_train = train_data[train_ind]
    X_valid = train_data[test_ind]
    X_test = test_data.copy()

    y_train = y[train_ind].values
    y_valid = y[test_ind].values

    clf = lgb.LGBMRegressor(n_jobs=-1,
                            n_estimators=10000, 
                            learning_rate=0.01,                            
                            min_child_samples=1, 
                            colsample_bytree=0.8,
                            metric='rmse')
    clf.fit(X_train, y_train,
            eval_set=[(X_valid, y_valid)], 
            early_stopping_rounds=100, 
            verbose=0)
    
    valid_prediction = clf.predict(X_valid)
    test_prediction = clf.predict(X_test)
    valid_score = scoring((y_valid), (valid_prediction))
    valid_score_list.append(valid_score)
    print("FOLD:{} SCORE:{:0.3f}".format(i+1, valid_score))
    
    preds.append(test_prediction)
    
print("CV score is {}".format(np.mean(valid_score_list)))

FOLD:1 SCORE:0.428
FOLD:2 SCORE:0.438
FOLD:3 SCORE:0.434
FOLD:4 SCORE:0.431
FOLD:5 SCORE:0.324
FOLD:6 SCORE:0.444
FOLD:7 SCORE:0.381
FOLD:8 SCORE:0.491
FOLD:9 SCORE:0.395
FOLD:10 SCORE:0.439
CV score is 0.42048854203350716
CPU times: user 4min 4s, sys: 3.41 s, total: 4min 7s
Wall time: 21.6 s


### submission

In [9]:
# Average the per-fold test predictions element-wise, then exponentiate the
# average to undo the np.log applied to the training target.
prediction = np.exp(np.mean(preds, axis=0))

In [10]:
# One-column frame holding the final predictions under the target's name.
sub = pd.DataFrame(prediction, columns=[target])
# `index` is a boolean flag: pass False explicitly (rather than a falsy None)
# so the row index is not written to the spreadsheet.
sub.to_excel("lgb_v2.xlsx", index=False)
# Last expression of the cell: shows the first rows of the submission.
sub.head()

Unnamed: 0,Price
0,15479.705248
1,8342.797437
2,14422.865689
3,44266.047814
4,7560.477714
