# Import Packages

In [None]:
import numpy as np 
import pandas as pd 
from subprocess import check_output
from sklearn import preprocessing
from sklearn import ensemble
# Show which files are available in the input directory.
input_listing = check_output(["ls", "../input"])
print(input_listing.decode("utf8"))

# Import Dataset

In [None]:
# Load the tab-separated training and test tables from the input directory.
train = pd.read_csv("../input/train.tsv", sep='\t')
test = pd.read_csv("../input/test.tsv", sep='\t')

# Process Dataset

In [None]:
# Force every description to text, then record its length in characters.
for frame in (train, test):
    frame['item_description'] = frame['item_description'].astype(str)
    frame['des_len'] = frame['item_description'].map(len)

In [None]:
# Number of whitespace-separated words in the description.
for frame in (train, test):
    frame['word_count'] = frame['item_description'].map(lambda text: len(text.split()))

In [None]:
# Words per character in the description (the inverse of the mean word
# length), scaled by 10.
for frame in (train, test):
    words_per_char = frame['item_description'].map(
        lambda text: float(len(text.split())) / len(text)
    )
    frame['mean_des'] = words_per_char * 10

In [None]:
# Length of the listing name in characters.
for frame in (train, test):
    frame['name_len'] = frame['name'].apply(len)

In [None]:
# Number of whitespace-separated words in the listing name.
for frame in (train, test):
    frame['word_name'] = frame['name'].apply(lambda title: len(title.split()))

In [None]:
# Words per character in the name (the inverse of the mean word length),
# scaled by 10.
for frame in (train, test):
    words_per_char = frame['name'].apply(
        lambda title: float(len(title.split())) / len(title)
    )
    frame['mean_name'] = words_per_char * 10

In [None]:
# Preview the first rows of the training frame with the new length features.
train.head()

In [None]:
# Count missing values per column (inspection only; nothing is filled here).
train.isnull().sum()

In [None]:
# Frequency of each distinct category_name value in the training data.
train['category_name'].value_counts()

In [None]:
# Replace missing categories with a three-level placeholder so the later
# split on '/' still yields three parts for every row.
# The result is assigned back instead of calling fillna(inplace=True) on the
# selected column: that chained in-place form is deprecated and leaves the
# frame unchanged when pandas copy-on-write is active.
train['category_name'] = train['category_name'].fillna('ppp/ppp/ppp')
test['category_name'] = test['category_name'].fillna('ppp/ppp/ppp')

In [None]:
# 1 when the category text contains "electronics" (case-insensitive), else 0.
for frame in (train, test):
    frame['elec'] = frame['category_name'].map(
        lambda category: int('electronics' in category.lower())
    )

In [None]:
# Re-check the category frequencies after filling the missing values.
train['category_name'].value_counts()

In [None]:
# Replace missing brands with a placeholder value.
# The result is assigned back instead of calling fillna(inplace=True) on the
# selected column: that chained in-place form is deprecated and leaves the
# frame unchanged when pandas copy-on-write is active.
train['brand_name'] = train['brand_name'].fillna('ttttttt')
test['brand_name'] = test['brand_name'].fillna('ttttttt')

In [None]:
# Count the remaining missing values per column in each frame.
train.isnull().sum()
test.isnull().sum()

In [None]:
# Length in characters of the full category string.
for frame in (train, test):
    frame['cat_lennn'] = frame['category_name'].apply(len)

In [None]:
def was_priced(x):
    """Return 1 if the text ``x`` contains the literal marker '[rm]', else 0."""
    if '[rm]' in x:
        return 1
    return 0
               
# Flag descriptions that contain the '[rm]' marker.
for frame in (train, test):
    frame['rm'] = frame['item_description'].apply(was_priced)

In [None]:
# 0 when the description is exactly the 'No description yet' placeholder,
# 1 for every other description.
for frame in (train, test):
    has_description = frame['item_description'] != 'No description yet'
    frame['was_described'] = has_description.astype(int)

In [None]:
# 1 when the description contains the substring 'new' (case-insensitive),
# else 0. This is a substring test, so it also fires inside longer words.
for frame in (train, test):
    frame['new'] = frame['item_description'].map(
        lambda text: int('new' in text.lower())
    )

In [None]:
# Split category_name into its first three '/'-separated levels; anything
# after the second '/' stays together in sub3.
# `n` is passed by keyword because pandas >= 2.0 rejects it as a positional
# argument. expand=True returns a frame that keeps the source index, so the
# column assignments below align row-for-row whatever index the frame has.
# reindex guarantees three columns even if no row has three levels.
train_cat = (
    train['category_name']
    .str.split('/', n=2, expand=True)
    .reindex(columns=range(3))
)
train_cat.columns = ['sub1', 'sub2', 'sub3']
train['sub1'] = train_cat['sub1']
train['sub2'] = train_cat['sub2']
train['sub3'] = train_cat['sub3']

test_cat = (
    test['category_name']
    .str.split('/', n=2, expand=True)
    .reindex(columns=range(3))
)
test_cat.columns = ['sub1', 'sub2', 'sub3']
test['sub1'] = test_cat['sub1']
test['sub2'] = test_cat['sub2']
test['sub3'] = test_cat['sub3']

train.head()

In [None]:
# 1 when the category text contains "handmade" (case-insensitive), else 0.
for frame in (train, test):
    frame['hand'] = frame['category_name'].map(
        lambda category: int('handmade' in category.lower())
    )

In [None]:
# 1 when the category contains "men" as a whole word (case-insensitive).
# A plain substring test also matches "Women", which would make this flag
# fire for both groups; the \b word boundaries restrict it to "Men".
for frame in (train, test):
    frame['men'] = (
        frame['category_name']
        .str.contains(r'\bmen\b', case=False, regex=True)
        .astype(int)
    )

In [None]:
# 1 when the description contains at least one digit, else 0.
import re

digit_pattern = re.compile(r'\d')
for frame in (train, test):
    frame['int_desc'] = frame['item_description'].map(
        lambda text: int(digit_pattern.search(text) is not None)
    )

In [None]:
# 1 when the listing name contains at least one digit, else 0.
for frame in (train, test):
    frame['int_name'] = frame['name'].map(
        lambda title: int(re.search(r'\d', title) is not None)
    )

In [None]:
# 1 when the description contains the substring 'condition'
# (case-insensitive), else 0.
for frame in (train, test):
    frame['cond'] = frame['item_description'].map(
        lambda text: int('condition' in text.lower())
    )

In [None]:
# Frequency of each category_name value in the training data.
train['category_name'].value_counts()

In [None]:
# Convert price to a log scale.
# Signed log: log(p) where p > 0 and -log(-p) where p < 0. Entries matching
# neither condition (p == 0) keep np.piecewise's default output of 0.
positive = train['price'].values > 0
negative = train['price'].values < 0
train['price'] = np.piecewise(train['price'], (positive, negative), (np.log, lambda x: -np.log(-x)))

# Columns fed to the model, in the order the model will see them.
features = [
    'int_name', 'cond', 'int_desc', 'new', 'was_described', 'men', 'rm',
    'item_condition_id', 'cat_lennn', 'brand_name', 'shipping', 'des_len',
    'name_len', 'mean_des', 'word_count', 'mean_name', 'word_name',
    'sub1', 'sub2', 'hand', 'elec', 'category_name',
]

# Take explicit copies: these frames have columns overwritten later (label
# encoding), and writing into a selection of train/test triggers pandas'
# SettingWithCopyWarning.
data = train[features].copy()
data_sub = test[features].copy()

# Training target: the log-scaled price.
y = train['price']

In [None]:
# Preview the first rows of the test-set feature frame.
data_sub.head()

# Training

In [None]:
# Label-encode the string-valued feature columns. The encoder is fitted on the
# train and test values together, so both frames share one integer mapping.
le = preprocessing.LabelEncoder()
combined = pd.concat([data, data_sub])

for col in ('brand_name', 'sub1', 'sub2', 'category_name'):
    le.fit(combined[col])
    data[col] = le.transform(data[col])
    data_sub[col] = le.transform(data_sub[col])


In [None]:
# Preview the training features after label encoding.
data.head()

In [None]:
# Gradient-boosted regression trees, fitted on the log-scaled price target.
gbr_params = dict(
    learning_rate=0.7,
    n_estimators=700,
    max_depth=3,
    warm_start=True,
    verbose=1,
    random_state=45,
    max_features=0.8,
)
clf = ensemble.GradientBoostingRegressor(**gbr_params)
clf.fit(data, y)

# Predict and Submit Results

In [None]:
# Predict for the test features; the model was fitted on log-scaled prices,
# so these predictions are on the log scale too.
predicted = clf.predict(data_sub) 
# Print the feature names and the fitted model's importances for them.
print(features)
print( clf.feature_importances_)

In [None]:
# Empty frame that will hold the submission columns.
out = pd.DataFrame()

In [None]:
# The model predicts log prices, so exponentiate to get back to price units.
out['test_id'] = test['test_id']
out['price'] = np.exp(predicted)

In [None]:
# Inspect the first rows, then write the submission file without the index.
out.head()
out.to_csv("mercari.csv",index=False)