In [19]:
import re
import string

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LinearRegression
# cross_val_score is provided by sklearn.model_selection; the former
# sklearn.cross_validation module was removed in scikit-learn 0.20.
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor

def strip_punct(s):
    """Lower-case every item of ``s`` and remove punctuation characters.

    Each item is converted with ``str()`` first, so non-string values are
    accepted and come back as their cleaned string form.

    Parameters
    ----------
    s : iterable
        Items to clean. Any iterable works (list, tuple, generator, pandas
        Series); items are visited in iteration order rather than looked up
        by position.

    Returns
    -------
    list of str
        One cleaned, lower-cased string per input item.
    """
    # Filtering happens one character at a time, so only single characters
    # can ever match. Newlines and the soft hyphen are dropped as well.
    exclude = set(string.punctuation) | {'\xad', '\n', '•', '«', '»'}
    return [
        ''.join(ch for ch in str(item).lower() if ch not in exclude)
        for item in s
    ]

def vector_prep(l):
    """Replace ``/`` and ``&`` separators with spaces and tidy whitespace.

    Each item is converted with ``str()``, every ``/`` or ``&`` becomes a
    space, and runs of whitespace are collapsed to a single space with no
    leading or trailing blanks. Because of the ``str()`` conversion a float
    NaN comes back as the token ``'nan'``.

    Parameters
    ----------
    l : iterable
        Items to normalise. Any iterable works (list, generator, pandas
        Series); items are visited in iteration order.

    Returns
    -------
    list of str
        One normalised string per input item.
    """
    # split() with no argument both trims and collapses whitespace, so a
    # single substitute-then-rejoin pass covers the whole clean-up.
    return [' '.join(re.sub(r'[/&]', ' ', str(item)).split()) for item in l]

def feature(l1, l2, l3):
    """Combine three parallel lists of strings into one text per position.

    For every index of ``l1`` the matching entries of ``l1``, ``l2`` and
    ``l3`` are lower-cased and joined with single spaces, in that order.
    The length of ``l1`` decides how many entries are produced.

    Returns a list of the combined strings.
    """
    def combine(k):
        return ' '.join((l1[k].lower(), l2[k].lower(), l3[k].lower()))

    return [combine(k) for k in range(len(l1))]

def cros_val(model, x_train_array, price_train, n_run):
    """Print the cross-validation scores of ``model``.

    The scores are obtained from ``cross_val_score`` with ``cv=n_run`` and
    printed after the label ``Cross-validation:``. Nothing is returned.
    """
    fold_scores = cross_val_score(model, x_train_array, price_train, cv=n_run)
    report = 'Cross-validation: {}'.format(fold_scores)
    print(report)

In [9]:
# Fixed seed so the shuffle and both splits select the same rows on every run.
SEED = 42

df = pd.read_csv('data/train.tsv', sep='\t')
df = df.sample(frac=1, random_state=SEED).reset_index(drop=True)

# Take only part of the training data to work on: `train` receives the 10%
# selected by test_size, `other` holds the remaining 90%.
other, train = train_test_split(df, test_size=0.1, random_state=SEED)

# Split that 10% subset into train (70%) and test (30%) data
train, test = train_test_split(train, test_size=0.3, random_state=SEED)

In [10]:

# Pull the three text columns and the price target out of each split as
# plain Python lists.
category_name_train, name_train, brand_train, price_train = (
    train[column].tolist()
    for column in ('category_name', 'name', 'brand_name', 'price')
)

category_name_test, name_test, brand_test, price_test = (
    test[column].tolist()
    for column in ('category_name', 'name', 'brand_name', 'price')
)

In [12]:
# Run vector_prep over every text column of both splits.
category_l_train, name_l_train, brand_l_train = map(
    vector_prep, (category_name_train, name_train, brand_train)
)
category_l_test, name_l_test, brand_l_test = map(
    vector_prep, (category_name_test, name_test, brand_test)
)

# One combined string per row: category, then name, then brand.
feature_train = feature(category_l_train, name_l_train, brand_l_train)
feature_test = feature(category_l_test, name_l_test, brand_l_test)

In [13]:
# Learn the vocabulary from the training text and turn it into token counts.
# X is used as returned by fit_transform; the dense X.toarray() conversion
# stays disabled.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(raw_documents=feature_train)

In [14]:
# Fit a linear regression of price on the token counts. fit() returns the
# estimator itself, so `model` and `model_l` name the same fitted object.
model = LinearRegression()
model.fit(X, price_train)
model_l = model

In [15]:
# Encode the test text with the vectorizer that was fitted on the training
# text, so X_test has exactly the same columns as X. transform() only counts
# tokens from the learned vocabulary; nothing is re-fitted on the test rows.
X_test = vectorizer.transform(feature_test)
newVec = vectorizer  # legacy name kept bound in case a later cell still uses it
prices_predicted = model_l.predict(X_test).tolist()

In [16]:
# Cross-validate the linear model; n_run is passed on as the cv argument.
n_run = 5
cros_val(model=model_l, x_train_array=X, price_train=price_train, n_run=n_run)

Cross-validation: [ 0.17107795  0.13800486  0.03985989  0.22915635  0.17793956]


### Decision Trees:

In [None]:
# Fit a decision-tree regressor on the training matrix (fit() returns the
# estimator), then cross-validate it with n_run passed on as cv.
regr_tree = DecisionTreeRegressor().fit(X, price_train)
n_run = 5
cros_val(model=regr_tree, x_train_array=X, price_train=price_train, n_run=n_run)

### SVR (support vector regression):

In [None]:
# Support vector regression with a degree-2 polynomial kernel and C=1e3.
# SVR is imported in the notebook's top import cell.
svr_poly = SVR(kernel='poly', C=1e3, degree=2)
svr_model = svr_poly.fit(X, price_train)

In [None]:
# Cross-validate the SVR model; n_run is passed on as the cv argument.
n_run = 5
cros_val(model=svr_model, x_train_array=X, price_train=price_train, n_run=n_run)

### Simple NN:

In [None]:
def larger_model(input_dim=13):
    """Build and compile a small fully connected regression network.

    The network stacks ``Dense(13, relu)``, ``Dense(6, relu)`` and a single
    output unit, and is compiled with mean squared error loss and the
    ``adam`` optimizer.

    Parameters
    ----------
    input_dim : int, optional
        Number of input features handed to the first ``Dense`` layer.
        Defaults to 13, the value that used to be hard-coded; pass the
        number of columns of the training matrix when it differs.

    Returns
    -------
    The compiled ``Sequential`` model.

    Notes
    -----
    ``Sequential`` and ``Dense`` must already be in scope when this is
    called; they are not imported in this cell.
    NOTE(review): presumably the Keras ``Sequential`` / ``Dense`` classes - confirm.
    """
    # create model
    model = Sequential()
    model.add(Dense(13, input_dim=input_dim, kernel_initializer='normal', activation='relu'))
    model.add(Dense(6, kernel_initializer='normal', activation='relu'))
    model.add(Dense(1, kernel_initializer='normal'))
    # Compile model
    model.compile(loss='mean_squared_error', optimizer='adam')
    return model