In [1]:
import pandas as pd
import csv
from scipy import spatial
from sklearn import linear_model
import numpy as np

In [4]:
import zipfile
def unzip(path_to_zip_file):
    zip_ref = zipfile.ZipFile(path_to_zip_file, 'r')
    zip_ref.extractall("data/")
    zip_ref.close()
    
unzip("data/attributes.csv.zip")
unzip("data/product_descriptions.csv.zip")
unzip("data/sample_submission.csv.zip")
unzip("data/train.csv.zip")
unzip("data/test.csv.zip")

df_attributes = pd.read_csv("data/attributes.csv")
df_product_desc = pd.read_csv("data/product_descriptions.csv")
df_sample_submission = pd.read_csv("data/sample_submission.csv")
df_train = pd.read_csv("data/train.csv")
df_test = pd.read_csv("data/test.csv")


def read_csv_to_list(filename):
    with open(filename, 'rb') as f:
        reader = csv.reader(f, )
        l = list(reader)
        return l[1:]  # omit header

attributes = read_csv_to_list("data/attributes.csv")
product_desc = read_csv_to_list("data/product_descriptions.csv")
train = read_csv_to_list("data/train.csv")
test = read_csv_to_list("data/test.csv")

In [3]:
df_attributes.head()

Unnamed: 0,product_uid,name,value
0,100001.0,Bullet01,Versatile connector for various 90° connection...
1,100001.0,Bullet02,Stronger than angled nailing or screw fastenin...
2,100001.0,Bullet03,Help ensure joints are consistently straight a...
3,100001.0,Bullet04,Dimensions: 3 in. x 3 in. x 1-1/2 in.
4,100001.0,Bullet05,Made from 12-Gauge steel


In [4]:
df_product_desc.head()

Unnamed: 0,product_uid,product_description
0,100001,"Not only do angles make joints stronger, they ..."
1,100002,BEHR Premium Textured DECKOVER is an innovativ...
2,100003,Classic architecture meets contemporary design...
3,100004,The Grape Solar 265-Watt Polycrystalline PV So...
4,100005,Update your bathroom with the Delta Vero Singl...


In [5]:
df_sample_submission.head()

Unnamed: 0,id,relevance
0,1,1
1,4,1
2,5,1
3,6,1
4,7,1


In [6]:
df_train.head()

Unnamed: 0,id,product_uid,product_title,search_term,relevance
0,2,100001,Simpson Strong-Tie 12-Gauge Angle,angle bracket,3.0
1,3,100001,Simpson Strong-Tie 12-Gauge Angle,l bracket,2.5
2,9,100002,BEHR Premium Textured DeckOver 1-gal. #SC-141 ...,deck over,3.0
3,16,100005,Delta Vero 1-Handle Shower Only Faucet Trim Ki...,rain shower head,2.33
4,17,100005,Delta Vero 1-Handle Shower Only Faucet Trim Ki...,shower only faucet,2.67


In [7]:
df_test.head()

Unnamed: 0,id,product_uid,product_title,search_term
0,1,100001,Simpson Strong-Tie 12-Gauge Angle,90 degree bracket
1,4,100001,Simpson Strong-Tie 12-Gauge Angle,metal l brackets
2,5,100001,Simpson Strong-Tie 12-Gauge Angle,simpson sku able
3,6,100001,Simpson Strong-Tie 12-Gauge Angle,simpson strong ties
4,7,100001,Simpson Strong-Tie 12-Gauge Angle,simpson strong tie hcc668


In [8]:
%time
tokens = {}
idx = 0

for i, row in enumerate(train):
    assert len(row) == 5, row
    for token in row[2].replace(".","").replace("(","").replace(")","").split(" "):
        if token not in tokens.keys():
            tokens[token] = idx
            idx += 1
    for token in row[3].replace(".","").replace("(","").replace(")","").split(" "):
        if token not in tokens.keys():
            tokens[token] = idx
            idx += 1
    if i == 10:
        break
        
print tokens

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 4.05 µs
{'DeckOver': 10, 'deck': 18, 'over': 19, 'Sensor': 50, 'Premium': 8, 'Emergency': 63, 'only': 36, 'Lighting': 58, 'Not': 31, '8': 74, 'stove': 55, 'emergency': 66, '3/4': 71, 'Stakes': 83, 'Over': 42, 'l': 6, 'x': 72, 'Coating': 17, 'Casing': 77, '1-gal': 11, 'mdf': 78, 'Only': 24, 'microwave': 54, '3': 73, 'Vero': 21, 'Metal': 82, 'Range': 44, 'faucet': 37, 'Fluted': 76, 'of': 69, 'Convection': 45, '4-Pack': 84, '#SC-141': 12, 'Industries': 81, 'Stainless': 47, 'steele': 85, 'Black': 61, '1-Handle': 22, 'Strong-Tie': 1, 'cu': 40, 'Whirlpool': 38, 'Fara': 70, 'MDF': 75, 'Delta': 20, 'Included': 32, 'Steel': 48, 'head': 35, 'Valve': 30, 'Textured': 9, 'Concrete': 16, 'otr': 53, 'with': 49, 'LED': 62, '19': 39, 'House': 68, 'Fixture': 64, 'shower': 34, 'Quantum': 59, 'Unit': 65, 'and': 15, 'Angle': 3, 'Chrome': 29, 'Cooking': 51, 'Faucet': 25, 'Microwave': 46, 'Wood': 14, 'BEHR': 7, 'in': 28, 'convection': 52, 'Valley': 79, 

In [9]:
%time
product_title_space = []
search_term_space = []
for i, row in enumerate(train):
    product_title_vector = [0]*idx
    for token in row[2].replace(".","").replace("(","").replace(")","").split(" "):
        j = tokens.get(token)
        product_title_vector[j] = 1
    product_title_space.append(product_title_vector)
    
    search_term_vector = [0]*idx
    for token in row[3].replace(".","").replace("(","").replace(")","").split(" "):
        j = tokens.get(token)
        search_term_vector[j] = 1
    search_term_space.append(search_term_vector)

    if i == 10:
        break

        
print product_title_space

CPU times: user 2 µs, sys: 1 µs, total: 3 µs
Wall time: 6.2 µs
[[1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [10]:
pd.DataFrame(product_title_space).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,77,78,79,80,81,82,83,84,85,86
0,1,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
pd.DataFrame(search_term_space).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,77,78,79,80,81,82,83,84,85,86
0,0,0,0,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [38]:
cos_dists = []
actual_relevance = []
for i, row in enumerate(train):
    cos_dists.append([spatial.distance.cosine(product_title_space[i], search_term_space[i])])
    actual_relevance.append(row[4])
    if i == 10:
        break
print cos_dists
print actual_relevance

[[1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [0.79587585476806855], [1.0]]
['3', '2.5', '3', '2.33', '2.67', '3', '2.67', '3', '2.67', '3', '2.67']


In [14]:
from sklearn import datasets
import numpy as np

# Load the diabetes dataset
diabetes = datasets.load_diabetes()


# Use only one feature
diabetes_X = diabetes.data[:, np.newaxis, 2]

# Split the data into training/testing sets
diabetes_X_train = diabetes_X[:-20]
diabetes_X_test = diabetes_X[-20:]

# Split the targets into training/testing sets
diabetes_y_train = diabetes.target[:-20]
diabetes_y_test = diabetes.target[-20:]

In [35]:
print diabetes_X_train.shape

(422, 1)


In [47]:
print diabetes_y_train.shape

(422,)


In [45]:
x_train = np.array(cos_dists)
y_train = np.array(actual_relevance)
print x_train.shape
print y_train.shape
print type(y_train)

(11, 1)
(11,)
<type 'numpy.ndarray'>


In [42]:
x_test = np.array(cos_dists)
y_test = np.array(actual_relevance)

In [43]:
from sklearn import linear_model

# Create linear regression object
regr = linear_model.LinearRegression()

# Train the model using the training sets
regr.fit(x_train, y_train)

# The coefficients
print('Coefficients: \n', regr.coef_)
# The mean square error
print("Residual sum of squares: %.2f"
      % np.mean((regr.predict(x_test) - y_test) ** 2))
# Explained variance score: 1 is perfect prediction
print('Variance score: %.2f' % regr.score(x_test, y_test))

# Plot outputs
# plt.scatter(diabetes_X_test, diabetes_y_test,  color='black')
# plt.plot(diabetes_X_test, regr.predict(diabetes_X_test), color='blue',
#          linewidth=3)

# plt.xticks(())
# plt.yticks(())

# plt.show()

TypeError: cannot perform reduce with flexible type