In [38]:
import csv, string, re, time, nltk, sys
import numpy as np
import pandas as pd
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LinearRegression

# Uncomment if you haven't downloaded nltk stopwords
# nltk.download('stopwords')
stopwords = stopwords.words('english') # Stopwords = common words, useful to filter them out)

### Parse data

In [39]:
def tsv_parser(filename, delimiter='\t', encoding='utf-8'):
    with open(filename, 'r', encoding=encoding) as tsv_file:
        tsv_reader = csv.reader(tsv_file, delimiter=delimiter)
        df = pd.DataFrame(data=list(tsv_reader))
    return df

# Using function to extract the data from the files
print('Parsing data from TSV file...')
start_time = time.time()
df = tsv_parser('train.tsv')
print(f'Done. Took {time.time()-start_time:.2f}s')

Parsing data from TSV file...
Done. Took 8.61s


### Prepare data

In [40]:
# Use first row for column names, then drop it from the data
df.columns = df.iloc[0]
df = df.reindex(df.index.drop(0))

# Could either use train_id for the index, or drop it and stick to pandas' index. Uncomment as appropriate.
# df.set_index('train_id', inplace=True) # Use train_id as the index
del df['train_id'] # Drop train id

df.head()

Unnamed: 0,name,item_condition_id,category_name,brand_name,price,shipping,item_description
1,MLB Cincinnati Reds T Shirt Size XL,3,Men/Tops/T-shirts,,10.0,1,No description yet
2,Razer BlackWidow Chroma Keyboard,3,Electronics/Computers & Tablets/Components & P...,Razer,52.0,0,This keyboard is in great condition and works ...
3,AVA-VIV Blouse,1,Women/Tops & Blouses/Blouse,Target,10.0,1,Adorable top with a hint of lace and a key hol...
4,Leather Horse Statues,1,Home/Home Décor/Home Décor Accents,,35.0,1,New with tags. Leather horses. Retail for [rm]...
5,24K GOLD plated rose,1,Women/Jewelry/Necklaces,,44.0,0,Complete with certificate of authenticity


In [41]:
# We will now split the dataframe into X (features/inputs) and y (target variable).
y = df.loc[:, 'price']
y = pd.to_numeric(y)
y.head()

1    10.0
2    52.0
3    10.0
4    35.0
5    44.0
Name: price, dtype: float64

In [42]:
X = df
del X['price']
X.head()

Unnamed: 0,name,item_condition_id,category_name,brand_name,shipping,item_description
1,MLB Cincinnati Reds T Shirt Size XL,3,Men/Tops/T-shirts,,1,No description yet
2,Razer BlackWidow Chroma Keyboard,3,Electronics/Computers & Tablets/Components & P...,Razer,0,This keyboard is in great condition and works ...
3,AVA-VIV Blouse,1,Women/Tops & Blouses/Blouse,Target,1,Adorable top with a hint of lace and a key hol...
4,Leather Horse Statues,1,Home/Home Décor/Home Décor Accents,,1,New with tags. Leather horses. Retail for [rm]...
5,24K GOLD plated rose,1,Women/Jewelry/Necklaces,,0,Complete with certificate of authenticity


In [92]:
from collections import defaultdict

num_categories_dict = defaultdict(int)
for i, category in enumerate(X['category_name']):
    categories = category.split('/')
    num_categories = len(categories)
    if categories[0] != '':
        num_categories_dict[num_categories] += 1
    else:
        num_categories_dict['null'] += 1
#     if num_categories > 3:
#         print(num_categories, category)
        
print(num_categories_dict)
for k,v in num_categories_dict.items():
    print(f'Number of entries with {k} categories: {v}')

defaultdict(<class 'int'>, {3: 1471819, 'null': 6327, 5: 3059, 4: 1330})
Number of entries with 3 categories: 1471819
Number of entries with null categories: 6327
Number of entries with 5 categories: 3059
Number of entries with 4 categories: 1330


In [45]:
def format_string(s):
    """
    Takes a string as input and 'cleans' it.
    """
    # Lowercase
    s = s.lower()
    # Remove hyperlinks
    s = re.sub(r'https?:\/\/.*\/\w*', '', s)
    # Replace hyphens with a space
    s = s.replace('-', ' ')
    # Remove punctuation (important that we removed usernames & hyperlinks before this stage)
    s = re.sub(r'[^\w\s]', '', s)
    # Remove words with <=2 letters
    s = re.sub(r'\b\w{1,2}\b', '', s)
    # Replace excess whitespace with a single space, then strip the whitespace on the edges of the string
    s = re.sub(r'\s\s+', ' ', s)
    s = s.strip() 
    return s

In [46]:
# Choose how much of the dataset to use (e.g. N=5000 means use first 5k entries). Set = len(X) to use full dataset.
#n = len(X)
n = 5000 # we use first the first 'n' entries (use a low value for fast testing)

# We will store our cleaned strings in these variables (one for each column we are cleaning).
names_clean = np.empty(n, dtype='U50')
categories_clean = np.empty((n,3), dtype='U50')

start_time = time.time()
print('Cleaning data...')

counter = 10000
# For each row...
for i in range(len(X[:n])):
    row = X.iloc[i].loc[['name', 'category_name']] # row[0] = name, row[1] = category_name
    for col in range(len(row)):
        if col == 0:
            item = row[col]
            body_tokens = []
            # Format the name column
            tokens = format_string(item).split(' ')
            # If nothing is in this part of the tweet after formatting, can skip to next item
            if tokens[0] == '':
                continue
            else:
                # Filter tokens in the tweet item to remove stopwords & tokens that contain non-alphabet characters
                tokens = [t for t in tokens if t not in stopwords and t.isalpha()]
                # Put the tokens back together as one string
                joined_string = ' '.join(tokens)
                # Append it to the items in the body
                # (this is necessary because some tweets have their text body split into multiple items)
                body_tokens.append(joined_string)
            # Join the body into one string
            body = ' '.join(body_tokens)
            names_clean[i] = body
        else:
            category_name = row[col]
            categories = category_name.split('/')
            num_categories = len(categories)
            if num_categories < 3:
                categories_clean[i,:] = np.nan, np.nan, np.nan
            else:
                categories_clean[i,:] = categories[:3] # only use the first three categories -> ignore further subcategories

    
    # Every 1k iterations, update progress bar (or if we are about to finish).
    if counter == 10000 or i == n-1:
        j = (i + 1) / n
        sys.stdout.write('\r')
        sys.stdout.write("[%-20s] %d%%" % ('='*int(20*j), 100*j))
        sys.stdout.flush()
        # Reset counter.
        counter = 0   
    counter += 1
    
print(f'\nDone. Took {time.time()-start_time:.2f}s')

Cleaning data...
Done. Took 3.19s


In [48]:
# Put cleaned data into a new "cleaned" dataframe.
X_clean = X[:n]

X_clean.loc[:, 'name'] = names_clean

X_clean.loc[:, 'category1'] = categories_clean[:,0]
X_clean.loc[:, 'category2'] = categories_clean[:,1]
X_clean.loc[:, 'category3'] = categories_clean[:,2]

del X_clean['category_name']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item_labels[indexer[info_axis]]] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


Unnamed: 0,name,item_condition_id,brand_name,shipping,item_description,category1,category2,category3
1,mlb cincinnati reds shirt size,3,,1,No description yet,Men,Tops,T-shirts
2,razer blackwidow chroma keyboard,3,Razer,0,This keyboard is in great condition and works ...,Electronics,Computers & Tablets,Components & Parts
3,ava viv blouse,1,Target,1,Adorable top with a hint of lace and a key hol...,Women,Tops & Blouses,Blouse
4,leather horse statues,1,,1,New with tags. Leather horses. Retail for [rm]...,Home,Home Décor,Home Décor Accents
5,gold plated rose,1,,0,Complete with certificate of authenticity,Women,Jewelry,Necklaces
6,bundled items requested ruie,3,,0,"Banana republic bottoms, Candies skirt with ma...",Women,Other,Other
7,acacia pacific tides santorini top,3,Acacia Swimwear,0,Size small but straps slightly shortened to fi...,Women,Swimwear,Two-Piece
8,girls cheer tumbling bundle,3,Soffe,1,You get three pairs of Sophie cheer shorts siz...,Sports & Outdoors,Apparel,Girls
9,girls nike pro shorts,3,Nike,0,Girls Size small Plus green. Three shorts total.,Sports & Outdoors,Apparel,Girls
10,porcelain clown doll checker pants vtg,3,,0,I realized his pants are on backwards after th...,Vintage & Collectibles,Collectibles,Doll


In [62]:
X_clean.head()

Unnamed: 0,name,item_condition_id,brand_name,shipping,item_description,category1,category2,category3
1,mlb cincinnati reds shirt size,3,,1,No description yet,Men,Tops,T-shirts
2,razer blackwidow chroma keyboard,3,Razer,0,This keyboard is in great condition and works ...,Electronics,Computers & Tablets,Components & Parts
3,ava viv blouse,1,Target,1,Adorable top with a hint of lace and a key hol...,Women,Tops & Blouses,Blouse
4,leather horse statues,1,,1,New with tags. Leather horses. Retail for [rm]...,Home,Home Décor,Home Décor Accents
5,gold plated rose,1,,0,Complete with certificate of authenticity,Women,Jewelry,Necklaces


In [60]:
# Only include features I have been working on (temporary)
X_train = X_clean[:]
del X_train['item_condition_id']
del X_train['brand_name']
del X_train['shipping']
del X_train['item_description']

X_train.head()

Unnamed: 0,name,category1,category2,category3
1,mlb cincinnati reds shirt size,Men,Tops,T-shirts
2,razer blackwidow chroma keyboard,Electronics,Computers & Tablets,Components & Parts
3,ava viv blouse,Women,Tops & Blouses,Blouse
4,leather horse statues,Home,Home Décor,Home Décor Accents
5,gold plated rose,Women,Jewelry,Necklaces


### Vectorise words

In [77]:
# Vectorise words
print('Vectorising words...')
start_time = time.time()

vectorizer = CountVectorizer(analyzer='word', lowercase=True)

name_features = vectorizer.fit_transform(X_train.loc[:, 'name'])
cat1_features = vectorizer.fit_transform(X_train.loc[:, 'category1'])
cat2_features = vectorizer.fit_transform(X_train.loc[:, 'category2'])
cat3_features = vectorizer.fit_transform(X_train.loc[:, 'category3'])

# For testing
features = vectorizer.fit_transform(X_clean.loc[:, 'item_description'])

#df_train = pd.DataFrame(vectorizer.transform(X_train).todense(), columns=vectorizer.get_feature_names())
#vectorizer = CountVectorizer(analyzer='word', lowercase=True)

#X_clean.loc[:, 'name'] = vectorizer.fit_transform(X_clean.loc[:, 'name'])
#X_clean.loc[:, 'category_name'] = vectorizer.fit_transform(X_clean.loc[:, 'category_name'])
#X_clean.loc[:, 'item_description'] = vectorizer.fit_transform(X_clean.loc[:, 'item_description'])

print(f'Done. Time taken: {time.time() - start_time:.2f}s')

Vectorising words...
Done. Time taken: 0.18s


### ML algorithm

In [78]:
# Simple linear regression for testing
start_time = time.time()

# Fit model
print('Fitting model (Linear Regression)...')
model_lr = LinearRegression()
model_lr.fit(features, y[:n])

print(f'Done. Time taken: {time.time() - start_time:.2f}s\n')

# Predictions for train and test data
y_preds = model_lr.predict(features)

Fitting model (Linear Regression)...
Done. Time taken: 8.38s



### Cost function

Evaluation metric is Root Mean Squared Logarithmic Error (RMSLE).

In [79]:
# Mean Squared Error (MSE)
RMSE = np.sqrt(sum((y_preds - y[:n])**2) / n)
print(RMSE)

# Room Mean Squared Logarithmic Error (RMSLE)
#RMSLE = np.sqrt(sum((np.log(y_preds + 1) - np.log(y[:n] + 1))**2) / n)
#print(RMSLE)

23.857362674848975


In [80]:
print(max(y))
print(min(y))
print(np.mean(y))
print(np.std(y))

2009.0
0.0
26.737516146330442
38.586053299794486
