### Import and Load

In [1]:
import pandas as pd
import numpy as np
import spacy
import time
import matplotlib.pyplot as plt

train_path = "C:\\Users\\blamarre\\Documents\\Kaggle\\train.tsv"
train = pd.read_csv(train_path, sep='\t')



### NLP Setup

Load the NLP model. Don't use the Dependency Parser or Named Entity Recognition for performance reasons at the moment.

In [2]:
nlp = spacy.load('en', parser=False, ner=False)

### Generate our Bag of Words

In [3]:
#Choose the first 10 descriptions for testing purposes
descriptions=train[train.train_id<10].item_description.astype(str)


#Initialize sets
word_set = set()
lemma_set = set()

#Loop through descriptions, running nlp over each string.
#Append tokens to sets for those that aren't punctuation or part of the stop list
for i in range(len(descriptions)):  
    doc=nlp.tokenizer(descriptions[i])
    for token in doc:
        if token.is_punct == 0 & token.is_stop == 0:
            word_set.add(str.lower(token.text))
            lemma_set.add(str.lower(token.lemma_))


### Define a function that will compare words in item_description to our word bag
We'll map this function to the train.item_description series (column) for each word in the word bag, saving off a 1 if the description contains the word, 0 otherwise.

In [4]:
def series_contains(series,word):
    return series.map(lambda x: word in x).astype(int)

### Generate our One Hot Encoded Matrix

In [5]:
for word in word_set:
    train[word]=series_contains(descriptions,word)

### View our glorious result 
The NaN values are because I only pull the first 10 descriptions into the word bag at the moment.

In [6]:
train

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description,feet,girls,...,adorable,box,banana,scary,washed,lights,ask,shortened,at,synapse
0,0,MLB Cincinnati Reds T Shirt Size XL,3,Men/Tops/T-shirts,,0.0,0.0,No description yet,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,Razer BlackWidow Chroma Keyboard,3,Electronics/Computers & Tablets/Components & P...,Razer,0.0,0.0,This keyboard is in great condition and works ...,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
2,2,AVA-VIV Blouse,1,Women/Tops & Blouses/Blouse,Target,0.0,0.0,Adorable top with a hint of lace and a key hol...,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,Leather Horse Statues,1,Home/Home Décor/Home Décor Accents,,0.0,1.0,New with tags. Leather horses. Retail for [rm]...,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
4,4,24K GOLD plated rose,1,Women/Jewelry/Necklaces,,0.0,0.0,Complete with certificate of authenticity,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
5,5,Bundled items requested for Ruie,3,Women/Other/Other,,0.0,0.0,"Banana republic bottoms, Candies skirt with ma...",0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
6,6,Acacia pacific tides santorini top,3,Women/Swimwear/Two-Piece,Acacia Swimwear,0.0,0.0,Size small but straps slightly shortened to fi...,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
7,7,Girls cheer and tumbling bundle of 7,3,Sports & Outdoors/Apparel/Girls,Soffe,1.0,0.0,You get three pairs of Sophie cheer shorts siz...,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
8,8,Girls Nike Pro shorts,3,Sports & Outdoors/Apparel/Girls,Nike,0.0,0.0,Girls Size small Plus green. Three shorts total.,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,9,Porcelain clown doll checker pants VTG,3,Vintage & Collectibles/Collectibles/Doll,,0.0,0.0,I realized his pants are on backwards after th...,1.0,0.0,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
