# Analyzing Text in eBay Auction Titles

The goal of the following code is to use language-processing techniques to parse the eBay auction titles and see whether there is any correlation between certain words and the success of the auction.

In [5]:
%matplotlib inline
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
import csv, json, time, sklearn
from pattern.en import parse
from pattern.en import pprint
from pattern.vector import stem, PORTER, LEMMA
from sklearn.feature_extraction import text 
import re

We first load the data, although we are only concerned with a few of its columns.

In [8]:
# Read the auction data set from the CSV file into a DataFrame; the file
# handle is closed automatically when the `with` block exits.
with open('dftouse.csv', 'r') as csv_handle:
    dftouse = pd.read_csv(csv_handle)


In [10]:
# Preview the first rows of the loaded frame to check the available columns.
dftouse.head()

Unnamed: 0.1,Unnamed: 0,country,itemId,paymentMethod,postalCode,sellingStatus,topRatedListing,AutoPay,AvailableForPickupDropOff,BestOfferEnabled,BidCount,BuyItNowPrice,FinalPrice,GlobalShipping,HandlingTime,HitCount,ListingStatus,ListingType,Location,MinimumToBid,ProductID,Quantity,QuantityThreshold,ReserveMet,ShipToLocations,Title,UserID,FeedbackScore,PositiveFeedbackPercent,SellerBusinessType,NewUser,nPhotos,nPaymentMethods,nShippingCountries,conditionId,bestOfferEnabled,buyItNowAvailable,expeditedShipping,shippingType,oneDayShippingAvailable,RatingsPresent,AverageRating,NegFeedback30Day,NegFeedback365Day,PosFeedback30Day,PosFeedback365Day,auctionLength
0,0,0,131642735687,0,10590,0,False,False,0,False,41,,202.09,False,2,190,Completed,0,"South Salem, New York",204.59,,1,0,,US,iPhone 5s-32gb-Space Gray-unlocked,jdwein1980,19,100.0,2,False,3,1,1,5,False,False,True,2,False,False,,0,0,5,5,168
1,1,0,262126933108,0,35004,0,False,False,0,False,4,,260.0,False,3,32,Completed,0,"Moody, Alabama",265.0,,1,0,,US,apple iphone 5s 32gb,forwhlrcr1965,565,100.0,1,False,2,1,1,5,False,False,True,0,False,False,,0,0,3,32,120
2,2,0,252155964081,0,98104,0,False,False,0,False,3,,152.5,False,2,168,Completed,0,"Seattle, Washington",155.0,168534287.0,1,0,,Worldwide,Apple iPhone 5s - 32GB SPACE GRAY AT&T,15-alice,15,94.1,1,False,2,1,1,5,False,False,False,1,False,False,,0,1,2,17,120
3,3,0,252163774777,0,98104,0,False,False,0,False,1,,99.0,False,2,143,Completed,0,"Seattle, Washington",100.0,168553370.0,1,0,,US,Apple iPhone 5s - 32GB - Silver (Verizon) Smar...,15-alice,15,94.1,1,False,7,1,1,9,False,False,True,0,False,False,,0,1,2,17,120
4,4,0,252171170160,0,98104,0,False,True,0,False,5,,96.0,False,2,357,Completed,0,"Seattle, Washington",97.0,168534287.0,1,0,,US,Apple iPhone 5s - 32GB SPACE GRAY AT&T,15-alice,15,94.1,1,False,2,1,1,5,False,False,False,1,False,False,,0,1,2,17,120


Time to parse the titles. This is done almost exactly as in HW5, except that we want to broaden our parsing: we don't care if there are no descriptors or if we are working with pronouns. Right now, we just want to see which words people have been using in eBay titles.

In [40]:
# Characters treated as punctuation when filtering tokens.
# The previous literal contained `''` inside a single-quoted string, which
# Python parses as two adjacent string literals being concatenated, so the
# apostrophe was silently left out of the list. A double-quoted literal makes
# the apostrophe an ordinary character.
punctuation = list(".,;:!?()[]{}`'\"@#$^&*+-|=~_")
import re
from sklearn.feature_extraction import text

# scikit-learn's built-in English stop-word collection, used to discard
# filler tokens.
stopwords = text.ENGLISH_STOP_WORDS

# Runs of one or more periods / hyphens; get_parts replaces each run with a
# single space before parsing.
regex1 = re.compile(r"\.{1,}")
regex2 = re.compile(r"\-{1,}")

In [58]:
def get_parts(thetext):
    """Split a piece of text into per-sentence lists of nouns and adjectives.

    Runs of periods and hyphens are replaced by spaces, then the text is
    tagged with ``parse``. For each token, the value collected is
    ``token[4]`` (presumably the lemma, since ``lemmata=True`` is requested)
    and ``token[1]`` is used as the part-of-speech tag.

    A token is dropped when ``token[4]`` is empty, is a stop word, starts or
    ends with a punctuation character, or is a single character.

    Returns a pair ``(nouns, descriptives)``. Each element is a list holding
    one inner list per parsed sentence. Tokens tagged JJ/JJR/JJS go to
    ``descriptives``; every other surviving token is considered a noun for
    now.
    """
    adjective_tags = ['JJ', 'JJR', 'JJS']
    cleaned = regex2.sub(' ', regex1.sub(' ', thetext))

    nouns = []
    descriptives = []
    for sentence in parse(cleaned, tokenize=True, lemmata=True).split():
        sentence_nouns = []
        sentence_descriptives = []
        for token in sentence:
            lemma = token[4]
            if len(lemma) == 0:
                continue
            # Same filter for adjectives and nouns: stop words, tokens that
            # begin or end with punctuation, and single letters are skipped.
            rejected = (lemma in stopwords
                        or lemma[0] in punctuation
                        or lemma[-1] in punctuation
                        or len(lemma) == 1)
            if rejected:
                continue
            if token[1] in adjective_tags:
                sentence_descriptives.append(lemma)
            else:
                sentence_nouns.append(lemma)
        nouns.append(sentence_nouns)
        descriptives.append(sentence_descriptives)
    return nouns, descriptives

Now we run `get_parts` on the titles of the auctions in our data, starting with the first five. Let's see what we get.

In [59]:
# Try the parser on the first five titles only.
first_titles = dftouse['Title'][0:5]
test = first_titles.map(get_parts)

In [61]:
# Nouns found in the first title: element 0 of the (nouns, descriptives) pair.
test[0][0]

[[u'iphone', u'5s', u'32gb', u'space', u'gray', u'unlock']]

In [204]:
# Delimiters on which a title is broken into words. Consecutive delimiters
# yield empty strings, which are left in the token lists at this stage.
title_splitter = re.compile('[.,;:!?(){}`\"@#$*|=~_ +-]')


def _split_title(title):
    """Return the lower-cased pieces of `title` obtained by splitting on the delimiters."""
    return [piece.lower() for piece in title_splitter.split(title)]


titles = dftouse['Title'].map(_split_title)

In [205]:
# Vocabulary: every distinct token that appears in any title.
# assumption here that punctuation does not affect ebay auctions
uniqueWords = set(word for line in titles for word in line)


In [206]:
# Number of distinct tokens found across all titles.
len(uniqueWords)

1430

In [207]:
# Work on a copy of the data whose Title column holds the token lists.
dfnew = dftouse.copy()
dfnew['Title'] = titles

# create a dict with keys of unique words: one tally per response class,
# every word starting at zero
pos_resp_count = dict.fromkeys(uniqueWords, 0)
neg_resp_count = dict.fromkeys(uniqueWords, 0)

    
def pos_counter(sentence):
    """Add one to the `pos_resp_count` tally of each word in `sentence`."""
    for token in sentence:
        pos_resp_count[token] = pos_resp_count[token] + 1
def neg_counter(sentence):
    """Add one to the `neg_resp_count` tally of each word in `sentence`."""
    for token in sentence:
        neg_resp_count[token] = neg_resp_count[token] + 1


In [208]:
# The response variable: one sellingStatus value per auction.
responses = dfnew['sellingStatus']
responses.head()

0    0
1    0
2    0
3    0
4    0
Name: sellingStatus, dtype: int64

In [209]:
# Start from zero so that re-running this cell does not double count.
for counts in (pos_resp_count, neg_resp_count):
    for word in counts:
        counts[word] = 0

# Walk responses and titles in lockstep instead of indexing by label, which
# only works when the index is the default 0..n-1 range.
# Titles with sellingStatus == 0 feed the "pos" tally, all others the "neg"
# tally. NOTE(review): presumably 0 marks a successful auction -- confirm.
for response, title in zip(responses, titles):
    # Empty strings are artefacts of splitting on adjacent delimiters; skip
    # them here so the cell still works after their key has been removed.
    words = [word for word in title if word]
    if response == 0:
        pos_counter(words)
    else:
        neg_counter(words)

# Drop the empty-string entry if present; unlike `del`, pop() with a default
# does not raise when the key is already gone.
pos_resp_count.pop('', None)
neg_resp_count.pop('', None)

In [210]:
# Only words seen more than this many times in a tally are listed.
MIN_WORD_COUNT = 100

# (word, count) pairs above the threshold, ordered from least to most frequent.
new_pos = sorted(
    [(key, pos_resp_count[key])
     for key in pos_resp_count.keys()
     if pos_resp_count[key] > MIN_WORD_COUNT],
    key=lambda pair: pair[1])
new_neg = sorted(
    [(key, neg_resp_count[key])
     for key in pos_resp_count.keys()
     if neg_resp_count[key] > MIN_WORD_COUNT],
    key=lambda pair: pair[1])

new_neg

[('new', 118),
 ('esn', 148),
 ('condition', 185),
 ('clean', 193),
 ('sprint', 219),
 ('black', 329),
 ('gsm', 467),
 ('white', 504),
 ('silver', 515),
 ('factory', 608),
 ('t', 683),
 ('mobile', 704),
 ('gold', 777),
 ('verizon', 866),
 ('space', 1049),
 ('gray', 1093),
 ('at&t', 1560),
 ('unlocked', 1595),
 ('smartphone', 1648),
 ('apple', 2964),
 ('32gb', 3083),
 ('5s', 3272),
 ('iphone', 3289)]

In [211]:
# Frequent words from the pos_resp_count tally (titles with sellingStatus == 0).
new_pos

[('w/', 102),
 ('imei', 102),
 ('bundle', 112),
 ('used', 117),
 ('free', 125),
 ('&', 125),
 ('grey', 130),
 ('gb', 131),
 ('32', 138),
 ('box', 144),
 ('mint', 148),
 ('good', 151),
 ('black', 154),
 ('new', 161),
 ('with', 168),
 ('case', 175),
 ('great', 180),
 ('esn', 226),
 ('excellent', 229),
 ('gsm', 232),
 ('white', 233),
 ('sprint', 308),
 ('t', 319),
 ('clean', 324),
 ('mobile', 350),
 ('condition', 452),
 ('factory', 1115),
 ('silver', 1384),
 ('gold', 1680),
 ('verizon', 1747),
 ('unlocked', 2123),
 ('at&t', 2461),
 ('gray', 2614),
 ('space', 2639),
 ('smartphone', 4512),
 ('apple', 5933),
 ('32gb', 6076),
 ('5s', 6312),
 ('iphone', 6344)]