In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

In [3]:
df = pd.read_csv('./Data/eBayiPadTrain_copy.csv', header=0)

In [None]:
df.head()

In [None]:
df.biddable.nunique()
df.info()
df.condition.nunique()
df.condition.unique()

In [None]:
description = df.description.fillna('')
description.head()

In [7]:
BASIC_PATTERN = '\\S+(?=\\s+)'
ALPHANUMERIC_PATTERN = '[A-Za-z0-9]+(?=\\s+)'

In [8]:
vec_basic = CountVectorizer(token_pattern=BASIC_PATTERN)
vec_alphanumeric = CountVectorizer(token_pattern=ALPHANUMERIC_PATTERN)

In [9]:
vec_basic.fit_transform(description)
print("There are {} tokens are in the dataset.".format(len(vec_basic.get_feature_names())))

There are 1337 tokens are in the dataset.


In [10]:
vec_alphanumeric.fit_transform(description)
print("There are {} tokens are in the dataset.".format(len(vec_alphanumeric.get_feature_names())))

There are 834 tokens are in the dataset.


In [29]:
X = df
y = df.sold
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=64)

In [12]:
def get_description(data_frame):
    return data_frame.description.fillna('')

In [13]:
TEXT_COLUMNS = ['description']
NUMERIC_COLUMNS = ['biddable', 'startprice']

In [24]:
LABELS = ['condition', 'cellular', 'carrier', 'color', 'storage', 'productline']
categorize_label = lambda x: x.astype('category')
df[LABELS] = df[LABELS].apply(categorize_label, axis=0)
for lbl in LABELS:
    df[lbl+'_cat'] = df[lbl].cat.codes

In [15]:
NUMERIC_COLS = ['biddable', 'startprice', 'condition_cat', 'cellular_cat', 'carrier_cat', 'color_cat', 'storage_cat', 'productline_cat']

In [16]:
get_text_data = FunctionTransformer(get_description, validate=False)
pl = Pipeline([
    ('text_features', Pipeline([
        ('selector', get_text_data),
        ('vectorizer', CountVectorizer(token_pattern=BASIC_PATTERN))
    ])),
    ('clf', RandomForestClassifier(n_estimators=30))
])

In [17]:
pl.fit(X_train, y_train)
accuracy = pl.score(X_test, y_test)

print("\nAccuracy: ", accuracy)


Accuracy:  0.5867620751341681


In [18]:
pred = pl.predict(X_test)
#X_test.head()

In [31]:
#get_text_data = FunctionTransformer(lambda x: x[TEXT_COLUMNS], validate=False)
get_text_data = FunctionTransformer(get_description, validate=False)
get_numeric_data = FunctionTransformer(lambda x: x[NUMERIC_COLS], validate=False)
get_non_numeric_data = FunctionTransformer(lambda x: x[NON_NUMERIC_COLS], validate=False)

pl = Pipeline([
    ('union', FeatureUnion(
    transformer_list=[
        ('numeric_features', Pipeline([
            ('selector', get_numeric_data)
        ])),
        #('non_numeric_features', Pipeline([
        #    ('selector', get_non_numeric_data)
        #])),
        ('text_features', Pipeline([
            ('selector', get_text_data),
            ('vectorizer', CountVectorizer(token_pattern=BASIC_PATTERN))
        ]))
    ])),
    ('clf', RandomForestClassifier(n_estimators=30))
])

In [32]:
pl.fit(X_train, y_train)
accuracy = pl.score(X_test, y_test)

print("\nAccuracy: ", accuracy)


Accuracy:  0.8121645796064401
