In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import FunctionTransformer, MaxAbsScaler
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_selection import chi2, SelectKBest
from sklearn.preprocessing import StandardScaler

In [None]:
# Number of features the chi-squared selector (SelectKBest) keeps from the
# vectorized description text in the pipeline.
chi_k = 400

In [None]:
# Load the training CSV; its first row (row 0) supplies the column names.
df = pd.read_csv(
    filepath_or_buffer='./Data/eBayiPadTrain_copy.csv',
    header=0,
)

In [None]:
# Token pattern for the description vectorizer: a run of ASCII letters/digits
# that is followed by whitespace OR by the end of the string.
# The end-of-string alternative matters: with a whitespace-only lookahead the
# final word of every description (and any one-word description) was dropped.
ALPHANUMERIC_PATTERN = '[A-Za-z0-9]+(?=\\s+|$)'

In [None]:
def get_description(data_frame):
    """Return the ``description`` column with missing values replaced by ''.

    Parameters
    ----------
    data_frame : object exposing a ``description`` attribute (e.g. a
        DataFrame that has a ``description`` column).

    Returns
    -------
    The result of ``fillna('')`` on that column, so downstream text
    processing never sees a missing value.
    """
    descriptions = data_frame.description
    return descriptions.fillna('')

In [None]:
# Column groups used by the feature selectors.
# Free-text column(s).
TEXT_COLUMNS = [
    'description',
]
# Columns fed to the model as plain numbers.
NUMERIC_COLUMNS = [
    'biddable',
    'startprice',
]
# Columns treated as categorical labels (cast to `category`, then one-hot encoded).
LABELS = [
    'condition',
    'cellular',
    'carrier',
    'color',
    'storage',
    'productline',
]

In [None]:
def categorize_label(x):
    """Return ``x`` cast to the pandas ``category`` dtype."""
    return x.astype('category')


# Convert every LABELS column, one column at a time (axis=0), and write the
# converted columns back onto the frame.
df[LABELS] = df[LABELS].apply(categorize_label, axis=0)

In [None]:
def select_numeric_columns(data_frame):
    """Return only the NUMERIC_COLUMNS of ``data_frame``."""
    return data_frame[NUMERIC_COLUMNS]


def encode_categoric_columns(data_frame):
    """Return indicator (dummy) columns for the LABELS columns of ``data_frame``.

    The encoding is computed independently on whatever frame is passed in.
    It relies on the LABELS columns already being ``category`` dtype (cast on
    the full frame before the train/test split) so that each split yields the
    same set of indicator columns.
    """
    return pd.get_dummies(data_frame[LABELS])


# Column selectors wrapped as transformers so they can sit inside a Pipeline.
# Named functions are used instead of lambdas: a FunctionTransformer built
# from a lambda cannot be pickled, which would prevent saving the fitted model.
# validate=False hands each function its input as-is, without array validation.
get_text_data = FunctionTransformer(get_description, validate=False)
get_numeric_data = FunctionTransformer(select_numeric_columns, validate=False)
get_categoric_data = FunctionTransformer(encode_categoric_columns, validate=False)

In [None]:
# The whole frame is the feature input (the pipeline's selectors pick the
# columns they need); the `sold` column is the target.
X, y = df, df['sold']

# Hold out 30% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=64,
)

In [None]:
# scikit-learn's built-in English stop words plus three extra terms.
# NOTE(review): no visible cell passes `stop_words` to the vectorizer —
# confirm whether it was meant to be used there.
stop_words = text.ENGLISH_STOP_WORDS | {'apple', 'ipad', 'mini'}

In [None]:
# Model pipeline: three feature branches are computed from the same input
# frame, concatenated column-wise by FeatureUnion, and fed to the classifier.
pl = Pipeline([
    ('union', FeatureUnion(
        transformer_list=[
            # Numeric columns, passed through unscaled.
            ('numeric_features', Pipeline([
                ('selector', get_numeric_data),
                #('scaler', StandardScaler())
            ])),
            # One-hot indicator columns for the categorical labels.
            ('categoric_features', Pipeline([
                ('selector', get_categoric_data)
            ])),
            # Token counts from the description, reduced to the chi_k tokens
            # with the highest chi-squared score against the target.
            ('text_features', Pipeline([
                ('selector', get_text_data),
                ('vectorizer', CountVectorizer(token_pattern=ALPHANUMERIC_PATTERN)),
                # `k` must be passed by keyword: it is a keyword-only argument
                # in current scikit-learn, so a positional value raises TypeError.
                ('dim_red', SelectKBest(chi2, k=chi_k))
            ]))
        ])),
    # Fixed seed (same value as the train/test split) so the reported
    # accuracy is reproducible across runs.
    ('clf', RandomForestClassifier(n_estimators=50, random_state=64))
    #('clf', LogisticRegression(solver='liblinear', penalty='l2'))
])

In [None]:
# Fit on the training split, then score on the held-out split.
# Pipeline.fit returns the fitted pipeline, so the two calls can be chained.
accuracy = pl.fit(X_train, y_train).score(X_test, y_test)

print("\nAccuracy: ", accuracy)

In [None]:
# Distinct values present in the `productline` column.
df['productline'].unique()