In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import FunctionTransformer, MaxAbsScaler
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_selection import chi2, SelectKBest
from sklearn.preprocessing import StandardScaler

In [None]:
# k for a chi-squared SelectKBest feature-selection step. In this notebook the
# only place it appears is a disabled (commented-out) 'dim_red' step of the
# text-feature pipeline, so it currently has no effect on the fitted model.
chi_k = 400

In [None]:
# Read the training and test listings; row 0 of each CSV is the header row.
df, df_test = (
    pd.read_csv(csv_path, header=0)
    for csv_path in (
        './Data/eBayiPadTrain_copy.csv',
        './Data/eBayiPadTest_copy.csv',
    )
)

In [None]:
# Stack the description text of both files (missing values become empty
# strings) into a single array, train rows first and test rows after.
description = np.concatenate([
    frame.description.fillna('')
    for frame in (df, df_test)
])

In [None]:
# Build a fixed token vocabulary from every description (train + test) so the
# vectorizer used inside the modelling pipeline always emits the same columns.

# English stop words plus a few product terms. Newer scikit-learn releases
# validate `stop_words` and accept only 'english', a list, or None, so the
# frozenset union is materialised as a (sorted, hence deterministic) list.
stop_words = sorted(text.ENGLISH_STOP_WORDS.union(['apple', 'ipad', 'mini']))

# Alphanumeric runs that are followed by whitespace. Note the lookahead: a
# token at the very end of a description with no trailing whitespace is not
# matched by this pattern.
ALPHANUMERIC_PATTERN = '[A-Za-z0-9]+(?=\\s+)'

vec = CountVectorizer(token_pattern=ALPHANUMERIC_PATTERN, stop_words=stop_words)
vec.fit(description)

# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2
# in favour of `get_feature_names_out()`; use whichever this install provides.
if hasattr(vec, 'get_feature_names_out'):
    vocabulary = vec.get_feature_names_out().tolist()
else:
    vocabulary = vec.get_feature_names()
print("There are {} tokens in the dataset".format(len(vocabulary)))

In [None]:
# Column groups of the listings frame.
# Free-text column(s). NOTE(review): nothing visible in this notebook reads
# TEXT_COLUMNS; get_description accesses the `description` column directly.
TEXT_COLUMNS = ['description']
# Columns selected as-is by the numeric feature selector.
NUMERIC_COLUMNS = ['biddable', 'startprice']
# Columns cast to pandas 'category' dtype and one-hot encoded with
# pd.get_dummies by the categoric feature selector.
LABELS = ['condition', 'cellular', 'carrier', 'color', 'storage', 'productline']

In [None]:
def categorize_label(x):
    """Return `x` cast to the pandas 'category' dtype."""
    return x.astype('category')


# Convert every LABELS column of the training frame, one column at a time.
df[LABELS] = df[LABELS].apply(categorize_label, axis=0)

In [None]:
def get_description(data_frame):
    """Return the `description` column with missing values replaced by ''."""
    descriptions = data_frame['description']
    return descriptions.fillna('')

In [None]:
def _pick_numeric_columns(frame):
    """Return only the NUMERIC_COLUMNS of `frame`."""
    return frame[NUMERIC_COLUMNS]


def _one_hot_label_columns(frame):
    """Return the LABELS columns of `frame` one-hot encoded via pd.get_dummies."""
    return pd.get_dummies(frame[LABELS])


# Stateless selector steps for the pipeline. validate=False hands the
# DataFrame to each function unchanged instead of coercing it to an array.
get_text_data = FunctionTransformer(func=get_description, validate=False)
get_numeric_data = FunctionTransformer(func=_pick_numeric_columns, validate=False)
get_categoric_data = FunctionTransformer(func=_one_hot_label_columns, validate=False)

In [None]:
# The full frame is used as X (the pipeline's selector steps choose which
# columns they consume); the target is the `sold` column.
X, y = df, df['sold']

# 70/30 hold-out split with a fixed seed so the partition is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=64,
)

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import (LinearRegression, TheilSenRegressor, RANSACRegressor, HuberRegressor)
from sklearn.pipeline import make_pipeline
from sklearn import svm
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import AdaBoostClassifier

In [None]:
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.optimizers import SGD
from keras.wrappers.scikit_learn import KerasClassifier
from keras.callbacks import EarlyStopping

def getmodel(input_dim=844, hidden_units=500, n_hidden_layers=5):
    """Build and compile a fully connected two-class softmax classifier.

    Called with no arguments it produces the same architecture as before:
    five ReLU layers of 500 units on an 844-wide input, then a 2-unit
    softmax output, compiled with Adam and categorical cross-entropy.

    Parameters
    ----------
    input_dim : int, default 844
        Number of input features. The default is the value previously
        hard-coded here; presumably it is the width of the feature matrix the
        pipeline produces for this dataset -- confirm if the features change.
    hidden_units : int, default 500
        Width of every hidden layer.
    n_hidden_layers : int, default 5
        Number of hidden ReLU layers; must be at least 1.

    Returns
    -------
    A compiled keras ``Sequential`` model.

    Raises
    ------
    ValueError
        If ``n_hidden_layers`` is smaller than 1.
    """
    if n_hidden_layers < 1:
        raise ValueError("n_hidden_layers must be at least 1")

    model = Sequential()
    # Only the first layer needs the input shape; each later layer takes its
    # input size from the layer before it.
    model.add(Dense(hidden_units, activation='relu', input_shape=(input_dim,)))
    for _ in range(n_hidden_layers - 1):
        model.add(Dense(hidden_units, activation='relu'))
    model.add(Dense(2, activation='softmax'))
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

# EarlyStopping watches 'val_loss' unless told otherwise, but the classifier
# below is fitted without any validation data, so that metric never exists and
# the callback could not stop training. Monitor the training loss instead.
early_stopping_monitor = EarlyStopping(monitor='loss', patience=3)
# To stop on held-out loss instead, add validation_split=0.20 to the
# KerasClassifier arguments and switch the monitor back to 'val_loss'.
clf = KerasClassifier(getmodel, epochs=100, verbose=1, callbacks=[early_stopping_monitor])

In [None]:
# End-to-end model: three feature branches are computed side by side,
# concatenated column-wise by FeatureUnion, and fed to the classifier.
pl = Pipeline(steps=[
    (
        'union',
        FeatureUnion(transformer_list=[
            # Numeric columns, passed through without any scaling step.
            (
                'numeric_features',
                Pipeline(steps=[('selector', get_numeric_data)]),
            ),
            # Label columns, one-hot encoded by the selector itself.
            (
                'categoric_features',
                Pipeline(steps=[('selector', get_categoric_data)]),
            ),
            # Description text, counted against the pre-built vocabulary.
            # No chi-squared feature selection is applied afterwards.
            (
                'text_features',
                Pipeline(steps=[
                    ('selector', get_text_data),
                    ('vectorizer', CountVectorizer(vocabulary=vocabulary)),
                ]),
            ),
        ]),
    ),
    # Final estimator: the Keras network wrapper. Any other scikit-learn
    # compatible estimator (e.g. RandomForestClassifier, LogisticRegression,
    # MultinomialNB) can be swapped in at this step.
    ('clf', clf),
])

In [None]:
# Fit on the training split (Pipeline.fit returns the pipeline itself), then
# score the fitted pipeline on the held-out split.
accuracy = pl.fit(X_train, y_train).score(X_test, y_test)

print("\nAccuracy: ", accuracy)