In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import FunctionTransformer, MaxAbsScaler
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [None]:
# Load the training listings; row 0 of the CSV supplies the column names.
df = pd.read_csv(
    './Data/eBayiPadTrain_copy.csv',
    header=0,
)

In [None]:
# Token patterns for CountVectorizer(token_pattern=...).
# The lookahead accepts either trailing whitespace OR end-of-string; with
# whitespace alone the final token of every description was never emitted
# (and a one-word description produced no tokens at all).
# Any run of non-whitespace characters.
BASIC_PATTERN = r'\S+(?=\s+|$)'
# Runs of ASCII letters/digits only; a word directly followed by punctuation
# still does not match (unchanged behaviour).
ALPHANUMERIC_PATTERN = r'[A-Za-z0-9]+(?=\s+|$)'

In [None]:
def get_description(data_frame):
    """Return the ``description`` column with missing values replaced by ''.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        Frame that contains a ``description`` column.

    Returns
    -------
    pandas.Series
        The description text, with NaN/None entries turned into empty strings
        so a text vectorizer can consume every row.
    """
    descriptions = data_frame['description']
    return descriptions.fillna('')

In [None]:
# Free-text column fed to the vectorizer.
TEXT_COLUMNS = ['description']

# Columns passed to the model as-is.
NUMERIC_COLUMNS = ['biddable', 'startprice']

# Columns that get converted to pandas categoricals.
LABELS = [
    'condition',
    'cellular',
    'carrier',
    'color',
    'storage',
    'productline',
]

# Integer-code companion of each label column: "<label>_cat".
CAT_COLS = [label + '_cat' for label in LABELS]

In [None]:
def categorize_label(x):
    """Cast a single column to pandas' ``category`` dtype."""
    return x.astype('category')


# Turn every label column into a categorical, column by column ...
df[LABELS] = df[LABELS].apply(categorize_label, axis=0)

# ... then store each column's integer category codes under "<label>_cat"
# (pandas uses code -1 for missing values).
for label in LABELS:
    code_column = label + '_cat'
    df[code_column] = df[label].cat.codes

In [None]:
# The whole frame is used as X (the pipeline's selector steps pick the columns
# they need); the target is the `sold` column.
X, y = df, df['sold']

# Hold out 30% of the rows for evaluation; fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=64,
)

In [None]:
# Column selectors wrapped as transformers so they can sit inside a Pipeline.
# validate=False lets the DataFrame through untouched (no array conversion).
get_text_data = FunctionTransformer(get_description, validate=False)
get_numeric_data = FunctionTransformer(lambda x: x[NUMERIC_COLUMNS], validate=False)
get_categoric_data = FunctionTransformer(lambda x: x[CAT_COLS], validate=False)

# Baseline model: numeric columns + integer category codes + bag-of-words of
# the description, concatenated by FeatureUnion and fed to a random forest.
pl = Pipeline([
    ('union', FeatureUnion(
        transformer_list=[
            ('numeric_features', Pipeline([
                ('selector', get_numeric_data)
            ])),
            ('categoric_features', Pipeline([
                ('selector', get_categoric_data)
            ])),
            ('text_features', Pipeline([
                ('selector', get_text_data),
                ('vectorizer', CountVectorizer(token_pattern=ALPHANUMERIC_PATTERN))
            ]))
        ])),
    # Seed the forest (same seed as the train/test split) so the reported
    # accuracy and the exported probabilities are reproducible across runs.
    ('clf', RandomForestClassifier(n_estimators=60, random_state=64))
])

In [None]:
# Fit on the training split, then score (mean accuracy) on the held-out split.
accuracy = pl.fit(X_train, y_train).score(X_test, y_test)

print("\nAccuracy: ", accuracy)

In [None]:
# Load the listings to predict on; row 0 of the CSV supplies the column names.
df_test = pd.read_csv(
    './Data/eBayiPadTest_copy.csv',
    header=0,
)

In [None]:
# Preview the first rows of the test frame as loaded from the CSV.
df_test.head()

In [None]:
# Encode the test labels with the category vocabulary of the TRAINING frame.
# Deriving categories from the test data itself gives codes that depend on
# which values happen to occur in the test file, so the same integer could
# denote a different category than the one the model was fitted on.
# Values never seen in training become missing and get code -1.
for lbl in LABELS:
    train_categories = df[lbl].cat.categories
    df_test[lbl] = pd.Categorical(df_test[lbl], categories=train_categories)
    df_test[lbl+'_cat'] = df_test[lbl].cat.codes

In [None]:
# Preview again to inspect the label columns and the added "<label>_cat" code columns.
df_test.head()

In [None]:
# Per-class probabilities for every test row from the fitted pipeline `pl`
# (2-D array: one row per listing, one column per class).
prob = pl.predict_proba(df_test)

In [None]:
# predict_proba orders its columns like the classifier's classes_, so for a
# binary target column 0 is the negative class and column 1 the positive one.
# 'Probability1' must therefore come from column 1, not column 0.
# NOTE(review): assumes `sold` is binary with the positive label sorting last (e.g. 0/1) - confirm.
ret = pd.DataFrame(data={'Probability1': prob[:, 1]}, index=df_test.UniqueID)

In [None]:
# Write the predictions to disk (the UniqueID index becomes the first column),
# then end the cell with the preview so the notebook actually displays it.
ret.to_csv('result_01.csv')
ret.head()

In [None]:
from myutils.sparseinteractions import SparseInteractions
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_selection import chi2, SelectKBest
# Number of text features kept by the SelectKBest(chi2) step of the pipeline.
chi_k = 300

In [None]:
%%time
# Second model (rebinds `pl`, replacing the random-forest pipeline):
# same three feature branches, but the text branch keeps only the chi_k
# best tokens by chi-squared score, then the combined features go through
# SparseInteractions, max-abs scaling and logistic regression.
pl = Pipeline([
    ('union', FeatureUnion(
        transformer_list=[
            ('numeric_features', Pipeline([
                ('selector', get_numeric_data)
            ])),
            ('categoric_features', Pipeline([
                ('selector', get_categoric_data)
            ])),
            ('text_features', Pipeline([
                ('selector', get_text_data),
                ('vectorizer', CountVectorizer(token_pattern=BASIC_PATTERN)),
                # `k` is keyword-only in current scikit-learn; passing it
                # positionally raises TypeError.
                ('dim_red', SelectKBest(chi2, k=chi_k))
            ]))
        ])),
    # Project-local transformer; presumably adds degree-2 interaction terms
    # on sparse input - TODO confirm against myutils.sparseinteractions.
    ('int', SparseInteractions(degree=2)),
    # Scale each feature by its maximum absolute value.
    ('scale', MaxAbsScaler()),
    ('clf', LogisticRegression(solver='lbfgs', max_iter=1000))
])

# Fit on the training split and report accuracy on the held-out split.
pl.fit(X_train, y_train)
accuracy = pl.score(X_test, y_test)

print("\nAccuracy: ", accuracy)