# Predict Price Change Effects
Using the reshaped listing price data, the average daily revenue change resulting from a price change is predicted using KNN, a random forest, and a feed-forward neural network.

## Import packages and load in data

In [11]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score 
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.layers import Conv2D, MaxPooling2D, Flatten
from keras.losses import categorical_crossentropy
from keras.optimizers import SGD
from keras.utils import np_utils
import keras.callbacks as cb
from pathlib import Path
pd.options.display.max_columns = 100
pd.options.display.max_rows = 1000

In [2]:
# import data
dir = str(Path().resolve())
pc_df = pd.read_csv(dir + "/../data/price_changes.csv")
list_df = pd.read_csv(dir + "/../data/list_df.csv")

## Prepare data

In [3]:
# rename listing id column
list_df['id'] = list_df['listing_id']
list_df = list_df.drop('listing_id', axis=1)

In [4]:
print("Merging dataframes...")
# merge dataframes
df = pc_df.merge(list_df, on=['id', 'year'], how='left')
print("Done")

Merging dataframes...
Done


In [5]:
# remove rows with missing values
df = df.dropna()

# drop listing id
df = df.drop('id', axis=1)

In [21]:
print("df shape: ", df.shape)

df shape:  (5230391, 43)


In [6]:
def bin_adr(x):
    if x > 0.0:
        return 0
    elif x < 0.0:
        return 1
    else:
        return 2

In [7]:
# bin revenue
df['revenue_change'] = df.revenue_change.apply(bin_adr)

In [8]:
print("Splitting into X and y...")
# split into X and y
y = df['revenue_change']
X = df.drop('revenue_change', axis=1)

print("Getting dummy values...")
# get dummy values
X = pd.get_dummies(X)

print("Selecting features...")
# select 20 best features
X = pd.DataFrame(SelectKBest(score_func=f_classif, k=20).fit_transform(X, y.values))

print("Splitting into train and test...")
# split into training and testing data
X_train, X_test, y_train, y_test = train_test_split(X, y)
print("Done")

Splitting into X and y...
Getting dummy values...
Selecting features...


  f = msb / msw


Splitting into train and test...
Done


## Train models

In [12]:
print("Training KNN...")
knn = KNeighborsClassifier(n_neighbors=10).fit(X_train, y_train)
print("Predicting KNN...")
knn_pred = knn.predict(X_test)

Training KNN...
Predicting KNN...


In [13]:
print("Training random forest...")
rf = RandomForestClassifier(n_estimators=50).fit(X_train, y_train)
print("Predicting random forest...")
rf_pred = rf.predict(X_test)

Training random forest...
Predicting random forest...


In [15]:
# convert y values
nn_train = np_utils.to_categorical(y_train, 3)
nn_test = np_utils.to_categorical(y_test, 3)

# initialize feed-forward neural network
print("Initializing network...")
ffnn = Sequential()
ffnn.add(Dense(units=12, activation='relu', input_dim=20))
ffnn.add(Dropout(0.2))
ffnn.add(Dense(units=10, activation='exponential'))
ffnn.add(Dropout(0.2))
ffnn.add(Dense(units=8, activation='sigmoid'))
ffnn.add(Dropout(0.2))
ffnn.add(Dense(units=3, activation='relu'))

# compile model
print("Compiling model...")
ffnn.compile(loss=categorical_crossentropy, optimizer=SGD(lr=0.01, momentum=0.9, nesterov=True))

# fit model
print("Fitting model...")
ffnn.fit(X_train, nn_train, epochs=10, batch_size=128, verbose=2)

Initializing network...
Compiling model...
Fitting model...
Epoch 1/10
 - 59s - loss: nan
Epoch 2/10
 - 63s - loss: nan
Epoch 3/10
 - 59s - loss: nan
Epoch 4/10
 - 60s - loss: nan
Epoch 5/10
 - 68s - loss: nan
Epoch 6/10
 - 68s - loss: nan
Epoch 7/10
 - 58s - loss: nan
Epoch 8/10
 - 57s - loss: nan
Epoch 9/10
 - 58s - loss: nan
Epoch 10/10
 - 63s - loss: nan


<keras.callbacks.History at 0x1a1d88ce10>

In [16]:
# get ffnn predictions
ffnn_preds = ffnn.predict_classes(X_test)

## View results

In [19]:
print("KNN Results:")
print("accuracy: " + str(accuracy_score(y_test, knn_pred)))
print("precision: " + str(precision_score(y_test, knn_pred, average=None)))
print("recall: " + str(recall_score(y_test, knn_pred, average=None)))
print()
print("RF Results:")
print("accuracy: " + str(accuracy_score(y_test, rf_pred)))
print("precision: " + str(precision_score(y_test, rf_pred, average=None)))
print("recall: " + str(recall_score(y_test, rf_pred, average=None)))
print()
print("FFNN Results:")
print("accuracy: " + str(accuracy_score(y_test, ffnn_preds)))
print("precision: " + str(precision_score(y_test, ffnn_preds, average=None)))
print("recall: " + str(recall_score(y_test, ffnn_preds, average=None)))

KNN Results:
accuracy: 0.6353007575722814
precision: [0.50947136 0.50497735 0.70702975]
recall: [0.46252035 0.39264124 0.78957387]

RF Results:
accuracy: 0.6187023840660508
precision: [0.49549426 0.47101407 0.69487956]
recall: [0.43826491 0.38092479 0.77402013]

FFNN Results:
accuracy: 0.2137705931027732
precision: [0.21376999 1.         0.        ]
recall: [1.0000000e+00 3.5984167e-06 0.0000000e+00]


## Discussion
One contribution to the mediocre results could be the fact that there are significantly more "no change" observations than the other two.  

In [18]:
np.bincount(y_train.values)

array([ 839331,  833825, 2249637])