# Predict Price Change Effects
Using the reshaped listing price data, this notebook predicts the direction of the average daily revenue change (increase, decrease, or no change) that results from a price change, using KNN, a random forest, and a feed-forward neural network.

## Import packages and load in data

In [6]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score 
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.layers import Conv2D, MaxPooling2D, Flatten
from keras.losses import categorical_crossentropy
from keras.optimizers import SGD
from keras.utils import np_utils
import keras.callbacks as cb
from pathlib import Path
# raise the pandas display limits so wide and long frames are not truncated
pd.set_option("display.max_columns", 100)
pd.set_option("display.max_rows", 1000)

In [2]:
# read both input tables from `../data`, resolved against the current
# working directory
# NOTE(review): `dir` shadows the Python builtin of the same name; the name is
# kept unchanged in case other cells reference it.
dir = str(Path().resolve())
pc_df, list_df = (
    pd.read_csv(dir + relative_path)
    for relative_path in ("/../data/price_changes.csv", "/../data/list_df.csv")
)

## Prepare data

In [3]:
# expose the listing identifier under the name `id`: a copy of `listing_id`
# is stored as `id`, then the original `listing_id` column is removed
list_df = list_df.assign(id=list_df['listing_id']).drop(columns=['listing_id'])

In [4]:
print("Merging dataframes...")
# left join: every pc_df row is kept and the list_df columns are attached
# wherever both `id` and `year` match
df = pd.merge(pc_df, list_df, how='left', on=['id', 'year'])
print("Done")

Merging dataframes...
Done


In [5]:
# keep only rows without missing values, then discard the listing id column
df = df.dropna().drop(columns=['id'])

In [7]:
# report the (rows, columns) dimensions of df
print("df shape: ", df.shape, sep=" ")

df shape:  (5230391, 43)


In [8]:
def bin_adr(x):
    """Map a numeric change to a class label based on its sign.

    Returns 0 when ``x`` is positive, 1 when ``x`` is negative, and 2 for
    anything else (zero, or a value such as NaN for which both comparisons
    are false).
    """
    if x > 0.0:
        return 0
    return 1 if x < 0.0 else 2

In [9]:
# replace every revenue change with the class label bin_adr assigns to it
df['revenue_change'] = df['revenue_change'].map(bin_adr)

In [11]:
print("Splitting into X and y...")
# split into X (features) and y (the revenue_change target column)
y = df['revenue_change']
X = df.drop('revenue_change', axis=1)

# manually select columns to drop
X = X.drop(
    [
        'calculated_host_listings_count',
        'host_listings_count',
        'last_scraped',
        'latitude',
        'longitude',
        'review_scores_checkin',
        'review_scores_accuracy',
        'review_scores_cleanliness',
        'review_scores_communication',
        'review_scores_location',
        'review_scores_rating',
    ],
    axis=1,
)

print("Getting dummy values...")
# get dummy values: one-hot encode the remaining non-numeric columns
X = pd.get_dummies(X)

print("Splitting into train and test...")
# split into training and testing data
# random_state pins the shuffle so a rerun reproduces the same partition;
# stratify=y keeps the class proportions of y equal in train and test, which
# matters because the classes are not evenly represented
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, stratify=y
)
print("Done")

Splitting into X and y...
Getting dummy values...
Splitting into train and test...
Done


## Train models

In [None]:
print("Training random forest...")
# random_state makes the fitted forest (and therefore the reported metrics)
# reproducible across runs; n_jobs=-1 builds the 50 trees on all available
# cores without changing the result for a fixed seed
rf = RandomForestClassifier(n_estimators=50, random_state=42, n_jobs=-1).fit(X_train, y_train)
print("Predicting random forest...")
rf_pred = rf.predict(X_test)

In [None]:
# convert y values to one-hot rows with 3 columns (one per class)
nn_train = np_utils.to_categorical(y_train, 3)
nn_test = np_utils.to_categorical(y_test, 3)

# initialize feed-forward neural network
print("Initializing network...")
ffnn = Sequential()
# the input width is read from the training matrix: the number of columns
# depends on what pd.get_dummies produced, so a hard-coded value would raise
# a shape error whenever it differs
ffnn.add(Dense(units=12, activation='relu', input_dim=X_train.shape[1]))
ffnn.add(Dropout(0.2))
# NOTE(review): an 'exponential' hidden activation is unbounded and can
# overflow on unscaled inputs; consider 'relu' here and scaling X beforehand
ffnn.add(Dense(units=10, activation='exponential'))
ffnn.add(Dropout(0.2))
ffnn.add(Dense(units=8, activation='sigmoid'))
ffnn.add(Dropout(0.2))
# softmax output: categorical cross-entropy expects each output row to be a
# probability distribution over the 3 classes, which 'relu' does not produce
ffnn.add(Dense(units=3, activation='softmax'))

# compile model
print("Compiling model...")
ffnn.compile(loss=categorical_crossentropy, optimizer=SGD(lr=0.01, momentum=0.9, nesterov=True))

# fit model
print("Fitting model...")
ffnn.fit(X_train, nn_train, epochs=10, batch_size=128, verbose=2)

In [None]:
# get ffnn predictions: the predicted class of each row is the index of its
# largest network output. This is what Sequential.predict_classes computes for
# a multi-unit output layer, but it also works on Keras versions where
# predict_classes has been removed.
ffnn_preds = np.argmax(ffnn.predict(X_test), axis=-1)

## View results

In [None]:
def _print_scores(heading, predicted):
    """Print accuracy and per-class precision/recall of `predicted` against y_test."""
    print(heading)
    print("accuracy: " + str(accuracy_score(y_test, predicted)))
    # average=None yields one score per class instead of a single aggregate
    print("precision: " + str(precision_score(y_test, predicted, average=None)))
    print("recall: " + str(recall_score(y_test, predicted, average=None)))


_print_scores("RF Results:", rf_pred)
print()
_print_scores("FFNN Results:", ffnn_preds)

## Discussion
One contributor to the mediocre results could be class imbalance: there are significantly more "no change" observations than observations in either of the other two classes.

In [None]:
# count the training observations per class (position i = count of label i)
class_counts = np.bincount(y_train.values)
class_counts