### Necessary imports 

In [1]:
import numpy as np
import pandas as pd
import time
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.decomposition import PCA
from sklearn.preprocessing import Normalizer
from sklearn.metrics import accuracy_score
from sklearn import metrics
from sklearn import neighbors
from sklearn.model_selection import cross_val_score

### Reading in CSV files and updating data

In [2]:
# reading in the csv and xlsx files
test = pd.read_csv('test.csv', parse_dates = ['timestamp'])
train = pd.read_csv('train.csv', parse_dates = ['timestamp'])
macro_df = pd.read_csv('macro.csv', parse_dates = ['timestamp'])
data_fix = pd.read_excel('BAD_ADDRESS_FIX.xlsx').drop_duplicates('id').set_index('id')

# updating the data
test.update(data_fix)
train.update(data_fix)

# drop id
train.drop(['id'], axis = 1, inplace = True)
test.drop(['id'], axis = 1, inplace = True)

# get feature columns
train_features = list(train)
test_features = list(test)

### Drop NaN and initialize our Training and Testing sets

In [3]:
# drops any rows with NaN
test = test.dropna(how = 'any')
train = train.dropna(how = 'any')

# set training and testing sets
X_train = train[train_features[:-1]]
y_train = train['price_doc'].values
X_test = test[test_features]
y_test = train.sample(n = len(X_test.index))['price_doc']

print('X_train: {}\nX_test: {}\ny_train: {}\nMacro: {}'.format(X_train.shape, X_test.shape, y_train.shape, macro_df.shape))

X_train: (6076, 290)
X_test: (2443, 290)
y_train: (6076,)
Macro: (2484, 100)


### Merging Macro data set into Training and Testing

In [4]:
# most relevant columns taken from: https://www.kaggle.com/robertoruiz/dealing-with-multicollinearity# 
macro_cols = ["timestamp","balance_trade","balance_trade_growth","eurrub","average_provision_of_build_contract",
              "micex_rgbi_tr","micex_cbi_tr","deposits_rate","mortgage_value","mortgage_rate"]

# concatenate the training and testing data into one
df = pd.concat([X_train, X_test])
print('Before merge: {}'.format(df.shape))

# merge the macro data with testing and training
df = df.merge(macro_df[macro_cols], on = 'timestamp', how = 'left')
print('Merged with Macro: {}'.format(df.shape))

# dropping timestamp
df.drop(['timestamp', 'product_type'], axis = 1, inplace = True)
print('After dropping timestamp and product type {}'.format(df.shape))

Before merge: (8519, 290)
Merged with Macro: (8519, 299)
After dropping timestamp and product type (8519, 297)


### Spliting Numeric and Object data

In [5]:
# splitting numeric data and object data
# factorize to turn characters into numeric data
# https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.select_dtypes.html
df_num = df.select_dtypes(exclude=['object'])
df_obj = df.select_dtypes(include=['object']).copy()
for c in df_obj:
    df_obj[c] = pd.factorize(df_obj[c])[0]

print(df_num.shape, df_obj.shape)
df_values = pd.concat([df_num, df_obj], axis=1)
print(df_values.shape)
#df_csv = df_values.to_csv('new-training-and-testing.csv')

(8519, 283) (8519, 14)
(8519, 297)


In [6]:
# new training and testing data based on the macro, train, and test csv
train_df = df_values[:X_train.shape[0]]
test_df = df_values[X_train.shape[0]:]
print('train_df: {}\ntest_df: {}\ny_train: {} '.format(train_df.shape, test_df.shape, y_train.shape))

train_df: (6076, 297)
test_df: (2443, 297)
y_train: (6076,) 


### Normalizing Data

In [31]:
# normalizing data
scaler = Normalizer().fit(train_df)
normalized_training = scaler.transform(train_df)
normalized_testing = scaler.transform(test_df)
print('train_df: {}\ntest_df: {}\ny_train: {} '.format(normalized_training.shape, normalized_testing.shape, y_train.shape))

train_df: (6076, 297)
test_df: (2443, 297)
y_train: (4860,) 


### Linear Regression without PCA

In [8]:
# fitting data
lin_reg = LinearRegression(normalize = True, n_jobs = -1)
start = time.time()
lin_reg.fit(normalized_training, y_train)
end = time.time()
elapsed = end - start
print(elapsed / 60)

0.0033559560775756835


In [9]:
y_predict_lr = lin_reg.predict(normalized_testing)
mse = metrics.mean_squared_error(y_test, y_predict_lr)
rmse = np.sqrt(mse)
print("Rounded Mean Square Error: {:.5f}".format(rmse))

lin_reg_accuracy = cross_val_score(lin_reg, normalized_training, y_train, cv = 10, scoring = 'neg_mean_squared_error', n_jobs = -1)
positive_lin_reg = -lin_reg_accuracy
rmse_list = np.sqrt(positive_lin_reg)
print("Rounded Mean Square Error using Cross Validation: {:.5f}".format(rmse_list.mean()))

Rounded Mean Square Error: 7680624.15944
Rounded Mean Square Error using Cross Validation: 18720741.03859


### Linear Regression with PCA

In [10]:
# pca
n = 10
pca = PCA(n_components = n)
pca_training = pca.fit_transform(normalized_training)
pca_testing = pca.fit_transform(normalized_testing)

print(pca_training.shape, pca_testing.shape)

(6076, 10) (2443, 10)


In [11]:
lin_reg.fit(pca_training, y_train)
pca_y_predict_lr = lin_reg.predict(pca_testing)
mse = metrics.mean_squared_error(y_test, pca_y_predict_lr)
rmse = np.sqrt(mse)
print("Rounded Mean Square Error: {:.5f}".format(rmse))

lin_reg_accuracy = cross_val_score(lin_reg, pca_training, y_train, cv = 10, scoring = 'neg_mean_squared_error', n_jobs = -1)
positive_lin_reg = -lin_reg_accuracy
rmse_list = np.sqrt(positive_lin_reg)
print("Rounded Mean Square Error using Cross Validation: {:.5f}".format(rmse_list.mean()))

Rounded Mean Square Error: 6507949.81181
Rounded Mean Square Error using Cross Validation: 6018836.02765


### Decision Tree without PCA

In [12]:
decision_tree = DecisionTreeClassifier(random_state=2)
decision_tree.fit(normalized_training, y_train)
y_predict_dt = decision_tree.predict(normalized_testing)
score_dt = accuracy_score(y_test, y_predict_dt)

print("Decision Tree Score: {:.5f}".format(score_dt))

Decision Tree Score: 0.00941


### Decision Tree with PCA

In [13]:
decision_tree = DecisionTreeClassifier(random_state=2)
decision_tree.fit(pca_training, y_train)
pca_y_predict_dt = decision_tree.predict(pca_testing)
pca_score_dt = accuracy_score(y_test, pca_y_predict_dt)

print("Decision Tree Score: {:.5f}".format(pca_score_dt))

Decision Tree Score: 0.01064


### Random Forest without PCA

In [14]:
random_forest = RandomForestClassifier(n_estimators = 19, bootstrap = True, random_state = 2, n_jobs = -1)
random_forest.fit(normalized_training, y_train)
y_predict_rf = random_forest.predict(normalized_testing)
score_rf = accuracy_score(y_test, y_predict_rf)

print("Random Forest Score: {:.5f}".format(score_rf))

Random Forest Score: 0.01228


### Random Forest with PCA

In [15]:
random_forest = RandomForestClassifier(n_estimators = 19, bootstrap = True, random_state = 2, n_jobs = -1)
random_forest.fit(pca_training, y_train)
pca_y_predict_rf = random_forest.predict(pca_testing)
pca_score_rf = accuracy_score(y_test, pca_y_predict_rf)

print("Random Forest Score: {:.5f}".format(pca_score_rf))

Random Forest Score: 0.01801


### KNN without PCA

In [16]:
knn = neighbors.KNeighborsClassifier(n_neighbors = 5)
knn.fit(normalized_training, y_train)
knn_predict = knn.predict(normalized_testing)
knn_score = accuracy_score(y_test, knn_predict)
print('KNN Accuracy without PCA: {:.5f}'.format(knn_score))

KNN Accuracy without PCA: 0.01883


### KNN with PCA

In [17]:
knn.fit(pca_training, y_train)
knn_predict = knn.predict(pca_testing)
knn_score = accuracy_score(y_test, knn_predict)
print('KNN Accuracy with PCA: {:.5f}'.format(knn_score))

KNN Accuracy with PCA: 0.02251


### Cross-Validation

In [18]:
decision_tree_score = cross_val_score(decision_tree, normalized_training, y_train, cv = 10, scoring = 'accuracy', n_jobs = -1)
knn_score = cross_val_score(knn, normalized_training, y_train, cv = 10, scoring = 'accuracy', n_jobs = -1)
cv_random_tree = cross_val_score(random_forest, normalized_training, y_train, cv = 10, scoring = 'accuracy', n_jobs = -1)
print('KNN Accuracy: {:.5f} (+/- {:.5f})'.format(knn_score.mean(), knn_score.std() * 2))
print('Decision Tree Accuracy: {:.5f} (+/- {:.5f})'.format(decision_tree_score.mean(), decision_tree_score.std() * 2))
print('Random Forest Accuracy: {:.5f} (+/- {:.5f})'.format(cv_random_tree.mean(), cv_random_tree.std() * 2))



KNN Accuracy: 0.01901 (+/- 0.01386)
Decision Tree Accuracy: 0.02240 (+/- 0.00989)
Random Forest Accuracy: 0.02433 (+/- 0.01375)


### Cross Validation with PCA

In [19]:
pca_decision_tree_score = cross_val_score(decision_tree, pca_training, y_train, cv = 10, scoring = 'accuracy', n_jobs = -1)
pca_knn_score = cross_val_score(knn, pca_training, y_train, cv = 10, scoring = 'accuracy', n_jobs = -1)
pca_random_tree = cross_val_score(random_forest, pca_training, y_train, cv = 10, scoring = 'accuracy', n_jobs = -1)
print('KNN Accuracy: {:.5f} (+/- {:.5f})'.format(pca_knn_score.mean(), pca_knn_score.std() * 2))
print('Decision Tree Accuracy: {:.5f} (+/- {:.5f})'.format(pca_decision_tree_score.mean(), pca_decision_tree_score.std() * 2))
print('Randon Forest Accuracy: {:.5f} (+/- {:.5f})'.format(pca_random_tree.mean(), pca_random_tree.std() * 2))



KNN Accuracy: 0.01927 (+/- 0.01599)
Decision Tree Accuracy: 0.01632 (+/- 0.01098)
Randon Forest Accuracy: 0.01847 (+/- 0.01098)


### Logistic Regression without PCA

In [20]:
log_reg = LogisticRegression()
start = time.time()
log_reg.fit(normalized_training, y_train)
end = time.time()
elapsed = end - start
print('Time taken to fit Logistic Regression: {:.5f}'.format(elapsed / 60))

Time taken to fit Logistic Regression: 2.43660


In [21]:
y_predict_lr = log_reg.predict(normalized_testing)
score_lr = accuracy_score(y_test, y_predict_lr)
print('Logistic Regression Score: {:.5f}'.format(score_lr))

log_reg_cross = cross_val_score(log_reg, normalized_training, y_train, cv = 10, scoring = 'accuracy', n_jobs = -1)
print('Logistic Regression Cross Validation Score: {:.5f} (+/- {:.5f})'.format(log_reg_cross.mean(), log_reg_cross.std() * 2))

Logistic Regression Score: 0.04052




Logistic Regression Cross Validation Score: 0.04878 (+/- 0.02543)


### Logistic Regression with PCA

In [22]:
pca_log_reg = LogisticRegression()
start = time.time()
pca_log_reg.fit(pca_training, y_train)
end = time.time()
elapsed = end - start
print('Time taken to fit Logistic Regression: {:.5f}'.format(elapsed /60 ))

Time taken to fit Logistic Regression: 0.17698


In [23]:
pca_predict_lr = pca_log_reg.predict(pca_testing)
pca_score_lr = accuracy_score(y_test, pca_predict_lr)
print('PCA Logistic Regression Score: {:.5f}'.format(pca_score_lr))

pca_log_reg = cross_val_score(pca_log_reg, pca_training, y_train, cv = 10, scoring = 'accuracy', n_jobs = -1)
print('PCA Logistic Regression CV Score: {:.5f}'.format(pca_log_reg.mean()))

PCA Logistic Regression Score: 0.03725




PCA Logistic Regression CV Score: 0.04916
