In [1]:
import os
import pandas as pd
import numpy as np

In [2]:
def load_data(csv_path):
    """Read a CSV file into a pandas DataFrame.

    Args:
        csv_path: location of the CSV file (anything ``pd.read_csv`` accepts).
    Returns:
        The parsed DataFrame, using ``pd.read_csv`` defaults.
    """
    frame = pd.read_csv(csv_path)
    return frame

In [3]:
def rmsle(actual, predicted):
    """Root mean squared logarithmic error between two value arrays.

    Args:
        actual (1d-array) - array of actual values (float)
        predicted (1d-array) - array of predicted values (float)
    Returns:
        root mean square log error (float)
    """
    # Compare the values on the log(1 + x) scale.
    log_gap = np.log1p(actual) - np.log1p(predicted)
    mean_squared_log_gap = np.mean(np.power(log_gap, 2))
    return np.sqrt(mean_squared_log_gap)

In [4]:
from sklearn.base import BaseEstimator, TransformerMixin

# Pipeline step that picks a fixed set of DataFrame columns (numerical or
# categorical), since Scikit-Learn doesn't handle DataFrames yet
class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Select columns from a DataFrame and hand them on as a plain array.

    Args:
        attribute_names: column label(s) used to index the incoming DataFrame.
    """

    def __init__(self, attribute_names):
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns the selector itself."""
        return self

    def transform(self, X):
        """Return the ``.values`` of ``X`` restricted to ``attribute_names``."""
        selected = X[self.attribute_names]
        return selected.values

In [5]:
# Directory holding the prepared CSV files. The default is the original
# absolute location; set NOMAD_DATA_DIR to run the notebook on another machine.
DATA_DIR = os.environ.get("NOMAD_DATA_DIR", "/home/agi/Desktop/NOMAD/data")
TRAIN_PATH = os.path.join(DATA_DIR, "train_prepared.csv")
TEST_PATH = os.path.join(DATA_DIR, "test_prepared.csv")

trainData = load_data(TRAIN_PATH)
testData = load_data(TEST_PATH)

# Drop the known duplicate rows.
# The numbers below are row POSITIONS in trainData (they are resolved through
# trainData.index[...]), not values of the 'id' column.
dups = [394,125,1214,1885,2074,352,307,2153,530,1378,2318,2336,2369,2332]
# Index with the flat list: wrapping it in another list (index[[dups]]) is
# multi-dimensional indexing, which pandas no longer supports on an Index.
trainData = trainData.drop(trainData.index[dups])

In [6]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the de-duplicated training data; fixed seed for repeatability.
train_set, test_set = train_test_split(trainData, test_size=0.2, random_state=42)

# Feature-only view of the training split (identifier and both targets removed).
# Columns are named explicitly: passing the axis positionally (drop('id', 1))
# is not accepted by pandas >= 2.0.
train = train_set.drop(
    columns=["id", "formation_energy_ev_natom", "bandgap_energy_ev"]
)

# NOTE(review): the targets are taken from the full trainData, not from
# train_set, so they line up with a fit on all of trainData rather than on the
# training split only.
form_labels = trainData["formation_energy_ev_natom"].copy()
band_labels = trainData["bandgap_energy_ev"].copy()

In [7]:
# Column names fed to the numerical branch: every feature column of `train`
# except the first two, which are removed by position.
numerical_attrbs = list(train.columns)
numerical_attrbs.pop(0)
numerical_attrbs.pop(0)

# Column(s) fed to the one-hot branch.
label_attrbs = ['spacegroup']

In [8]:
from sklearn.preprocessing import OneHotEncoder
# One-hot encode the spacegroup column on its own.
encoder = OneHotEncoder()
spacegroup = trainData["spacegroup"]
# The encoder expects a 2-D input, so the column is reshaped to (n_rows, 1).
spacegroup_column = spacegroup.values.reshape(-1, 1)
spacegroup_1hot = encoder.fit_transform(spacegroup_column)


In [9]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import FeatureUnion

attrs = list(trainData)
hot_attrs = ["spacegroup"]

# Numerical branch: pick the numerical columns, then standardise them.
numerical_steps = [
    ('selector', DataFrameSelector(numerical_attrbs)),
    ('scaler', StandardScaler()),
]
pl1 = Pipeline(numerical_steps)

# Categorical branch: pick the label column(s), then one-hot encode them.
label_steps = [
    ('selector', DataFrameSelector(label_attrbs)),
    ('one_hot', OneHotEncoder()),
]
pl2 = Pipeline(label_steps)

# Both branches run on the same input; their outputs are placed side by side.
full_pl = FeatureUnion(
    transformer_list=[('numerical', pl1), ('label', pl2)]
)

In [10]:
# Fit the full preprocessing pipeline on ALL of trainData and transform it.
# NOTE(review): trainData still contains the rows that train_test_split placed
# in test_set, so the scaler and encoder are fitted on the hold-out rows too.
data_prepared = full_pl.fit_transform(trainData)
# Prints the whole transformed matrix (shown as (row, col) value triples below).
print(data_prepared)

  (0, 0)	0.899877849416
  (0, 1)	0.282932360262
  (0, 2)	-1.16260359879
  (0, 3)	-0.00993688368141
  (0, 4)	0.773641791316
  (0, 5)	-0.629384126672
  (0, 6)	-0.186429361804
  (0, 7)	-0.451480711486
  (0, 8)	-0.187092902296
  (0, 9)	1.17191310414
  (0, 10)	0.0996530705994
  (0, 11)	0.592597796084
  (0, 12)	-1.15259582661
  (0, 13)	1.01052386234
  (0, 14)	-0.957122970386
  (0, 15)	0.625540595187
  (0, 16)	-0.793083600633
  (0, 17)	-1.12595411239
  (0, 18)	-0.186429361804
  (0, 19)	-0.451480711486
  (0, 20)	-0.187092902296
  (0, 21)	0.371897158821
  (0, 22)	1.14248979175
  (0, 24)	1.0
  (1, 0)	0.899877849416
  :	:
  (2384, 24)	1.0
  (2385, 0)	1.84044084306
  (2385, 1)	-0.921045454063
  (2385, 2)	-1.04377821172
  (2385, 3)	-0.167610858241
  (2385, 4)	1.04501304918
  (2385, 5)	-0.650137825425
  (2385, 6)	-0.186353956851
  (2385, 7)	-0.45121482119
  (2385, 8)	-0.187208908317
  (2385, 9)	1.51652343362
  (2385, 10)	1.38220019958
  (2385, 11)	1.76915507392
  (2385, 12)	-1.59839531811
  (2385, 1

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import BaggingRegressor 

# Number of boosting stages and the seed shared by both regressors.
N_ESTIMATORS = 10000
MODEL_SEED = 42  # same seed value the train/test split uses

# One gradient-boosting regressor per target. (The `lin_reg_*` names are kept
# because later cells refer to them; the models are not linear regressions.)
# random_state is fixed so repeated runs of the notebook give the same models.
lin_reg_form = GradientBoostingRegressor(n_estimators=N_ESTIMATORS, random_state=MODEL_SEED)
lin_reg_band = GradientBoostingRegressor(n_estimators=N_ESTIMATORS, random_state=MODEL_SEED)

lin_reg_form.fit(data_prepared, form_labels)
lin_reg_band.fit(data_prepared, band_labels)

In [None]:
from sklearn.metrics import mean_squared_error
# Training-set error: both models are scored on the same data they were fit on.

# Formation energy
predictions_form = lin_reg_form.predict(data_prepared)
lin_mse_form = mean_squared_error(form_labels, predictions_form)
lin_rmse_form = np.sqrt(lin_mse_form)

# Band gap
predictions_band = lin_reg_band.predict(data_prepared)
lin_mse_band = mean_squared_error(band_labels, predictions_band)
lin_rmse_band = np.sqrt(lin_mse_band)

for train_rmse in (lin_rmse_form, lin_rmse_band):
    print(train_rmse)

In [None]:
#feature_importances = lin_reg_form.feature_importances_

#cat_one_hot_attribs = list('1')
#print(encoder.categorical_features)
#attributes = numerical_attrbs + cat_one_hot_attribs
#sorted(zip(feature_importances, attributes), reverse=True)

In [None]:
# RMSE of the formation-energy model on the test_set split.
# NOTE(review): test_set is a subset of trainData, and both the pipeline and
# the model were fit on all of trainData, so this is not a true held-out score.
non_feature_columns = ["id", "formation_energy_ev_natom", "bandgap_energy_ev"]
X_test = test_set.drop(columns=non_feature_columns)
form_test = test_set["formation_energy_ev_natom"]

X_test_prepared = full_pl.transform(X_test)
final_predictions = lin_reg_form.predict(X_test_prepared)

final_mse = mean_squared_error(form_test, final_predictions)
final_rmse = np.sqrt(final_mse)
final_rmse

In [None]:
# RMSE of the band-gap model on the test_set split.
# NOTE(review): test_set is a subset of trainData, and both the pipeline and
# the model were fit on all of trainData, so this is not a true held-out score.
non_feature_columns = ["id", "formation_energy_ev_natom", "bandgap_energy_ev"]
X_test = test_set.drop(columns=non_feature_columns)
y_test = test_set["bandgap_energy_ev"]

X_test_prepared = full_pl.transform(X_test)
final_predictions = lin_reg_band.predict(X_test_prepared)

final_mse = mean_squared_error(y_test, final_predictions)
final_rmse = np.sqrt(final_mse)
final_rmse

In [None]:
# Predict both targets for the competition test set.
# drop() returns a new frame, so its result has to be kept; the original call
# discarded it and also passed the axis positionally, which pandas >= 2.0
# rejects. The pipeline selects its input columns by name, so removing 'id'
# does not change which features are used.
test = testData.drop(columns=['id'])
test_prepared = full_pl.transform(test)
test_predictions_form = lin_reg_form.predict(test_prepared)
test_predictions_band = lin_reg_band.predict(test_prepared)

In [None]:
# Put the two prediction vectors side by side as an (n_rows, 2) array:
# column 0 = formation energy, column 1 = band gap.
# reshape(-1, 1) takes the row count from the data instead of assuming exactly
# 600 test rows.
submission = np.concatenate((test_predictions_form.reshape(-1, 1),
                        test_predictions_band.reshape(-1, 1)), axis=1)
print(submission)

In [None]:
# Wrap the predictions in a labelled frame and replace negative values with 0.
submission_columns = ['formation_energy_ev_natom', "bandgap_energy_ev"]
df = pd.DataFrame(submission, columns=submission_columns)
df = df.mask(df < 0, 0)

In [None]:
# Prepend an 'id' column numbered 1..len(df).
# NOTE(review): this assumes the test rows are in id order starting at 1 — confirm.
df.insert(loc=0, column='id', value=range(1, len(df) + 1))

In [None]:
# Write the submission file without the DataFrame's row index.
SUBMISSION_PATH = "/home/agi/Desktop/NOMAD/submit_10kgb.csv"
df.to_csv(SUBMISSION_PATH, index=False)