In [2]:
# Import Libraries
import os
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

from sklearn.neural_network import MLPRegressor

from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [3]:
# Load the training and test tables from the CSV files in the working directory.
train, test = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

In [4]:
# Drop the identifier column from both frames so it is not used as a model input.
# The test-set IDs are kept in `testPrpId` because the submission file is keyed on them.
# (The original chained assignment also bound `y` to these IDs by accident; `y` is
# reserved for the training target and is no longer touched here.)
train = train.drop(columns=['Property_ID'])
testPrpId = test.pop('Property_ID')  # pop() returns the column and removes it from `test`

In [5]:
# Separate the regression target from the feature columns.
# pop() hands back the target column and deletes it from `train` in one step.
y = train.pop('Habitability_score')

In [6]:
# Converting string columns to numeric
categorical_features = ['Property_Type', 'Furnishing', 'Frequency_of_Powercuts', 'Power_Backup', 'Water_Supply', 'Crime_Rate', 'Dust_and_Noise']
labelencoder = LabelEncoder()

# Integer code given to test categories that never occur in train.
# LabelEncoder.transform raises ValueError on unseen labels, so the test column is
# encoded through a lookup of the fitted classes instead of calling transform on it.
UNSEEN_CATEGORY_CODE = -1

for feature in categorical_features:
    # astype(str) turns missing values into the literal string 'nan', so a missing
    # value is encoded as its own category and there is nothing left to impute in
    # these columns afterwards.
    train_labels = train[feature].astype(str)
    test_labels = test[feature].astype(str)

    # Fit on train only; the codes are the positions in the sorted `classes_` array.
    labelencoder.fit(train_labels)
    code_by_label = {str(label): code for code, label in enumerate(labelencoder.classes_)}

    train[feature] = labelencoder.transform(train_labels)
    test[feature] = (
        test_labels.map(code_by_label)
        .fillna(UNSEEN_CATEGORY_CODE)
        .astype('int64')
    )

# impute missing values
numerical_features = ['Property_Area', 'Number_of_Windows', 'Number_of_Doors','Traffic_Density_Score','Air_Quality_Index','Neighborhood_Review']
for feature in numerical_features:
    # Fill both frames with the TRAINING mean so train and test are imputed with the
    # same statistic. Assigning the result back avoids chained in-place fillna, which
    # does not reliably modify the parent DataFrame.
    train_mean = train[feature].mean()
    train[feature] = train[feature].fillna(train_mean)
    test[feature] = test[feature].fillna(train_mean)

In [7]:
# Preview the first five rows of the feature frame.
train.head(n=5)

Unnamed: 0,Property_Type,Property_Area,Number_of_Windows,Number_of_Doors,Furnishing,Frequency_of_Powercuts,Power_Backup,Water_Supply,Traffic_Density_Score,Crime_Rate,Dust_and_Noise,Air_Quality_Index,Neighborhood_Review
0,1,106,3.923768,1,1,0,1,3,5.89,1,2,90.0,3.86
1,1,733,2.0,2,2,1,1,2,4.37,3,2,96.0,3.55
2,1,737,4.0,2,0,0,1,3,7.45,1,2,121.0,3.81
3,1,900,3.0,2,2,2,2,3,6.16,2,2,100.0,1.34
4,2,2238,14.0,6,0,0,1,0,5.46,3,2,116.0,4.77


In [9]:
# Count the distinct (non-missing) values found in every column.
train.nunique(axis=0)

Property_Type                6
Property_Area             4435
Number_of_Windows           17
Number_of_Doors              6
Furnishing                   4
Frequency_of_Powercuts       5
Power_Backup                 3
Water_Supply                 5
Traffic_Density_Score      772
Crime_Rate                   5
Dust_and_Noise               4
Air_Quality_Index          489
Neighborhood_Review        417
dtype: int64

In [10]:
# Categorical columns: fill gaps with a constant, then one-hot encode.
# Categories unseen at fit time are ignored (encoded as all zeros) at transform time.
categorical_features = [
    'Property_Type',
    'Furnishing',
    'Frequency_of_Powercuts',
    'Power_Backup',
    'Water_Supply',
    'Crime_Rate',
    'Dust_and_Noise',
]
categorical_steps = [
    ('imputer', SimpleImputer(strategy='constant')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Numerical columns: fill gaps with the column mean learned at fit time.
numerical_features = [
    'Property_Area',
    'Number_of_Windows',
    'Number_of_Doors',
    'Traffic_Density_Score',
    'Air_Quality_Index',
    'Neighborhood_Review',
]
numerical_steps = [('imputer', SimpleImputer(strategy='mean'))]
numerical_transformer = Pipeline(steps=numerical_steps)

In [11]:
# Combine the numerical and categorical pipelines into one column-wise preprocessor.
# sparse_threshold=0 forces a dense array: the one-hot step emits sparse output, and
# whether ColumnTransformer densifies it otherwise depends on a density heuristic.
# The StandardScaler applied in the following cell centres the data and cannot do
# that on a sparse matrix, so the dense result is made explicit here.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features),
    ],
    sparse_threshold=0,
)

# Fit on the training features only, then apply the same fitted transform to test.
# From here on `train` and `test` are plain arrays, not DataFrames.
train = preprocessor.fit_transform(train)
test = preprocessor.transform(test)

In [12]:
# Standardise every feature column.
from sklearn.preprocessing import StandardScaler

# The scaling statistics are learned from the training matrix and then applied
# unchanged to both matrices.
sc = StandardScaler().fit(train)
train = sc.transform(train)
test = sc.transform(test)


In [13]:
# Hold out 20% of the labelled rows for validation; the seed keeps the split repeatable.
holdout_split = train_test_split(train, y, random_state=42, test_size=0.2)
X_train, X_test, y_train, y_test = holdout_split

In [20]:
# Candidate models to evaluate (currently a single multi-layer perceptron).
mlp_settings = dict(
    hidden_layer_sizes=(10, 10, 5, 3),
    batch_size=16,
    max_iter=1000,
    alpha=0.0001,
    solver='adam',
    verbose=10,          # log the training loss while fitting
    tol=1e-9,
    random_state=42,
    learning_rate_init=0.001,
)
models = [MLPRegressor(**mlp_settings)]

In [None]:
# Fit every candidate on the training split and report its R^2 on the held-out split.
for model in models:
    # fit() returns the estimator itself, so the prediction can be chained onto it.
    y_pred = model.fit(X_train, y_train).predict(X_test)
    model_name = type(model).__name__
    holdout_r2 = r2_score(y_test, y_pred)
    print("Model: %s" % model_name)
    print("R-squared: %f" % holdout_r2)
    print("\n")

In [None]:
# Hyper-parameter search for MLPRegressor.
from sklearn.model_selection import GridSearchCV

# Three entries are actually searched (layer sizes, max_iter, alpha); the
# single-value entries pin the remaining settings for every candidate.
parameters = dict(
    hidden_layer_sizes=[(10, 10, 3), (10, 10, 5, 3), (10, 10, 10, 5, 3)],
    max_iter=[100, 200, 500],
    alpha=[1e-4, 1e-5, 1e-6],
    solver=['adam'],
    verbose=[10],
    tol=[1e-9],
    random_state=[42],
    learning_rate_init=[0.001],
)

# 5-fold cross-validation over the grid, using all available CPU cores.
base_estimator = MLPRegressor()
grid_search = GridSearchCV(
    estimator=base_estimator,
    param_grid=parameters,
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_train, y_train)

In [30]:
# Configuration of the model used for the final predictions.
final_settings = dict(
    hidden_layer_sizes=(10, 10, 3),
    batch_size=64,
    max_iter=1000,
    alpha=0.0001,
    solver='adam',
    verbose=10,          # log the training loss while fitting
    tol=1e-9,
    random_state=42,
    learning_rate_init=0.001,
)
final_model = MLPRegressor(**final_settings)


In [29]:
# Train on all labelled rows, then predict the test set.
# fit() returns the estimator itself, so predict() can be chained directly.
final_predictions = final_model.fit(train, y).predict(test)

Iteration 1, loss = 781.55775643
Iteration 2, loss = 78.40772235
Iteration 3, loss = 59.04660687
Iteration 4, loss = 48.34771579
Iteration 5, loss = 42.07876031
Iteration 6, loss = 38.07736835
Iteration 7, loss = 34.95038514
Iteration 8, loss = 32.61459311
Iteration 9, loss = 30.82848419
Iteration 10, loss = 29.24995556
Iteration 11, loss = 27.92305152
Iteration 12, loss = 26.92262093
Iteration 13, loss = 26.14238046
Iteration 14, loss = 25.55270525
Iteration 15, loss = 25.06299340
Iteration 16, loss = 24.70815809
Iteration 17, loss = 24.39564046
Iteration 18, loss = 24.18450050
Iteration 19, loss = 23.98783572
Iteration 20, loss = 23.84412882
Iteration 21, loss = 23.74461229
Iteration 22, loss = 23.65841592
Iteration 23, loss = 23.49954758
Iteration 24, loss = 23.36675074
Iteration 25, loss = 23.28665083
Iteration 26, loss = 23.21500162
Iteration 27, loss = 23.11078132
Iteration 28, loss = 23.03799981
Iteration 29, loss = 23.02842031
Iteration 30, loss = 22.94646766
Iteration 31, loss

In [27]:
# Save the predictions, one row per test property, as the submission file.
results = pd.DataFrame({'Property_ID': testPrpId, 'Habitability_score': final_predictions})

filename = "submission.csv"

# mode='w' truncates any existing file, so a separate exists-then-remove step is
# unnecessary (and would only add a window between the check and the delete).
results.to_csv(filename, index=False, header=True, mode='w')