In [1]:
# Import Libraries
import os

import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import r2_score
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

from tpot import TPOTRegressor



In [19]:
# read in data
# Both CSVs are loaded relative to the notebook's working directory.
train, test = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

In [20]:
# drop columns that are not needed
# Property_ID is removed from both frames before modelling; the test IDs are
# kept aside in `testPrpId` because the submission file needs them.
# `y` is deliberately NOT bound here: it must only ever hold the training
# target, which is assigned in the next cell.
train = train.drop(columns=['Property_ID'])
testPrpId = test['Property_ID']
test = test.drop(columns=['Property_ID'])

In [21]:
# split into X and y
# pop() returns the target column and removes it from `train` in one step,
# leaving `train` with the feature columns only.
y = train.pop('Habitability_score')

In [22]:
# Converting string columns to numeric
categorical_features = ['Property_Type', 'Furnishing', 'Frequency_of_Powercuts', 'Power_Backup', 'Water_Supply', 'Crime_Rate', 'Dust_and_Noise']
labelencoder = LabelEncoder()
for feature in categorical_features:
    # astype(str) turns a missing value into the literal label 'nan', so a
    # missing entry is encoded as its own category and no NaN is left to
    # impute afterwards.
    train_labels = train[feature].astype(str)
    test_labels = test[feature].astype(str)

    # The label -> integer code mapping is learned from the training labels only.
    labelencoder.fit(train_labels)
    train[feature] = labelencoder.transform(train_labels)

    # LabelEncoder.transform raises ValueError for a label it did not see in
    # fit(). Map the test labels through the learned codes instead and send
    # unseen labels to -1; the one-hot encoder built later in this notebook
    # uses handle_unknown='ignore', so such a value does not abort the run.
    code_by_label = {label: code for code, label in enumerate(labelencoder.classes_)}
    test[feature] = test_labels.map(code_by_label).fillna(-1).astype('int64')

# impute missing values
numerical_features = ['Property_Area', 'Number_of_Windows', 'Number_of_Doors','Traffic_Density_Score','Air_Quality_Index','Neighborhood_Review']
for feature in numerical_features:
    # Use the training mean for both frames so the test set is imputed with
    # the same statistic the model was trained on.
    train_mean = train[feature].mean()
    # Assign the result back: Series.fillna(inplace=True) on a column
    # selection is chained assignment and does not reliably update the frame.
    train[feature] = train[feature].fillna(train_mean)
    test[feature] = test[feature].fillna(train_mean)

In [23]:
# Preview the first five rows of the encoded training frame.
train.head(n=5)

Unnamed: 0,Property_Type,Property_Area,Number_of_Windows,Number_of_Doors,Furnishing,Frequency_of_Powercuts,Power_Backup,Water_Supply,Traffic_Density_Score,Crime_Rate,Dust_and_Noise,Air_Quality_Index,Neighborhood_Review
0,1,106,3.923768,1,1,0,1,3,5.89,1,2,90.0,3.86
1,1,733,2.0,2,2,1,1,2,4.37,3,2,96.0,3.55
2,1,737,4.0,2,0,0,1,3,7.45,1,2,121.0,3.81
3,1,900,3.0,2,2,2,2,3,6.16,2,2,100.0,1.34
4,2,2238,14.0,6,0,0,1,0,5.46,3,2,116.0,4.77


In [24]:
# Unique values in each column
# Counted column by column; a quick check of categorical cardinality.
train.nunique(axis=0)

Property_Type                6
Property_Area             4435
Number_of_Windows           17
Number_of_Doors              6
Furnishing                   4
Frequency_of_Powercuts       5
Power_Backup                 3
Water_Supply                 5
Traffic_Density_Score      772
Crime_Rate                   5
Dust_and_Noise               4
Air_Quality_Index          489
Neighborhood_Review        417
dtype: int64

In [25]:
# pipeline for categorical variables
# Constant-value imputation followed by one-hot encoding; categories that
# were not present when the encoder was fitted are ignored at transform time.
categorical_features = [
    'Property_Type',
    'Furnishing',
    'Frequency_of_Powercuts',
    'Power_Backup',
    'Water_Supply',
    'Crime_Rate',
    'Dust_and_Noise',
]
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# pipeline for numerical variables
# Mean imputation only; no scaling happens inside this pipeline.
numerical_features = [
    'Property_Area',
    'Number_of_Windows',
    'Number_of_Doors',
    'Traffic_Density_Score',
    'Air_Quality_Index',
    'Neighborhood_Review',
]
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
])

In [11]:
# transform full pipeline
# Numeric columns go through the mean-imputation pipeline and categorical
# columns through imputation + one-hot encoding; the outputs are stacked
# side by side in that order.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ],
    # Always return a dense array. With the default threshold the stacked
    # output becomes a sparse matrix once the one-hot block makes it sparse
    # enough, and a StandardScaler with default centring rejects sparse input.
    sparse_threshold=0,
)

# Fit on the training frame only, then apply the same fitted transform to
# both frames.
# NOTE: `train` and `test` are rebound to the transformed arrays here, so this
# cell cannot be re-run without first re-running the cells that build the
# DataFrames.
preprocessor.fit(train)
train = preprocessor.transform(train)
test = preprocessor.transform(test)

In [12]:
# Feature Scaling
# StandardScaler is imported in the notebook's import cell.
# The scaler is fitted on the training matrix only; the test matrix is
# transformed with those same fitted statistics.
sc = StandardScaler()
train = sc.fit_transform(train)
test = sc.transform(test)


In [26]:
# split into train and test sets
# 20% of the labelled rows are held out for evaluation; the fixed seed keeps
# the split the same across runs.
X_train, X_test, y_train, y_test = train_test_split(
    train,
    y,
    test_size=0.2,
    random_state=42,
)

In [14]:
# Define models
# Settings for the single multi-layer perceptron candidate, collected in one
# place so they are easy to read and tweak.
mlp_settings = dict(
    hidden_layer_sizes=(10, 10, 5, 3),
    batch_size=16,
    max_iter=1000,
    alpha=0.0001,
    solver='adam',
    verbose=10,
    tol=0.000000001,
    random_state=42,
    learning_rate_init=0.001,
)
models = [MLPRegressor(**mlp_settings)]

In [None]:
# Evaluate each model in turn
# Each candidate is fitted on the training split and scored (R-squared) on
# the held-out split.
for model in models:
    y_pred = model.fit(X_train, y_train).predict(X_test)
    model_name = model.__class__.__name__
    holdout_r2 = r2_score(y_test, y_pred)
    print("Model: %s" % model_name)
    print("R-squared: %f" % holdout_r2)
    print("\n")

In [None]:
# define evaluation procedure
# 5-fold cross-validation repeated 3 times with a fixed seed.
cv = RepeatedKFold(n_splits=5, n_repeats=3, random_state=42)

# define TPOT Regressor model
# A single, explicitly configured search: a second default-constructed
# TPOTRegressor() would discard the cv / scoring settings defined here.
tpot = TPOTRegressor(
    generations=5,
    population_size=20,
    verbosity=2,
    cv=cv,
    scoring='r2',
    n_jobs=-1,
    random_state=42,  # seed the search so re-runs are reproducible
)
# The search is run on the full preprocessed training matrix.
tpot.fit(train, y)

In [None]:
# Predict on the preprocessed test matrix with the fitted TPOT model.
# NOTE(review): `result` is not referenced by any later cell visible here; the
# submission is built from `final_predictions` instead. Confirm this is intended.
result = tpot.predict(test)

In [None]:
# make final predictions
# models[0] is the same estimator object that was evaluated on the hold-out
# split; fit() is called on it again here with the full training matrix
# before predicting on the test matrix.
final_model = models[0]
final_model.fit(train, y)
final_predictions = final_model.predict(test)

In [None]:
# save results to file
filename = "submission.csv"

# One row per test property: its ID next to the predicted score.
results = pd.DataFrame({
    'Property_ID': testPrpId,
    'Habitability_score': final_predictions,
})

# Remove any file left over from an earlier run before writing the new one.
if os.path.exists(filename):
    os.remove(filename)
results.to_csv(filename, mode='w', header=True, index=False)