In [32]:
# Import Libraries
import os
import pandas as pd

from sklearn.model_selection  import RepeatedKFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor

from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [33]:
# Load the training and test tables from CSV files in the current
# working directory (training file first, then the test file).
train, test = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

In [34]:
# Drop the identifier column from both frames so it is not used as a feature.
# The test-set IDs are saved first in `testPrpId`; they are needed again when
# the results file is written. (The target name `y` is deliberately not bound
# here: it is assigned from the training target in a later cell.)
testPrpId = test['Property_ID']
train = train.drop(columns=['Property_ID'])
test = test.drop(columns=['Property_ID'])

In [35]:
# Keep only the training rows in which every column has a value.
# Only `train` is filtered here; `test` is left untouched.
train = train.dropna(axis=0, how='any')

In [36]:
# Separate the target from the features: `pop` removes the column from
# `train` and returns it, so `train` holds only feature columns afterwards.
y = train.pop('Habitability_score')

In [None]:
# Converting string columns to numeric
categorical_features = ['Property_Type', 'Furnishing', 'Frequency_of_Powercuts', 'Power_Backup', 'Water_Supply', 'Crime_Rate', 'Dust_and_Noise']
labelencoder = LabelEncoder()
for feature in categorical_features:
    # astype(str) turns a missing value into the string 'nan', which then
    # receives an integer code of its own like any other label.
    train[feature] = train[feature].astype(str)
    test[feature] = test[feature].astype(str)

    # Fit on the labels of both frames so that a label occurring only in the
    # test set can still be transformed. pd.concat is used because
    # Series.append was removed in pandas 2.0. The single encoder is re-fitted
    # for every column, so after the loop it only holds the last mapping.
    labelencoder.fit(pd.concat([train[feature], test[feature]]))

    train[feature] = labelencoder.transform(train[feature])
    test[feature] = labelencoder.transform(test[feature])
    # No fillna is needed here: the encoded columns are integer codes and
    # cannot contain NaN.

# impute missing values
numerical_features = ['Property_Area', 'Number_of_Windows', 'Number_of_Doors','Traffic_Density_Score','Air_Quality_Index','Neighborhood_Review']
for feature in numerical_features:
    # Fill gaps in the test column with the mean of the training column.
    # Assigning the result back (rather than a chained inplace fillna)
    # guarantees that `test` itself is updated.
    test[feature] = test[feature].fillna(train[feature].mean())

In [41]:
# Column groups handled by the two preprocessing branches
categorical_features = [
    'Property_Type',
    'Furnishing',
    'Frequency_of_Powercuts',
    'Power_Backup',
    'Water_Supply',
    'Crime_Rate',
    'Dust_and_Noise',
]
numerical_features = [
    'Property_Area',
    'Number_of_Windows',
    'Number_of_Doors',
    'Traffic_Density_Score',
    'Air_Quality_Index',
    'Neighborhood_Review',
]

# Categorical branch: fill missing entries with a constant, then one-hot
# encode; categories not seen while fitting are ignored at transform time.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Numerical branch: fill missing entries with the column mean.
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
])

In [42]:
# transform full pipeline
# Each branch is applied to its own column group and the outputs are
# concatenated side by side.
# sparse_threshold=0 makes the combined output always a dense array: the
# one-hot branch produces sparse matrices, and a sparse result would be
# rejected by a mean-centring StandardScaler applied afterwards.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ],
    sparse_threshold=0,
)

# Fit on the training rows only, then apply the same fitted transformation
# to the test rows. Both names are rebound from DataFrames to arrays here.
train = preprocessor.fit_transform(train)
test = preprocessor.transform(test)

In [43]:
# Standardise the feature matrix with StandardScaler
from sklearn.preprocessing import StandardScaler

# The scaling statistics are learned from the training matrix only and
# then applied unchanged to both matrices.
scaler = StandardScaler().fit(train)
train, test = scaler.transform(train), scaler.transform(test)

In [44]:
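# Optional hold-out split for local validation. It is disabled, so the model
# below is fit on all training rows. Uncomment the next line to enable it:
# X_train, X_test, y_train, y_test = train_test_split(train, y, test_size=0.2, random_state=42)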

In [59]:
# Stacking Regressor models
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import RidgeCV

# Base learners whose predictions are combined by the final estimator.
# The tree ensembles are randomised, so each gets a fixed random_state to
# make repeated runs of the notebook produce the same model. KNN has no
# random component and therefore takes no seed.
estimators = [
    ('rf', RandomForestRegressor(random_state=42)),
    ('et', ExtraTreesRegressor(random_state=42)),
    ('knn', KNeighborsRegressor()),
    ('gbrt', GradientBoostingRegressor(random_state=42))
]

# RidgeCV is the final estimator that blends the base learners' predictions.
model = StackingRegressor(estimators=estimators, final_estimator=RidgeCV())
# Fit on the full preprocessed training matrix and the target `y`.
model.fit(train, y)

In [104]:
# Predict the target for every row of the preprocessed test matrix using
# the fitted stacking model.
final_predictions = model.predict(test)

In [105]:
# save results to file
# Pair each saved test Property_ID with the prediction for the same row.
results = pd.DataFrame({'Property_ID': testPrpId, 'Habitability_score': final_predictions})

filename = "submission.csv"

# mode='w' truncates any existing file, so deleting it beforehand is
# unnecessary. The index is omitted; the file holds only the two columns
# above, with a header row.
results.to_csv(filename, index=False, header=True, mode='w')