In [33]:
import os
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

from sklearn.linear_model import BayesianRidge
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import LassoLars
from sklearn.linear_model import PassiveAggressiveRegressor
from sklearn.linear_model import RANSACRegressor
from sklearn.linear_model import TheilSenRegressor
from sklearn.linear_model import HuberRegressor

from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [34]:
# read in data
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [35]:
# drop columns that are not needed
train.drop(['Property_ID'], axis=1, inplace=True)
testPrpId = y = test['Property_ID']
test.drop(['Property_ID'], axis=1, inplace=True)

In [36]:
# split into X and y
y = train['Habitability_score']
train.drop(['Habitability_score'], axis=1, inplace=True)

In [37]:
# train.Neighborhood_Review = train.Neighborhood_Review.round()
# test.Neighborhood_Review = test.Neighborhood_Review.round()

In [38]:
# Converting string columns to numeric
categorical_features = ['Property_Type', 'Furnishing', 'Frequency_of_Powercuts', 'Power_Backup', 'Water_Supply', 'Crime_Rate', 'Dust_and_Noise']
labelencoder = LabelEncoder()
for feature in categorical_features:
    train[feature] = labelencoder.fit_transform(train[feature])
    train[feature].fillna(train[feature].mean(), inplace = True)
    test[feature] = labelencoder.fit_transform(test[feature])
    test[feature].fillna(test[feature].mean(), inplace = True)

# impute missing values
numerical_features = ['Property_Area', 'Number_of_Windows', 'Number_of_Doors','Traffic_Density_Score','Air_Quality_Index','Neighborhood_Review']
for feature in numerical_features:
    train[feature].fillna(train[feature].mean(), inplace = True)
    test[feature].fillna(test[feature].mean(), inplace = True)

In [39]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39499 entries, 0 to 39498
Data columns (total 13 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Property_Type           39499 non-null  int32  
 1   Property_Area           39499 non-null  int64  
 2   Number_of_Windows       39499 non-null  float64
 3   Number_of_Doors         39499 non-null  int64  
 4   Furnishing              39499 non-null  int32  
 5   Frequency_of_Powercuts  39499 non-null  int64  
 6   Power_Backup            39499 non-null  int32  
 7   Water_Supply            39499 non-null  int32  
 8   Traffic_Density_Score   39499 non-null  float64
 9   Crime_Rate              39499 non-null  int32  
 10  Dust_and_Noise          39499 non-null  int32  
 11  Air_Quality_Index       39499 non-null  float64
 12  Neighborhood_Review     39499 non-null  float64
dtypes: float64(4), int32(6), int64(3)
memory usage: 3.0 MB


In [42]:
# create pipeline for categorical variables
categorical_features = ['Property_Type', 'Furnishing', 'Frequency_of_Powercuts', 'Power_Backup', 'Water_Supply', 'Crime_Rate', 'Dust_and_Noise']
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

In [43]:
# create pipeline for numerical variables
numerical_features = ['Property_Area', 'Number_of_Windows', 'Number_of_Doors','Traffic_Density_Score','Air_Quality_Index','Neighborhood_Review']
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean'))
])

In [44]:
# create full pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])
# Use the full pipeline to transform both training and test sets
preprocessor.fit(train)
train = preprocessor.transform(train)
test = preprocessor.transform(test)


In [45]:
# split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(train, y, test_size=0.2, random_state=42)

In [46]:
# Define the list models to evaluate
models = [
    RandomForestRegressor(),
    XGBRegressor()]

In [47]:
# Evaluate each model in turn
for model in models:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print("Model: %s" % model.__class__.__name__)
    print("R-squared: %f" % r2_score(y_test, y_pred))
    print("\n")

Model: RandomForestRegressor
R-squared: 0.821791


Model: XGBRegressor
R-squared: 0.807864




In [48]:
# Final model
final_model = models[0]

In [49]:
# make final predictions
final_model.fit(train, y)
final_predictions = final_model.predict(test)

In [51]:
# save results to file
results = pd.DataFrame({'Property_ID': testPrpId, 'Habitability_score': final_predictions})

filename = "submission.csv"

if os.path.exists(filename):
  os.remove(filename)
results.to_csv(filename, index=False,header=True, mode='w')