In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
train = pd.read_csv('train.csv')
train.head()

Unnamed: 0,Year,Date,Locality,Address,Estimated Value,Sale Price,Property,Residential,num_rooms,carpet_area,property_tax_rate
0,2009,2009-01-02,Greenwich,40 ETTL LN UT 24,711270.0,975000.0,Condo,Condominium,2,760,1.025953
1,2009,2009-01-02,East Hampton,18 BAUER RD,119970.0,189900.0,Single Family,Detached House,3,921,1.025953
2,2009,2009-01-02,Ridgefield,48 HIGH VALLEY RD.,494530.0,825000.0,Single Family,Detached House,3,982,1.025953
3,2009,2009-01-02,Old Lyme,56 MERIDEN RD,197600.0,450000.0,Single Family,Detached House,3,976,1.025953
4,2009,2009-01-02,Naugatuck,13 CELENTANO DR,105440.0,200000.0,Single Family,Detached House,3,947,1.025953


In [3]:
train.shape

(553952, 11)

In [4]:
test = pd.read_csv('test.csv')
test.head()

Unnamed: 0,Year,Date,Locality,Address,Estimated Value,Sale Price,Property,Residential,num_rooms,carpet_area,property_tax_rate,Segment
0,2023,2023-01-01,Old Lyme,12 SWAN AVE,151400.0,0,Residential,Detached House,3,947.0,1.46,0
1,2023,2023-01-01,Ridgefield,59 LINCOLN LANE,686900.0,0,Residential,Detached House,3,1051.0,1.46,0
2,2023,2023-01-04,Cromwell,6 GROVE RD,152030.0,0,Residential,Detached House,3,925.0,1.46,0
3,2023,2023-01-04,New Haven,346 CONCORD ST,156130.0,0,Residential,Duplex,4,1210.0,1.46,0
4,2023,2023-01-04,Beacon Falls,14 LASKY ROAD,108970.0,0,Residential,Detached House,3,1089.0,1.46,0


In [5]:
test.shape

(43954, 12)

In [6]:
newtrain = train.copy()
newtest = test.copy()

In [7]:
segment = []
for i in range(len(newtrain)):
    segment.append(np.abs(newtrain['Sale Price'][i] - newtrain['Estimated Value'][i])/100)


In [8]:
q1, q2, q3 = np.percentile(segment, [25, 50, 75])
print(q1, q2, q3)

381.8 754.8 1301.0


In [9]:
con_seg = []
for ind in range(len(segment)):
    if segment[ind] >= 0 and segment[ind] < q1:
        con_seg.append(3)
    if segment[ind] >= q1 and segment[ind] < q2:
        con_seg.append(2)
    if segment[ind] >= q2 and segment[ind] < q3:
        con_seg.append(1)
    if segment[ind] >= q3:
        con_seg.append(0)

In [10]:
y_train = newtrain['Sale Price']
newtrain.drop(['Sale Price','Year','Date','Address'] ,axis = 1,inplace = True)

In [11]:
newtest.drop(['Sale Price', 'Segment', 'Year', 'Date' ,'Address'] , axis = 1,inplace = True)

In [12]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

In [13]:
x = ['Locality', 'Property', 'Residential', 'num_rooms']
for i in x:
    newtrain[i] = le.fit_transform(newtrain[i])
    newtest[i] = np.where(newtest[i].isin(le.classes_), le.transform(newtest[i]), -1)

In [14]:
newtrain['carpet_area'] = pd.cut(newtrain['carpet_area'], bins = 5, labels=[1, 2, 3, 4, 5])

In [15]:
newtest['carpet_area'] = pd.cut(newtest['carpet_area'], bins = 5, labels=[1, 2, 3, 4, 5])

In [16]:
newtrain['property_tax_rate'] = pd.cut(newtrain['property_tax_rate'], bins = 5, labels= [1, 2, 3, 4, 5])

In [17]:
newtest['property_tax_rate'] = pd.cut(newtest['property_tax_rate'], bins = 5, labels= [1, 2, 3, 4, 5])

In [18]:
newtrain.head()

Unnamed: 0,Locality,Estimated Value,Property,Residential,num_rooms,carpet_area,property_tax_rate
0,57,711270.0,0,0,0,1,1
1,41,119970.0,3,1,1,1,1
2,118,494530.0,3,1,1,1,1
3,105,197600.0,3,1,1,1,1
4,88,105440.0,3,1,1,1,1


In [19]:
newtest.head()

Unnamed: 0,Locality,Estimated Value,Property,Residential,num_rooms,carpet_area,property_tax_rate
0,105,151400.0,2,1,1,1,3
1,118,686900.0,2,1,1,1,3
2,33,152030.0,2,1,1,1,3
3,93,156130.0,2,2,2,2,3
4,6,108970.0,2,1,1,2,3


In [20]:
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(n_estimators=100, max_depth=5, random_state=42)
y_pred = rf.fit(newtrain, y_train).predict(newtest)

In [21]:
test_gain = []
for i in range(len(newtest)):
    test_gain.append((y_pred[i] - newtest['Estimated Value'][i])/100)
len(test_gain)


43954

In [22]:
seg = []
for ind in range(len(test_gain)):
    if test_gain[ind] >= 0 and test_gain[ind] < q1:
        seg.append(3)
    elif test_gain[ind]>=q1 and test_gain[ind]< q2:
        seg.append(2)
    elif test_gain[ind]>=q2 and test_gain[ind]< q3:
        seg.append(1)
    elif test_gain[ind]>=q3:
        seg.append(0)
    else:
        seg.append(0)

In [23]:
result = pd.DataFrame({'Segment': seg})
result.to_csv('Solution.csv', index = False)