In [None]:
#objective: create a model to predict when a lot of houses will be put on the market

In [91]:
import pandas as pd
from sklearn import datasets
from sklearn.feature_selection import chi2
from sklearn.feature_selection import SelectKBest
import string
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn import metrics
from sklearn import tree
import matplotlib.pyplot as plt

In [None]:
# Assessment columns judged worth keeping (notes only; nothing is assigned).
[
    'PARID',
    'PROPERTYHOUSENUM', 'PROPERTYFRACTION', 'PROPERTYADDRESS', 'PROPERTYUNIT',
    'MUNIDESC', 'OWNERDESC', 'CLASSDESC',
    'LOTAREA',
    'HOMESTEADFLAG', 'FARMSTEADFLAG', 'ABATEMENTFLAG',
    'SALEDATE', 'SALEPRICE', 'SALEDESC',
    'PREVSALEDATE', 'PREVSALEPRICE',
    'PREVSALEDATE2', 'PREVSALEPRICE2',
    'CHANGENOTICEADDRESS1', 'CHANGENOTICEADDRESS2',
    'CHANGENOTICEADDRESS3', 'CHANGENOTICEADDRESS4',
    'STYLEDESC', 'STORIES', 'YEARBLT', 'CDUDESC',
]
# Columns still undecided.
[
    'TAXDESC',
    'USEDESC',
    'RECORDDATE',
]

In [None]:
#use property unit as a binary flag?
#use classdesc as a filter
#take out all farmsteads
#find out what recorddate,deedbook,deedpage,countybuilding,countyland,localbuilding-fairmarkettotal,gradedesc are
#ask if specific properties of house are important

In [73]:
#read in data
# Both CSVs live in the same folder; keep the location in one place so the
# notebook can be pointed at another machine by editing a single line.
DATA_DIR=r'C:\Users\Tara\OneDrive - University of Pittsburgh\FALL 2022\ENGR 1171\Project Housing Data'

# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk.
# NOTE(review): the saved output of this cell shows two truncated warnings,
# presumably pandas' mixed-type DtypeWarning for each read - confirm they
# disappear with this option.
sales=pd.read_csv(rf'{DATA_DIR}\SalesData.csv',low_memory=False)
#convert sale date from string to date
sales['SALEDATE']=pd.to_datetime(sales['SALEDATE'])
assessment=pd.read_csv(rf'{DATA_DIR}\AssessmentData.csv',low_memory=False)

  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)


In [74]:
#add flag to assessment data of whether a house is in the sales dataset
# 1 when the parcel id appears in the sales data, 0 otherwise.
# Built directly from the boolean mask so the column is integer-typed
# (the previous ''-placeholder approach left it as object dtype) and the
# membership test is evaluated only once.
assessment['Sold']=assessment['PARID'].isin(sales['PARID']).astype(int)

In [75]:
# Latest sale date per parcel, stored as FINALSALEDATE, then left-joined onto
# the assessment rows (parcels with no sale get a missing FINALSALEDATE).
salesgrouped=(
    sales.groupby('PARID')['SALEDATE']
    .max()
    .rename('FINALSALEDATE')
    .reset_index()
)
df=assessment.merge(salesgrouped,how='left',on='PARID')

In [77]:
# Keep residential parcels only.
df=df[df['CLASSDESC']=='RESIDENTIAL']

# Remove redundant columns (e.g. codes that already have a matching description).
df=df.drop(columns=[
    'PROPERTYCITY','PROPERTYSTATE','PROPERTYZIP',
    'MUNICODE',
    'SCHOOLCODE','SCHOOLDESC',
    'NEIGHCODE','NEIGHDESC',
    'TAXCODE','TAXSUBCODE','TAXSUBCODE_DESC',
    'OWNERCODE','CLASS','USECODE','CLEANGREEN',
    'SALECODE','DEEDBOOK','DEEDPAGE',
    'EXTERIORFINISH','EXTFINISH_DESC',
    'ROOF','ROOFDESC',
    'BASEMENT','BASEMENTDESC',
    'CONDITION','CONDITIONDESC',
    'HEATINGCOOLING','HEATINGCOOLINGDESC',
    'FIREPLACES','BSMTGARAGE','FINISHEDLIVINGAREA',
    'CARDNUMBER','ALT_ID','TAXYEAR','ASOFDATE',
])

In [78]:
# Count the sales rows per parcel and attach the count as NumTimesSold;
# parcels that never appear in the sales data get 0.
df=(
    df.merge(
        sales.groupby('PARID').size().rename('NumTimesSold').reset_index(),
        how='left',
        on='PARID',
    )
    .fillna({'NumTimesSold':0})
)

In [80]:
#identify possible investor-owned properties
# A parcel is flagged (1) when the first change-notice address line differs
# from the property's house number + street address (spaces ignored) AND the
# homestead flag is missing; every other parcel gets 0.
# The flag is derived straight from the boolean mask so the column is
# integer-typed rather than the object dtype left by a '' placeholder.
owner_addr=df['CHANGENOTICEADDRESS1'].str.replace(" ", "")
prop_addr=(df['PROPERTYHOUSENUM'].astype(str)+df['PROPERTYADDRESS']).str.replace(" ", "")
# NOTE(review): a missing address on either side compares as "different", so
# such rows are flagged whenever the homestead flag is missing - confirm intended.
# NOTE(review): if PROPERTYHOUSENUM was read as float, astype(str) yields
# e.g. '123.0' and the addresses can never match - verify the column dtype.
df['PossibleInvestor']=((owner_addr!=prop_addr)&df['HOMESTEADFLAG'].isna()).astype(int)

In [81]:
#add multi-parcel sale flag
# 1 when the parcel has at least one sales row described as a multi-parcel
# sale, 0 otherwise. Computed from the membership mask in one step so the
# column is integer-typed (the '' placeholder approach left it as object),
# and the parcel ids are selected with a single .loc call instead of
# chained indexing.
multi_parcel_ids=sales.loc[sales['SALEDESC']=='MULTI-PARCEL SALE','PARID']
df['MultiParcel']=df['PARID'].isin(multi_parcel_ids).astype(int)

In [82]:
# Build an owner key by concatenating the four change-notice address fields
# (each cast to str, so missing values contribute the text 'nan').
df['Ownerinfo']=df['CHANGENOTICEADDRESS1'].astype(str).str.cat(
    [df[part].astype(str) for part in ('CHANGENOTICEADDRESS2','CHANGENOTICEADDRESS3','CHANGENOTICEADDRESS4')]
)

# One row per distinct owner key; the row position becomes the owner's id.
allowners=df[['Ownerinfo']].drop_duplicates().reset_index(drop=True)
allowners['OwnerId']=allowners.index

# Attach the id to every parcel.
df=df.merge(allowners,how='left',on='Ownerinfo')

In [86]:
# Sum the Sold flag per owner to get TotalOwnerSales and attach it to each parcel.
ownerssold=(
    df.groupby('OwnerId')['Sold']
    .sum()
    .rename('TotalOwnerSales')
    .reset_index()
)
df=df.merge(ownerssold,how='left',on='OwnerId')

# Keep only parcels whose owner has at least one sale in the dataset.
df=df[df['TotalOwnerSales']>0]

In [88]:
# Narrow the frame to the columns of interest for the analysis below.
df=df.loc[:,[
    'PARID','MUNIDESC','OWNERDESC','CLASSDESC',
    'SALEDATE','SALEPRICE','SALEDESC',
    'PREVSALEDATE','PREVSALEPRICE',
    'PREVSALEDATE2','PREVSALEPRICE2',
    'STYLEDESC','YEARBLT','GRADEDESC',
    'Sold','FINALSALEDATE','NumTimesSold',
    'TotalOwnerSales','OwnerId',
    'PossibleInvestor','MultiParcel',
]]

Unnamed: 0_level_0,FINALSALEDATE
OwnerId,Unnamed: 1_level_1
0,2021-11-10
2,2020-12-15
3,2016-04-13
4,2014-09-12
5,2017-04-26
...,...
404679,2019-10-22
404680,2018-06-13
404682,2022-06-17
404683,2012-04-05


In [111]:
#identify 4579 owners that may be of interest
# xx: distinct (owner, total sales) pairs taken from rows that are not
# corporation-owned, belong to an owner with more than two sales, and whose
# sale description is not LOVE&AFFECTION.
xx=(
    df.loc[
        (df['OWNERDESC']!='CORPORATION')
        &(df['TotalOwnerSales']>2)
        &(df['SALEDESC']!='LOVE&AFFECTION'),
        ['OwnerId','TotalOwnerSales'],
    ]
    .drop_duplicates()
    .sort_values('TotalOwnerSales')
)
# yy: most recent FINALSALEDATE across each owner's parcels.
yy=df.groupby('OwnerId')['FINALSALEDATE'].max().reset_index()
# Displayed table: candidate owners with their latest sale date, ordered by
# sale count and then by date.
xx.merge(yy,how='left',on='OwnerId').sort_values(['TotalOwnerSales','FINALSALEDATE'])

Unnamed: 0,OwnerId,TotalOwnerSales,FINALSALEDATE
2269,4396,3,2012-01-12
1984,32333,3,2012-01-27
1657,36926,3,2012-02-10
426,60351,3,2012-02-17
1819,38601,3,2012-02-27
...,...,...,...
4574,1957,231,2022-09-07
4575,69680,237,2022-08-16
4576,28659,374,2022-06-14
4577,3358,413,2022-09-01


In [106]:
# Spot check: list the distinct OWNERDESC values recorded for owner id 180
# (one owner id turns out to carry many different owner descriptions).
df.loc[df['OwnerId']==180,'OWNERDESC'].drop_duplicates()

254                 CORPORATION
1033                    REGULAR
3078               REGULAR-ETAL
5076     REGULAR-ETUX OR ET VIR
14566        REGULAR-UNFINISHED
42648      CORPORATION-RAILROAD
Name: OWNERDESC, dtype: object

In [None]:
#next steps: assign a code to indicate recency of sale - how to discount exponentially with time?
#use feature selection to get features, then test out assorted models

In [None]:
'''Investor-Owned Properties
It may be possible to identify investor-owned residential properties using several fields in the
assessment data. Given that not all eligible owners apply, relying solely on the lack of a
Homestead flag does not provide enough certainty when working to identify investors. To begin,
look at all residential properties using the “CLASSDESC” field to limit the search to privately
owned properties (please note that condominium units are classified as residential, but
apartment buildings with five or more units, or mixed-use residential parcels are categorized as
“commercial” in the “CLASSDESC” field). Then, for all properties without a Homestead flag
(“HOMESTEAD”<> “HOM”), compare the property address and owner’s address (using the four
“CHANGENOTICE” fields as a proxy for owner’s address). If the addresses do not match and
there is no Homestead exemption, this may provide an indication that the residential properties
may be investor-owned. This data can be used to get a sense of neighborhood housing market
dynamics and to develop informed investor housing strategies.
For a guide to developing an investor housing strategy, see:
http://www.policylink.org/find-resources/library/when-investors-buy-up-the-neighborhood
'''

In [None]:
'''Track Common Ownership Across Multiple Properties
Owners of multiple properties can sometimes be linked by a common owner’s address (using
the four “CHANGENOTICE” fields as a proxy for owner’s address) across their various holdings,
even if the owner’s names appear differently in the assessment data for some of the properties
in their portfolio. This data has been useful in proactive code enforcement, as data was recently
used to identify the potential holdings of an investor that was cited for not addressing serious
condition issues on a rental property in Carrick. In the assessment data, properties held at least
in part by this owner were listed under several different corporate names but had a common
owner’s address in the assessment record. The owner of the problem property mentioned in the
news article was listed as an officer with one of these companies according to incorporations
data from the Pennsylvania Department of State.
Please use caution with this technique, as some properties may list the ”CHANGENOTICE”
address as either the property manager or the mortgage company, not the property owner. It is
best used on properties not managed by a third party, and owned free and clear'''

#https://data.wprdc.org/dataset/2b3df818-601e-4f06-b150-643557229491/resource/cc4bafd2-25b6-41d7-83aa-d16bc211b020/download/alleghenycountypropertyassessmentdatauserguide-4.pdf