In [20]:
# Import dependencies
import pandas as pd
from pathlib import Path
import hvplot.pandas

In [21]:
# Read in the CSV file as a Pandas DataFrame
raot_cta_ner = pd.read_csv(
    Path("../Resources/data.csv")
)

# Parse WEEK_START, keep only the calendar date (the column then holds
# datetime.date objects), and order the rows chronologically.
raot_cta_ner['WEEK_START'] = pd.to_datetime(raot_cta_ner['WEEK_START']).dt.date
raot_cta_ner = raot_cta_ner.sort_values(by='WEEK_START')

# select rows where date is between given start/end date (inclusive)
start_date = pd.to_datetime('10/10/2022')
end_date = pd.to_datetime('10/16/2022')

# WEEK_START holds datetime.date values, so compare against date bounds rather
# than against the Timestamps themselves: pandas >= 2.0 raises TypeError on
# ordering comparisons between a Timestamp and a datetime.date (pandas 1.x
# only emitted a FutureWarning).
in_date_range = (
    (raot_cta_ner['WEEK_START'] >= start_date.date())
    & (raot_cta_ner['WEEK_START'] <= end_date.date())
)
raot_cta_ner = raot_cta_ner.loc[in_date_range]


# Review the DataFrame
raot_cta_ner.head()

Unnamed: 0,WEEK_START,ID,NAME,MSA,YEAR_BUILT,NER_T4,CONSTRUCTION_TYPE
95115,2022-10-10,5e7e7f86573c70002fcd6196,Christopher Todd Communities At Country Place,"Phoenix, AZ",2018,1711.277055,BTR/SFR
21331,2022-10-10,56ca8f47b8fc2f0e00b8016c,The Palisades at Paradise Valley Mall,"Phoenix, AZ",1990,1776.89832,Garden
83962,2022-10-10,5a04dd9ddd8418001d408527,Peak 16,"Phoenix, AZ",2017,1867.079398,Mid-rise
38688,2022-10-10,5778281acc8a0d0e00a53c9c,San Portella Apartments,"Phoenix, AZ",2008,2011.178571,Garden
44510,2022-10-10,58dd528686f8a8000b02c124,Park Tower Apartments,"Phoenix, AZ",1986,1534.266666,Garden


In [22]:
# Keep a single row per property ID (the last occurrence in the current row
# order), then discard any rows that still contain missing values.
raot_cta_ner = (
    raot_cta_ner
    .drop_duplicates(subset=['ID'], keep='last')
    .dropna()
)

# Optional filter, currently disabled: keep only 'Garden' construction types
#raot_cta_ner = raot_cta_ner[raot_cta_ner['CONSTRUCTION_TYPE'] == 'Garden']

# Report how many distinct property IDs remain
unique_property_count = raot_cta_ner['ID'].nunique()
print(unique_property_count)

602


In [23]:
# Optional filter, currently disabled: keep only 'Garden' construction types
#raot_cta_ner = raot_cta_ner[raot_cta_ner['CONSTRUCTION_TYPE'] == 'Garden']

# Count how many rows fall into each construction type
construction_type_counts = raot_cta_ner['CONSTRUCTION_TYPE'].value_counts()
construction_type_counts


Garden       490
Mid-rise      89
BTR/SFR        7
Platform       6
High-rise      6
Wrap           4
Name: CONSTRUCTION_TYPE, dtype: int64

In [24]:
# One-hot encode CONSTRUCTION_TYPE. drop_first=True omits the first category's
# indicator column, so a row with all indicators at zero belongs to that
# omitted category.
construction_encoded_df = pd.get_dummies(
    raot_cta_ner,
    columns=['CONSTRUCTION_TYPE'],
    drop_first=True,
)

# Replace the index inherited from the CSV rows with a fresh 0..n-1 index
raot_cta_ner = construction_encoded_df.reset_index(drop=True)
raot_cta_ner.head()

Unnamed: 0,WEEK_START,ID,NAME,MSA,YEAR_BUILT,NER_T4,CONSTRUCTION_TYPE_Garden,CONSTRUCTION_TYPE_High-rise,CONSTRUCTION_TYPE_Mid-rise,CONSTRUCTION_TYPE_Platform,CONSTRUCTION_TYPE_Wrap
0,2022-10-10,5e7e7f86573c70002fcd6196,Christopher Todd Communities At Country Place,"Phoenix, AZ",2018,1711.277055,0,0,0,0,0
1,2022-10-10,56ca8f47b8fc2f0e00b8016c,The Palisades at Paradise Valley Mall,"Phoenix, AZ",1990,1776.89832,1,0,0,0,0
2,2022-10-10,5a04dd9ddd8418001d408527,Peak 16,"Phoenix, AZ",2017,1867.079398,0,0,1,0,0
3,2022-10-10,5778281acc8a0d0e00a53c9c,San Portella Apartments,"Phoenix, AZ",2008,2011.178571,1,0,0,0,0
4,2022-10-10,58dd528686f8a8000b02c124,Park Tower Apartments,"Phoenix, AZ",1986,1534.266666,1,0,0,0,0


In [25]:
# Correlation between the NER_T4 and YEAR_BUILT columns
# (Series.corr uses the Pearson method by default)
ner_t4 = raot_cta_ner["NER_T4"]
year_built = raot_cta_ner["YEAR_BUILT"]
ner_t4.corr(year_built)

0.6319003110862028

In [26]:
# Scatter plot of NER_T4 (y axis) against YEAR_BUILT (x axis)
year_vs_ner_scatter = raot_cta_ner.hvplot.scatter(
    x="YEAR_BUILT",
    y="NER_T4",
)
year_vs_ner_scatter

## Run the k-means model with 3 clusters

In [27]:
# Start by importing the K-means algorithm
from sklearn.cluster import KMeans

In [28]:
# Create and initialize the K-means model instance for 3 clusters.
# random_state makes the centroid initialisation repeatable.
# n_init is set explicitly because its default changed from 10 to 'auto' in
# scikit-learn 1.4; pinning it keeps the clustering (and therefore the
# cluster-label numbering that later cells rely on) stable across versions.
model = KMeans(n_clusters=3, n_init=10, random_state=1)

# Display the model
model

In [29]:
# List the column labels currently present in the DataFrame
raot_cta_ner.columns

Index(['WEEK_START', 'ID', 'NAME', 'MSA', 'YEAR_BUILT', 'NER_T4',
       'CONSTRUCTION_TYPE_Garden', 'CONSTRUCTION_TYPE_High-rise',
       'CONSTRUCTION_TYPE_Mid-rise', 'CONSTRUCTION_TYPE_Platform',
       'CONSTRUCTION_TYPE_Wrap'],
      dtype='object')

In [30]:
# Select the feature columns that will be clustered.
# Disabled variant that also included the construction-type indicator columns:
#year_and_ner = raot_cta_ner[["YEAR_BUILT", "NER_T4", 'CONSTRUCTION_TYPE_Garden',
#                        'CONSTRUCTION_TYPE_High-rise', 'CONSTRUCTION_TYPE_Mid-rise',
#                        'CONSTRUCTION_TYPE_Platform', 'CONSTRUCTION_TYPE_Wrap']]

feature_columns = ["YEAR_BUILT", "NER_T4"]
year_and_ner = raot_cta_ner[feature_columns]

In [32]:
from sklearn.preprocessing import StandardScaler

# Standardize the selected features: fit the scaler on them and transform them
# in a single step.
scaler = StandardScaler()
year_and_ner_scaled = scaler.fit_transform(year_and_ner)

# fit() returns the scaler itself, so the fitted scaler stays available under
# the X_scaler name as well.
X_scaler = scaler

In [33]:
# Inspect the standardized feature array produced by the scaler
year_and_ner_scaled

array([[ 1.30150834,  0.12492686],
       [-0.4890748 ,  0.34489485],
       [ 1.23755895,  0.64718938],
       ...,
       [ 1.42940714,  0.8532466 ],
       [-1.00066998, -1.388266  ],
       [-1.12856878, -1.31483134]])

In [34]:
# Fit the K-means model to the standardized features
model.fit(year_and_ner_scaled)



In [39]:
# Assign every row to a cluster using the trained model
property_ratings = model.predict(year_and_ner_scaled)

# Define the dict that remaps each numeric cluster label to a letter rating.
# NOTE(review): this mapping is hard-coded against the label numbering of one
# particular fit; if the clustering changes, the letters must be re-checked.
map_dict = {0 : 'A', 1 : 'C', 2: 'B'}

# Work on a copy so the feature DataFrame itself is left untouched
year_and_ner_predictions_df = year_and_ner.copy()

# Add a column holding the letter rating for each property
# (a label missing from map_dict is kept as-is)
year_and_ner_predictions_df['property ratings'] = [
    map_dict.get(cluster_label, cluster_label)
    for cluster_label in property_ratings
]

# Attach the property name and ID by index, then review the DataFrame
property_data = raot_cta_ner[['NAME', 'ID']]
result = pd.merge(
    property_data,
    year_and_ner_predictions_df,
    how='outer',
    left_index=True,
    right_index=True,
)
result.head()

Unnamed: 0,NAME,ID,YEAR_BUILT,NER_T4,property ratings
0,Christopher Todd Communities At Country Place,5e7e7f86573c70002fcd6196,2018,1711.277055,A
1,The Palisades at Paradise Valley Mall,56ca8f47b8fc2f0e00b8016c,1990,1776.89832,B
2,Peak 16,5a04dd9ddd8418001d408527,2017,1867.079398,A
3,San Portella Apartments,5778281acc8a0d0e00a53c9c,2008,2011.178571,A
4,Park Tower Apartments,58dd528686f8a8000b02c124,1986,1534.266666,C


In [40]:
# Plot the data points based on the customer rating
# Scatter of NER_T4 against YEAR_BUILT, grouped by the property rating
ratings_scatter = result.hvplot.scatter(x="YEAR_BUILT", y="NER_T4", by="property ratings")
ratings_scatter

In [41]:
# Count how many properties received each rating
rating_counts = year_and_ner_predictions_df['property ratings'].value_counts()
rating_counts

# TODO: add average square feet

B    221
C    207
A    174
Name: property ratings, dtype: int64