Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement design doc changes #128

Merged
merged 55 commits into from
Jun 18, 2024
Merged
Show file tree
Hide file tree
Changes from 50 commits
Commits
Show all changes
55 commits
Select commit Hold shift + click to select a range
0451eed
Rename reason codes from issue 125
wagnerlmichael May 29, 2024
fa88d08
Add beginning logic for new flags
wagnerlmichael May 29, 2024
d828a8d
Initial 3 column outlier_reasons working
wagnerlmichael May 30, 2024
fcb5d73
Add functioning 3 column output
wagnerlmichael May 31, 2024
4fea1e6
Remove print statements
wagnerlmichael May 31, 2024
b16bb36
Fix ptax classifiers
wagnerlmichael May 31, 2024
677d440
Change comments
wagnerlmichael May 31, 2024
364da45
Fix raw and sqft condition
wagnerlmichael Jun 4, 2024
b2eae8b
Remove comments
wagnerlmichael Jun 4, 2024
fe85714
Edit docstring and comments
wagnerlmichael Jun 4, 2024
16f3594
Remove condos bool
wagnerlmichael Jun 5, 2024
270fd09
Fix conditions based on design doc spec
wagnerlmichael Jun 5, 2024
44e866d
Update glue/flagging_script_glue/flagging.py
wagnerlmichael Jun 5, 2024
9d69aae
Fix conditions
wagnerlmichael Jun 5, 2024
c1b463c
Merge branch 'implement-design-doc-changes' of https://github.com/cca…
wagnerlmichael Jun 5, 2024
2c26d2a
Start re-work
wagnerlmichael Jun 5, 2024
e94cdf5
Start re-work
wagnerlmichael Jun 5, 2024
8d55514
Establish sorting logic
wagnerlmichael Jun 6, 2024
88828ec
Edit comment
wagnerlmichael Jun 6, 2024
0109d54
Correctly classify outliers
wagnerlmichael Jun 6, 2024
deb1c19
Remove sv_is_outlier logic from mansueto script
wagnerlmichael Jun 6, 2024
67f7dea
Add comments and un-comment things
wagnerlmichael Jun 6, 2024
41f405b
Remove unneeded code
wagnerlmichael Jun 6, 2024
bd289e9
Edit comments
wagnerlmichael Jun 6, 2024
d4b3d61
Remove doc string component
wagnerlmichael Jun 6, 2024
7fd0949
Fix condos conditional
wagnerlmichael Jun 6, 2024
c836c62
Add Low price to values to check
wagnerlmichael Jun 7, 2024
e4b9fa6
Update docs
wagnerlmichael Jun 7, 2024
7b1597d
Change comment
wagnerlmichael Jun 7, 2024
3dbfbcf
Edit docstring
wagnerlmichael Jun 7, 2024
3bc8966
Remove docs
wagnerlmichael Jun 7, 2024
0595ea0
Remove comment
wagnerlmichael Jun 7, 2024
ca0d7c7
Update glue/sales_val_flagging.py
wagnerlmichael Jun 10, 2024
bd01a76
Remove na column creation
wagnerlmichael Jun 10, 2024
38b99c7
Try refactor of sqft separation
wagnerlmichael Jun 10, 2024
457fa8e
Improve ptax reference
wagnerlmichael Jun 10, 2024
525eed0
Replace string null with np nan
wagnerlmichael Jun 10, 2024
38f1d44
Re-factor classify_outliers
wagnerlmichael Jun 11, 2024
fc1ae01
Remove dict
wagnerlmichael Jun 11, 2024
272036a
Update glue/sales_val_flagging.py
wagnerlmichael Jun 11, 2024
1bb883c
Update glue/sales_val_flagging.py
wagnerlmichael Jun 11, 2024
243d4b6
Update glue/sales_val_flagging.py
wagnerlmichael Jun 11, 2024
e6a6faa
Fix idx ref
wagnerlmichael Jun 11, 2024
3166915
Merge branch 'main' into implement-design-doc-changes
wagnerlmichael Jun 12, 2024
22da575
Handle condos sqft indicator error and change group thresh handling
wagnerlmichael Jun 12, 2024
c7e87a3
Fix comma problem
wagnerlmichael Jun 12, 2024
a36aceb
Fix dtypes
wagnerlmichael Jun 13, 2024
feb56d7
Edit workflow to disentangle ptax and price
wagnerlmichael Jun 14, 2024
7efa1b4
Add documentation
wagnerlmichael Jun 14, 2024
f7c811b
Add documentation
wagnerlmichael Jun 14, 2024
f6490d1
Remove todo
wagnerlmichael Jun 14, 2024
0d8c983
Simplify func
wagnerlmichael Jun 17, 2024
e532d25
Restore yaml
wagnerlmichael Jun 17, 2024
9823a81
Restore yaml
wagnerlmichael Jun 17, 2024
9c8a354
Add docs
wagnerlmichael Jun 17, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
193 changes: 57 additions & 136 deletions glue/flagging_script_glue/flagging.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,12 +83,8 @@ def outlier_taxonomy(df: pd.DataFrame, permut: tuple, groups: tuple, condos: boo
"""

df = check_days(df, SHORT_TERM_OWNER_THRESHOLD)

df = pricing_info(df, permut, groups, condos=condos)

df = outlier_type(df, condos=condos)
df = outlier_flag(df)
df = special_flag(df)

return df

Expand Down Expand Up @@ -270,26 +266,6 @@ def pricing_info(
return df


def special_flag(df: pd.DataFrame) -> pd.DataFrame:
"""
Creates a column that labels each record with its special flag, if any.

The conditions are evaluated with np.select, so a record that satisfies
more than one condition receives the label of the first matching condition
in the order listed below (family sale, home flip sale, non-person sale).
Inputs:
df (pd.DataFrame): dataframe to add flags onto. Must contain the
'sv_name_match', 'sv_short_owner' and 'sv_transaction_type' columns.
Outputs:
df (pd.DataFrame): the same dataframe with an 'sv_special_flags' column
added. Records that match no condition are labeled "Not special".
"""
cond = [
# Any value other than "No match" counts as a family sale
(df["sv_name_match"] != "No match"),
(df["sv_short_owner"] == "Short-term owner"),
(df["sv_transaction_type"] == "legal_entity-legal_entity"),
]
# Labels are positional: labels[i] is assigned where cond[i] is the first match
labels = ["Family sale", "Home flip sale", "Non-person sale"]

df["sv_special_flags"] = np.select(cond, labels, default="Not special")

return df


jeancochrane marked this conversation as resolved.
Show resolved Hide resolved
def which_price(row: pd.Series, thresholds: dict, groups: tuple) -> str:
"""
Determines whether sale_price, price_per_sqft, or both are outliers,
Expand Down Expand Up @@ -598,11 +574,10 @@ def get_sale_counts(dups: pd.DataFrame) -> pd.DataFrame:
v_counts = (
dups.pin.value_counts()
.reset_index()
.rename(columns={"index": "pin", "pin": "sv_sale_dup_counts"})
.rename(columns={"count": "sv_sale_dup_counts"})
)

# Explicitly specify the merging columns
dups = pd.merge(dups, v_counts, on="pin")
dups = pd.merge(dups, v_counts)

return dups

Expand Down Expand Up @@ -767,131 +742,77 @@ def z_normalize_groupby(s: pd.Series):

def outlier_type(df: pd.DataFrame, condos: bool) -> pd.DataFrame:
"""
Runs np.select that creates an outlier taxonomy.
This function create indicator columns for each distinct outlier type between price
and characteristic outliers. These columns are prefixed with 'sv_ind_'.

Inputs:
df (pd.DataFrame): dataframe with necessary columns created from previous functions.
df (pd.DataFrame): Dataframe
Outputs:
df (pd.DataFrame): dataframe with 'sv_outlier_type' column.
df (pd.DataFrame): Dataframe with indicator columns for each flag type
"""
if condos == True:
conditions = [
(df["sv_short_owner"] == "Short-term owner")
& (df["sv_pricing"].str.contains("High")),
(df["sv_name_match"] != "No match")
& (df["sv_pricing"].str.contains("High")),
(
df[["sv_buyer_category", "sv_seller_category"]]
.eq("legal_entity")
.any(axis=1)
)
& (df["sv_pricing"].str.contains("High")),
(df["sv_anomaly"] == "Outlier") & (df["sv_pricing"].str.contains("High")),
(df["sv_pricing"].str.contains("High price swing")),
(df["sv_pricing"].str.contains("High")),
(df["sv_short_owner"] == "Short-term owner")
& (df["sv_pricing"].str.contains("Low")),
(df["sv_name_match"] != "No match")
& (df["sv_pricing"].str.contains("Low")),
(
df[["sv_buyer_category", "sv_seller_category"]]
.eq("legal_entity")
.any(axis=1)
)
& (df["sv_pricing"].str.contains("Low")),
(df["sv_anomaly"] == "Outlier") & (df["sv_pricing"].str.contains("Low")),
(df["sv_pricing"].str.contains("Low price swing")),
(df["sv_pricing"].str.contains("Low")),

char_conditions = [
df["sv_short_owner"] == "Short-term owner",
df["sv_name_match"] != "No match",
df[["sv_buyer_category", "sv_seller_category"]].eq("legal_entity").any(axis=1),
df["sv_anomaly"] == "Outlier",
df["sv_pricing"].str.contains("High price swing")
| df["sv_pricing"].str.contains("Low price swing"),
]

# Define labels for characteristic-based reasons
char_labels = [
"sv_ind_char_short_term_owner",
"sv_ind_char_family_sale",
"sv_ind_char_non_person_sale",
"sv_ind_char_statistical_anomaly",
"sv_ind_char_price_swing_homeflip",
]

if condos:
# Define conditions for price-based reasons
price_conditions = [
df["sv_pricing"].str.contains("High"),
df["sv_pricing"].str.contains("Low"),
]

labels = [
"Home flip sale (high)",
"Family sale (high)",
"Non-person sale (high)",
"Anomaly (high)",
"High price swing",
"High price (raw)",
"Home flip sale (low)",
"Family sale (low)",
"Non-person sale (low)",
"Anomaly (low)",
"Low price swing",
"Low price (raw)",
# Define labels for price-based reasons
price_labels = [
"sv_ind_price_high_price",
"sv_ind_price_low_price",
]

else:
conditions = [
(df["sv_short_owner"] == "Short-term owner")
& (df["sv_pricing"].str.contains("High")),
(df["sv_name_match"] != "No match")
& (df["sv_pricing"].str.contains("High")),
# Define conditions for price-based reasons
price_conditions = [
(
df[["sv_buyer_category", "sv_seller_category"]]
.eq("legal_entity")
.any(axis=1)
)
& (df["sv_pricing"].str.contains("High")),
(df["sv_anomaly"] == "Outlier") & (df["sv_pricing"].str.contains("High")),
(df["sv_pricing"].str.contains("High price swing")),
(df["sv_pricing"].str.contains("High"))
& (df["sv_which_price"] == "(raw & sqft)"),
(df["sv_pricing"].str.contains("High")) & (df["sv_which_price"] == "(raw)"),
(df["sv_pricing"].str.contains("High"))
& (df["sv_which_price"] == "(sqft)"),
(df["sv_short_owner"] == "Short-term owner")
& (df["sv_pricing"].str.contains("Low")),
(df["sv_name_match"] != "No match")
& (df["sv_pricing"].str.contains("Low")),
df["sv_pricing"].str.contains("High")
& (df["sv_which_price"].str.contains("raw"))
),
(
df[["sv_buyer_category", "sv_seller_category"]]
.eq("legal_entity")
.any(axis=1)
)
& (df["sv_pricing"].str.contains("Low")),
(df["sv_anomaly"] == "Outlier") & (df["sv_pricing"].str.contains("Low")),
(df["sv_pricing"].str.contains("Low price swing")),
df["sv_pricing"].str.contains("Low")
& (df["sv_which_price"].str.contains("raw"))
),
(df["sv_pricing"].str.contains("High"))
& (df["sv_which_price"].str.contains("sqft")),
(df["sv_pricing"].str.contains("Low"))
& (df["sv_which_price"] == "(raw & sqft)"),
(df["sv_pricing"].str.contains("Low")) & (df["sv_which_price"] == "(raw)"),
(df["sv_pricing"].str.contains("Low")) & (df["sv_which_price"] == "(sqft)"),
& (df["sv_which_price"].str.contains("sqft")),
]

labels = [
"Home flip sale (high)",
"Family sale (high)",
"Non-person sale (high)",
"Anomaly (high)",
"High price swing",
"High price (raw & sqft)",
"High price (raw)",
"High price (sqft)",
"Home flip sale (low)",
"Family sale (low)",
"Non-person sale (low)",
"Anomaly (low)",
"Low price swing",
"Low price (raw & sqft)",
"Low price (raw)",
"Low price (sqft)",
# Define labels for price-based reasons
price_labels = [
"sv_ind_price_high_price",
"sv_ind_price_low_price",
"sv_ind_price_high_price_sqft",
"sv_ind_price_low_price_sqft",
]

df["sv_outlier_type"] = np.select(conditions, labels, default="Not outlier")
combined_conditions = price_conditions + char_conditions
combined_labels = price_labels + char_labels

return df


def outlier_flag(df: pd.DataFrame) -> pd.DataFrame:
"""
Creates a flag that shows whether the record is an
outlier (a special flag) according to our outlier taxonomy.
Inputs:
df (pd.DataFrame): dataframe to create outlier flag
Outputs:
df (pd.DataFrame): dataframe with 'is_outlier' column
"""

df["sv_is_outlier"] = np.select(
[(df["sv_outlier_type"] == "Not outlier")], [0], default=1
)
# Create indicator columns for each flag type
for label, condition in zip(combined_labels, combined_conditions):
df[label] = condition.astype(int)

return df

Expand Down
Loading
Loading