Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement design doc changes #128

Merged
merged 55 commits into from
Jun 18, 2024
Merged
Show file tree
Hide file tree
Changes from 11 commits
Commits
Show all changes
55 commits
Select commit Hold shift + click to select a range
0451eed
Rename reason codes from issue 125
wagnerlmichael May 29, 2024
fa88d08
Add beginning logic for new flags
wagnerlmichael May 29, 2024
d828a8d
Initial 3 column outlier_reasons working
wagnerlmichael May 30, 2024
fcb5d73
Add functioning 3 column output
wagnerlmichael May 31, 2024
4fea1e6
Remove print statements
wagnerlmichael May 31, 2024
b16bb36
Fix ptax classifiers
wagnerlmichael May 31, 2024
677d440
Change comments
wagnerlmichael May 31, 2024
364da45
Fix raw and sqft condition
wagnerlmichael Jun 4, 2024
b2eae8b
Remove comments
wagnerlmichael Jun 4, 2024
fe85714
Edit docstring and comments
wagnerlmichael Jun 4, 2024
16f3594
Remove condos bool
wagnerlmichael Jun 5, 2024
270fd09
Fix conditions based on design doc spec
wagnerlmichael Jun 5, 2024
44e866d
Update glue/flagging_script_glue/flagging.py
wagnerlmichael Jun 5, 2024
9d69aae
Fix conditions
wagnerlmichael Jun 5, 2024
c1b463c
Merge branch 'implement-design-doc-changes' of https://github.com/cca…
wagnerlmichael Jun 5, 2024
2c26d2a
Start re-work
wagnerlmichael Jun 5, 2024
e94cdf5
Start re-work
wagnerlmichael Jun 5, 2024
8d55514
Establish sorting logic
wagnerlmichael Jun 6, 2024
88828ec
Edit comment
wagnerlmichael Jun 6, 2024
0109d54
Correctly classify outliers
wagnerlmichael Jun 6, 2024
deb1c19
Remove sv_is_outlier logic from mansueto script
wagnerlmichael Jun 6, 2024
67f7dea
Add comments and un-comment things
wagnerlmichael Jun 6, 2024
41f405b
Remove unneeded code
wagnerlmichael Jun 6, 2024
bd289e9
Edit comments
wagnerlmichael Jun 6, 2024
d4b3d61
Remove doc string component
wagnerlmichael Jun 6, 2024
7fd0949
Fix condos conditional
wagnerlmichael Jun 6, 2024
c836c62
Add Low price to values to check
wagnerlmichael Jun 7, 2024
e4b9fa6
Update docs
wagnerlmichael Jun 7, 2024
7b1597d
Change comment
wagnerlmichael Jun 7, 2024
3dbfbcf
Edit docstring
wagnerlmichael Jun 7, 2024
3bc8966
Remove docs
wagnerlmichael Jun 7, 2024
0595ea0
Remove comment
wagnerlmichael Jun 7, 2024
ca0d7c7
Update glue/sales_val_flagging.py
wagnerlmichael Jun 10, 2024
bd01a76
Remove na column creation
wagnerlmichael Jun 10, 2024
38b99c7
Try refactor of sqft separation
wagnerlmichael Jun 10, 2024
457fa8e
Improve ptax reference
wagnerlmichael Jun 10, 2024
525eed0
Replace string null with np nan
wagnerlmichael Jun 10, 2024
38f1d44
Re-factor classify_outliers
wagnerlmichael Jun 11, 2024
fc1ae01
Remove dict
wagnerlmichael Jun 11, 2024
272036a
Update glue/sales_val_flagging.py
wagnerlmichael Jun 11, 2024
1bb883c
Update glue/sales_val_flagging.py
wagnerlmichael Jun 11, 2024
243d4b6
Update glue/sales_val_flagging.py
wagnerlmichael Jun 11, 2024
e6a6faa
Fix idx ref
wagnerlmichael Jun 11, 2024
3166915
Merge branch 'main' into implement-design-doc-changes
wagnerlmichael Jun 12, 2024
22da575
Handle condos sqft indicator error and change group thresh handling
wagnerlmichael Jun 12, 2024
c7e87a3
Fix comma problem
wagnerlmichael Jun 12, 2024
a36aceb
Fix dtypes
wagnerlmichael Jun 13, 2024
feb56d7
Edit workflow to disentangle ptax and price
wagnerlmichael Jun 14, 2024
7efa1b4
Add documentation
wagnerlmichael Jun 14, 2024
f7c811b
Add documentation
wagnerlmichael Jun 14, 2024
f6490d1
Remove todo
wagnerlmichael Jun 14, 2024
0d8c983
Simplify func
wagnerlmichael Jun 17, 2024
e532d25
Restore yaml
wagnerlmichael Jun 17, 2024
9823a81
Restore yaml
wagnerlmichael Jun 17, 2024
9c8a354
Add docs
wagnerlmichael Jun 17, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
229 changes: 140 additions & 89 deletions glue/flagging_script_glue/flagging.py
Original file line number Diff line number Diff line change
Expand Up @@ -598,11 +598,11 @@ def get_sale_counts(dups: pd.DataFrame) -> pd.DataFrame:
v_counts = (
dups.pin.value_counts()
.reset_index()
.rename(columns={"index": "pin", "pin": "sv_sale_dup_counts"})
.rename(columns={"count": "sv_sale_dup_counts"})
)

# Explicitly specify the merging columns
wagnerlmichael marked this conversation as resolved.
Show resolved Hide resolved
dups = pd.merge(dups, v_counts, on="pin")
dups = pd.merge(dups, v_counts)
wagnerlmichael marked this conversation as resolved.
Show resolved Hide resolved

return dups

Expand Down Expand Up @@ -767,114 +767,156 @@ def z_normalize_groupby(s: pd.Series):

def outlier_type(df: pd.DataFrame, condos: bool) -> pd.DataFrame:
"""
Runs np.select that creates an outlier taxonomy.
Enhances the dataframe by adding two columns for outlier reasons:
- sv_char_outlier_reason: Reasons based on characteristics including price swings.
- sv_char_price_reason: Reasons based on high or low pricing levels.

Inputs:
df (pd.DataFrame): dataframe with necessary columns created from previous functions.
df (pd.DataFrame): Dataframe with necessary columns created from previous functions.
wagnerlmichael marked this conversation as resolved.
Show resolved Hide resolved
Outputs:
df (pd.DataFrame): dataframe with 'sv_outlier_type' column.
df (pd.DataFrame): Dataframe with 'sv_char_outlier_reason' and 'sv_char_price_reason' columns.
"""
if condos == True:
conditions = [
(df["sv_short_owner"] == "Short-term owner")
& (df["sv_pricing"].str.contains("High")),
(df["sv_name_match"] != "No match")
& (df["sv_pricing"].str.contains("High")),
(
df[["sv_buyer_category", "sv_seller_category"]]
.eq("legal_entity")
.any(axis=1)
)
& (df["sv_pricing"].str.contains("High")),
(df["sv_anomaly"] == "Outlier") & (df["sv_pricing"].str.contains("High")),
(df["sv_pricing"].str.contains("High price swing")),
(df["sv_pricing"].str.contains("High")),
(df["sv_short_owner"] == "Short-term owner")
& (df["sv_pricing"].str.contains("Low")),
(df["sv_name_match"] != "No match")
& (df["sv_pricing"].str.contains("Low")),
(
df[["sv_buyer_category", "sv_seller_category"]]
.eq("legal_entity")
.any(axis=1)
)
& (df["sv_pricing"].str.contains("Low")),
(df["sv_anomaly"] == "Outlier") & (df["sv_pricing"].str.contains("Low")),
(df["sv_pricing"].str.contains("Low price swing")),
(df["sv_pricing"].str.contains("Low")),

if condos:
# Define conditions for characteristic-based reasons
char_conditions = [
df["sv_short_owner"] == "Short-term owner",
df["sv_name_match"] != "No match",
df[["sv_buyer_category", "sv_seller_category"]]
.eq("legal_entity")
.any(axis=1),
df["sv_anomaly"] == "Outlier",
df["sv_pricing"].str.contains("High price swing"),
df["sv_pricing"].str.contains("Low price swing"),
]

labels = [
"Home flip sale (high)",
"Family sale (high)",
"Non-person sale (high)",
"Anomaly (high)",
# Define labels for characteristic-based reasons
char_labels = [
"Short-term owner",
"Family sale",
"Non-person sale",
"Statistical anomaly",
"High price swing",
"High price (raw)",
"Home flip sale (low)",
"Family sale (low)",
"Non-person sale (low)",
"Anomaly (low)",
"Low price swing",
"Low price (raw)",
]
wagnerlmichael marked this conversation as resolved.
Show resolved Hide resolved

# Define conditions for price-based reasons
price_conditions = [
df["sv_pricing"].str.contains("High"),
df["sv_pricing"].str.contains("Low"),
]

# Define labels for price-based reasons
price_labels = [
"High price",
"Low price",
]

# Apply np.select to create the new columns
df["sv_char_outlier_reason"] = np.select(
char_conditions, char_labels, default="Not outlier"
)
df["sv_char_price_reason"] = np.select(
price_conditions, price_labels, default="Not outlier"
)

else:
conditions = [
(df["sv_short_owner"] == "Short-term owner")
& (df["sv_pricing"].str.contains("High")),
(df["sv_name_match"] != "No match")
& (df["sv_pricing"].str.contains("High")),
(
df[["sv_buyer_category", "sv_seller_category"]]
.eq("legal_entity")
.any(axis=1)
)
& (df["sv_pricing"].str.contains("High")),
(df["sv_anomaly"] == "Outlier") & (df["sv_pricing"].str.contains("High")),
(df["sv_pricing"].str.contains("High price swing")),
# Define conditions for characteristic-based reasons
char_conditions = [
df["sv_short_owner"] == "Short-term owner",
df["sv_name_match"] != "No match",
df[["sv_buyer_category", "sv_seller_category"]]
.eq("legal_entity")
.any(axis=1),
df["sv_anomaly"] == "Outlier",
df["sv_pricing"].str.contains("High price swing"),
df["sv_pricing"].str.contains("Low price swing"),
]

wagnerlmichael marked this conversation as resolved.
Show resolved Hide resolved
# Define labels for characteristic-based reasons
char_labels = [
"Short-term owner",
"Family sale",
"Non-person sale",
"Statistical anomaly",
"High price swing",
"Low price swing",
]

# Define conditions for price-based reasons
price_conditions = [
(df["sv_pricing"].str.contains("High"))
& (df["sv_which_price"] == "(raw & sqft)"),
(df["sv_pricing"].str.contains("High")) & (df["sv_which_price"] == "(raw)"),
(df["sv_pricing"].str.contains("High"))
& (df["sv_which_price"] == "(sqft)"),
(df["sv_short_owner"] == "Short-term owner")
& (df["sv_pricing"].str.contains("Low")),
(df["sv_name_match"] != "No match")
& (df["sv_pricing"].str.contains("Low")),
(
df[["sv_buyer_category", "sv_seller_category"]]
.eq("legal_entity")
.any(axis=1)
)
& (df["sv_pricing"].str.contains("Low")),
(df["sv_anomaly"] == "Outlier") & (df["sv_pricing"].str.contains("Low")),
(df["sv_pricing"].str.contains("Low price swing")),
(df["sv_pricing"].str.contains("Low"))
& (df["sv_which_price"] == "(raw & sqft)"),
(df["sv_pricing"].str.contains("Low")) & (df["sv_which_price"] == "(raw)"),
(df["sv_pricing"].str.contains("High") & (df["sv_which_price"] == "(raw)")),
(df["sv_pricing"].str.contains("Low") & (df["sv_which_price"] == "(raw)")),
(df["sv_pricing"].str.contains("High"))
& (df["sv_which_price"] == "(sqft)"),
(df["sv_pricing"].str.contains("Low")) & (df["sv_which_price"] == "(sqft)"),
]

labels = [
"Home flip sale (high)",
"Family sale (high)",
"Non-person sale (high)",
"Anomaly (high)",
"High price swing",
# Define labels for price-based reasons
price_labels = [
"High price (raw & sqft)",
"High price (raw)",
"High price (sqft)",
"Home flip sale (low)",
"Family sale (low)",
"Non-person sale (low)",
"Anomaly (low)",
"Low price swing",
"Low price (raw & sqft)",
"Low price (raw)",
"High price",
"Low price",
"High price (sqft)",
"Low price (sqft)",
]

df["sv_outlier_type"] = np.select(conditions, labels, default="Not outlier")
# Combined conditions and labels for reason1 with fallback to character conditions if no price condition is met
combined_conditions = price_conditions + char_conditions
combined_labels = price_labels + char_labels

df["sv_outlier_reason1"] = np.select(
combined_conditions, combined_labels, default="Not outlier"
)

# Adjust for reason2 based on remaining price conditions and char conditions not used in reason1
remaining_conditions = [
(cond & (df["sv_outlier_reason1"] != label))
for cond, label in zip(combined_conditions, combined_labels)
]
df["sv_outlier_reason2"] = np.select(
remaining_conditions, combined_labels, default="Not outlier"
)
wagnerlmichael marked this conversation as resolved.
Show resolved Hide resolved

def find_next_unused_char(row, df):
# Row-wise helper: return the first characteristic label whose condition holds
# for this row and that is not already stored in sv_outlier_reason1 or
# sv_outlier_reason2; return "Not outlier" when no such label exists.
# `df` is accepted as a parameter but is not read inside this function.
# Labels already assigned to this row's first two reason columns.
used_labels = {row["sv_outlier_reason1"], row["sv_outlier_reason2"]}
# Walk the characteristic conditions in list order, so earlier entries take priority.
for cond, label in zip(char_conditions, char_labels):
# Evaluate the condition directly for the current row
# NOTE(review): cond[row.name] looks the row up by its index label; this
# presumably relies on unique index labels so the lookup yields one boolean — confirm.
if label not in used_labels and cond[row.name]:
return label
return "Not outlier"

# Now we apply this function row-wise correctly by also passing the DataFrame reference:
df["sv_outlier_reason3"] = df.apply(find_next_unused_char, df=df, axis=1)
wagnerlmichael marked this conversation as resolved.
Show resolved Hide resolved

def transform_row(row):
# Row-wise helper: split a combined "(raw & sqft)" price label across two
# reason columns. Rows whose sv_outlier_reason1 is anything else are returned unchanged.
if row["sv_outlier_reason1"] in [
"Low price (raw & sqft)",
"High price (raw & sqft)",
]:
# Determine price type based on the value in sv_outlier_reason1
price_type = (
"Low price" if "Low" in row["sv_outlier_reason1"] else "High price"
)

# Update sv_outlier_reason1 to just 'Low price' or 'High price'
row["sv_outlier_reason1"] = price_type

# Move the current sv_outlier_reason2 to sv_outlier_reason3
# (whatever sv_outlier_reason3 previously held is overwritten)
row["sv_outlier_reason3"] = row["sv_outlier_reason2"]

# Update sv_outlier_reason2 to the new value '(sqft)'
# i.e. "Low price (sqft)" or "High price (sqft)", matching price_type
row["sv_outlier_reason2"] = f"{price_type} (sqft)"

return row

# Apply the transformation
df = df.apply(transform_row, axis=1)
wagnerlmichael marked this conversation as resolved.
Show resolved Hide resolved

return df

Expand All @@ -889,8 +931,17 @@ def outlier_flag(df: pd.DataFrame) -> pd.DataFrame:
df (pd.DataFrame): dataframe with 'is_outlier' column
"""

options = ["High price", "Low price", "High price (sqft)", "Low price (sqft)"]
pattern = r"\b(?:" + "|".join(map(re.escape, options)) + r")\b"

df["sv_is_outlier"] = np.select(
[(df["sv_outlier_type"] == "Not outlier")], [0], default=1
[
df["sv_outlier_reason1"].str.contains(
pattern, case=False, na=False, regex=True
)
],
[1],
default=0,
wagnerlmichael marked this conversation as resolved.
Show resolved Hide resolved
)

return df
Expand Down
Loading
Loading