ccao-data · wagnerlmichael · Jun 18, 2024 · May 29, 2024 · May 29, 2024 · May 30, 2024
diff --git a/glue/flagging_script_glue/flagging.py b/glue/flagging_script_glue/flagging.py
@@ -598,11 +598,11 @@ def get_sale_counts(dups: pd.DataFrame) -> pd.DataFrame:
     v_counts = (
         dups.pin.value_counts()
         .reset_index()
-        .rename(columns={"index": "pin", "pin": "sv_sale_dup_counts"})
+        .rename(columns={"count": "sv_sale_dup_counts"})
     )
 
     # Explicitly specify the merging columns
-    dups = pd.merge(dups, v_counts, on="pin")
+    dups = pd.merge(dups, v_counts)
 
     return dups
 
@@ -767,114 +767,156 @@ def z_normalize_groupby(s: pd.Series):
 
 def outlier_type(df: pd.DataFrame, condos: bool) -> pd.DataFrame:
     """
-    Runs np.select that creates an outlier taxonomy.
+    Enhances the dataframe by adding numbered outlier reason columns:
+    - sv_outlier_reason1 / sv_outlier_reason2 / sv_outlier_reason3: price-based and characteristic-based reasons.
+    - sv_char_outlier_reason / sv_char_price_reason: added only when condos is True.
+
     Inputs:
-        df (pd.DataFrame): dataframe with necessary columns created from previous functions.
+        df (pd.DataFrame): Dataframe with necessary columns created from previous functions.
     Outputs:
-        df (pd.DataFrame): dataframe with 'sv_outlier_type' column.
+        df (pd.DataFrame): Dataframe with 'sv_outlier_reason1', 'sv_outlier_reason2' and 'sv_outlier_reason3' columns.
     """
-    if condos == True:
-        conditions = [
-            (df["sv_short_owner"] == "Short-term owner")
-            & (df["sv_pricing"].str.contains("High")),
-            (df["sv_name_match"] != "No match")
-            & (df["sv_pricing"].str.contains("High")),
-            (
-                df[["sv_buyer_category", "sv_seller_category"]]
-                .eq("legal_entity")
-                .any(axis=1)
-            )
-            & (df["sv_pricing"].str.contains("High")),
-            (df["sv_anomaly"] == "Outlier") & (df["sv_pricing"].str.contains("High")),
-            (df["sv_pricing"].str.contains("High price swing")),
-            (df["sv_pricing"].str.contains("High")),
-            (df["sv_short_owner"] == "Short-term owner")
-            & (df["sv_pricing"].str.contains("Low")),
-            (df["sv_name_match"] != "No match")
-            & (df["sv_pricing"].str.contains("Low")),
-            (
-                df[["sv_buyer_category", "sv_seller_category"]]
-                .eq("legal_entity")
-                .any(axis=1)
-            )
-            & (df["sv_pricing"].str.contains("Low")),
-            (df["sv_anomaly"] == "Outlier") & (df["sv_pricing"].str.contains("Low")),
-            (df["sv_pricing"].str.contains("Low price swing")),
-            (df["sv_pricing"].str.contains("Low")),
+
+    if condos:
+        # Define conditions for characteristic-based reasons
+        char_conditions = [
+            df["sv_short_owner"] == "Short-term owner",
+            df["sv_name_match"] != "No match",
+            df[["sv_buyer_category", "sv_seller_category"]]
+            .eq("legal_entity")
+            .any(axis=1),
+            df["sv_anomaly"] == "Outlier",
+            df["sv_pricing"].str.contains("High price swing"),
+            df["sv_pricing"].str.contains("Low price swing"),
         ]
 
-        labels = [
-            "Home flip sale (high)",
-            "Family sale (high)",
-            "Non-person sale (high)",
-            "Anomaly (high)",
+        # Define labels for characteristic-based reasons
+        char_labels = [
+            "Short-term owner",
+            "Family sale",
+            "Non-person sale",
+            "Statistical anomaly",
             "High price swing",
-            "High price (raw)",
-            "Home flip sale (low)",
-            "Family sale (low)",
-            "Non-person sale (low)",
-            "Anomaly (low)",
             "Low price swing",
-            "Low price (raw)",
         ]
 
+        # Define conditions for price-based reasons
+        price_conditions = [
+            df["sv_pricing"].str.contains("High"),
+            df["sv_pricing"].str.contains("Low"),
+        ]
+
+        # Define labels for price-based reasons
+        price_labels = [
+            "High price",
+            "Low price",
+        ]
+
+        # Apply np.select to create the new columns
+        df["sv_char_outlier_reason"] = np.select(
+            char_conditions, char_labels, default="Not outlier"
+        )
+        df["sv_char_price_reason"] = np.select(
+            price_conditions, price_labels, default="Not outlier"
+        )
+
     else:
-        conditions = [
-            (df["sv_short_owner"] == "Short-term owner")
-            & (df["sv_pricing"].str.contains("High")),
-            (df["sv_name_match"] != "No match")
-            & (df["sv_pricing"].str.contains("High")),
-            (
-                df[["sv_buyer_category", "sv_seller_category"]]
-                .eq("legal_entity")
-                .any(axis=1)
-            )
-            & (df["sv_pricing"].str.contains("High")),
-            (df["sv_anomaly"] == "Outlier") & (df["sv_pricing"].str.contains("High")),
-            (df["sv_pricing"].str.contains("High price swing")),
+        # Define conditions for characteristic-based reasons
+        char_conditions = [
+            df["sv_short_owner"] == "Short-term owner",
+            df["sv_name_match"] != "No match",
+            df[["sv_buyer_category", "sv_seller_category"]]
+            .eq("legal_entity")
+            .any(axis=1),
+            df["sv_anomaly"] == "Outlier",
+            df["sv_pricing"].str.contains("High price swing"),
+            df["sv_pricing"].str.contains("Low price swing"),
+        ]
+
+        # Define labels for characteristic-based reasons
+        char_labels = [
+            "Short-term owner",
+            "Family sale",
+            "Non-person sale",
+            "Statistical anomaly",
+            "High price swing",
+            "Low price swing",
+        ]
+
+        # Define conditions for price-based reasons
+        price_conditions = [
             (df["sv_pricing"].str.contains("High"))
             & (df["sv_which_price"] == "(raw & sqft)"),
-            (df["sv_pricing"].str.contains("High")) & (df["sv_which_price"] == "(raw)"),
-            (df["sv_pricing"].str.contains("High"))
-            & (df["sv_which_price"] == "(sqft)"),
-            (df["sv_short_owner"] == "Short-term owner")
-            & (df["sv_pricing"].str.contains("Low")),
-            (df["sv_name_match"] != "No match")
-            & (df["sv_pricing"].str.contains("Low")),
-            (
-                df[["sv_buyer_category", "sv_seller_category"]]
-                .eq("legal_entity")
-                .any(axis=1)
-            )
-            & (df["sv_pricing"].str.contains("Low")),
-            (df["sv_anomaly"] == "Outlier") & (df["sv_pricing"].str.contains("Low")),
-            (df["sv_pricing"].str.contains("Low price swing")),
             (df["sv_pricing"].str.contains("Low"))
             & (df["sv_which_price"] == "(raw & sqft)"),
-            (df["sv_pricing"].str.contains("Low")) & (df["sv_which_price"] == "(raw)"),
+            (df["sv_pricing"].str.contains("High") & (df["sv_which_price"] == "(raw)")),
+            (df["sv_pricing"].str.contains("Low") & (df["sv_which_price"] == "(raw)")),
+            (df["sv_pricing"].str.contains("High"))
+            & (df["sv_which_price"] == "(sqft)"),
             (df["sv_pricing"].str.contains("Low")) & (df["sv_which_price"] == "(sqft)"),
         ]
 
-        labels = [
-            "Home flip sale (high)",
-            "Family sale (high)",
-            "Non-person sale (high)",
-            "Anomaly (high)",
-            "High price swing",
+        # Define labels for price-based reasons
+        price_labels = [
             "High price (raw & sqft)",
-            "High price (raw)",
-            "High price (sqft)",
-            "Home flip sale (low)",
-            "Family sale (low)",
-            "Non-person sale (low)",
-            "Anomaly (low)",
-            "Low price swing",
             "Low price (raw & sqft)",
-            "Low price (raw)",
+            "High price",
+            "Low price",
+            "High price (sqft)",
             "Low price (sqft)",
         ]
 
-    df["sv_outlier_type"] = np.select(conditions, labels, default="Not outlier")
+    # Combined conditions and labels for reason1: price conditions are checked first, then characteristic conditions as a fallback
+    combined_conditions = price_conditions + char_conditions
+    combined_labels = price_labels + char_labels
+
+    df["sv_outlier_reason1"] = np.select(
+        combined_conditions, combined_labels, default="Not outlier"
+    )
+
+    # reason2 is the first matching condition whose label differs from the label already assigned to reason1
+    remaining_conditions = [
+        (cond & (df["sv_outlier_reason1"] != label))
+        for cond, label in zip(combined_conditions, combined_labels)
+    ]
+    df["sv_outlier_reason2"] = np.select(
+        remaining_conditions, combined_labels, default="Not outlier"
+    )
+
+    def find_next_unused_char(row, df):
+        used_labels = {row["sv_outlier_reason1"], row["sv_outlier_reason2"]}
+        for cond, label in zip(char_conditions, char_labels):
+            # cond is a boolean Series built from df; look up this row's value by its index label
+            if label not in used_labels and cond[row.name]:
+                return label
+        return "Not outlier"
+
+    # Apply the function row-wise; the df keyword is forwarded to find_next_unused_char, which does not use it:
+    df["sv_outlier_reason3"] = df.apply(find_next_unused_char, df=df, axis=1)
+
+    def transform_row(row):
+        if row["sv_outlier_reason1"] in [
+            "Low price (raw & sqft)",
+            "High price (raw & sqft)",
+        ]:
+            # Determine price type based on the value in sv_outlier_reason1
+            price_type = (
+                "Low price" if "Low" in row["sv_outlier_reason1"] else "High price"
+            )
+
+            # Update sv_outlier_reason1 to just 'Low price' or 'High price'
+            row["sv_outlier_reason1"] = price_type
+
+            # Move the current sv_outlier_reason2 to sv_outlier_reason3
+            row["sv_outlier_reason3"] = row["sv_outlier_reason2"]
+
+            # Set sv_outlier_reason2 to the sqft variant of the label, e.g. 'High price (sqft)'
+            row["sv_outlier_reason2"] = f"{price_type} (sqft)"
+
+        return row
+
+    # Apply the transformation
+    df = df.apply(transform_row, axis=1)
 
     return df
 
@@ -889,8 +931,17 @@ def outlier_flag(df: pd.DataFrame) -> pd.DataFrame:
         df (pd.DataFrame): dataframe with 'is_outlier' column
     """
 
+    options = ["High price", "Low price", "High price (sqft)", "Low price (sqft)"]
+    pattern = r"\b(?:" + "|".join(map(re.escape, options)) + r")\b"
+
     df["sv_is_outlier"] = np.select(
-        [(df["sv_outlier_type"] == "Not outlier")], [0], default=1
+        [
+            df["sv_outlier_reason1"].str.contains(
+                pattern, case=False, na=False, regex=True
+            )
+        ],
+        [1],
+        default=0,
     )
 
     return df