In [3]:
import pandas as pd
from sklearn.ensemble import IsolationForest

def remove_outliers_isolation_forest(df_path, contamination=0.2, return_outliers=False, 
                                     output_path='cleaned_data.csv'):
    # Check if input is an Excel file and convert to CSV
    if df_path.endswith('.xlsx'):
        excel_data = pd.read_excel(df_path)
        csv_path = 'temp_data.csv'
        excel_data.to_csv(csv_path, index=False)
        df = pd.read_csv(csv_path)
    else:
        df = pd.read_csv(df_path)
    
    # 1. Select numeric columns
    X = df.select_dtypes(include=[float, int])
    print(f'before: {len(X)}')

    # 2. Fit Isolation Forest
    iso = IsolationForest(contamination=contamination)
    outlier_pred = iso.fit_predict(X)  # -1 = outlier, 1 = inlier

    # 3. Append prediction to DataFrame
    df_temp = df.copy()
    df_temp['is_outlier'] = outlier_pred

    # 4. Filter rows
    df_clean = df_temp[df_temp['is_outlier'] == 1].drop(columns='is_outlier')

    # Save cleaned DataFrame to CSV
    df_clean.to_csv(output_path, index=False)

    if return_outliers:
        df_outliers = df_temp[df_temp['is_outlier'] == -1].drop(columns='is_outlier')
        return df_clean, df_outliers

    print(f'after: {len(df_clean)}')
    return df_clean



In [4]:
cleaned_df = remove_outliers_isolation_forest('20250624/20250624_timeslot_1.xlsx', output_path='20250624/20250624_timeslot_1_cleaned.csv')

before: 1351
after: 1081


In [5]:
cleaned_df = remove_outliers_isolation_forest('20250624/20250624_timeslot_2.xlsx', output_path='20250624/20250624_timeslot_2_cleaned.csv')

before: 684
after: 547


In [6]:
cleaned_df = remove_outliers_isolation_forest('20250625/20250625_timeslot_1.xlsx', output_path='20250625/20250625_timeslot_1_cleaned.csv')

before: 1436
after: 1149


In [8]:
cleaned_df = remove_outliers_isolation_forest('20250626/20250626_timeslot_1.xlsx', output_path='20250626/20250626_timeslot_1_cleaned.csv')

before: 1881
after: 1505


In [9]:
cleaned_df = remove_outliers_isolation_forest('20250627/20250627_timeslot_1.xlsx', output_path='20250627/20250627_timeslot_1_cleaned.csv')

before: 1827
after: 1461


In [10]:
cleaned_df = remove_outliers_isolation_forest('20250628/20250628_timeslot_1.xlsx', output_path='20250628/20250628_timeslot_1_cleaned.csv')

before: 1920
after: 1536


In [11]:
cleaned_df = remove_outliers_isolation_forest('20250629/20250629_timeslot_1.xlsx', output_path='20250629/20250629_timeslot_1_cleaned.csv')

before: 1116
after: 893


In [12]:
cleaned_df = remove_outliers_isolation_forest('20250630/20250630_timeslot_1.xlsx', output_path='20250630/20250630_timeslot_1_cleaned.csv')

before: 2002
after: 1601


In [58]:
cleaned_df = remove_outliers_isolation_forest('20250529/20250529_timeslot_3_Derivative_data.csv', output_path='20250529/20250529_timeslot_3_Derivative_data_cleaned.csv')

before: 869
after: 782


In [59]:
cleaned_df = remove_outliers_isolation_forest('20250530/20250530_timeslot_1.csv', output_path='20250530/20250530_timeslot_1_cleaned.csv')

before: 1432
after: 1288


In [60]:
cleaned_df = remove_outliers_isolation_forest('20250530/20250530_timeslot_2.csv', output_path='20250530/20250530_timeslot_2_cleaned.csv')

before: 1228
after: 1105


  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count


In [61]:
cleaned_df = remove_outliers_isolation_forest('20250530/20250530_timeslot_1_Derivative_data.csv', output_path='20250530/20250530_timeslot_1_Derivative_data_cleaned.csv')

before: 1432
after: 1288


In [62]:
cleaned_df = remove_outliers_isolation_forest('20250530/20250530_timeslot_2_Derivative_data.csv', output_path='20250530/20250530_timeslot_2_Derivative_data_cleaned.csv')

before: 1228
after: 1105


In [63]:
cleaned_df = remove_outliers_isolation_forest('20250531/20250531_timeslot_1.csv', output_path='20250531/20250531_timeslot_1_cleaned.csv')

before: 1250
after: 1125


  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count


In [64]:
cleaned_df = remove_outliers_isolation_forest('20250531/20250531_timeslot_2.csv', output_path='20250531/20250531_timeslot_2_cleaned.csv')

before: 609
after: 548


In [65]:
cleaned_df = remove_outliers_isolation_forest('20250531/20250531_timeslot_3.csv', output_path='20250531/20250531_timeslot_3_cleaned.csv')

before: 807
after: 726


  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count


In [66]:
cleaned_df = remove_outliers_isolation_forest('20250531/20250531_timeslot_1_Derivative_data.csv', output_path='20250531/20250531_timeslot_1_Derivative_data_cleaned.csv')

before: 1250
after: 1125


In [67]:
cleaned_df = remove_outliers_isolation_forest('20250531/20250531_timeslot_2_Derivative_data.csv', output_path='20250531/20250531_timeslot_2_Derivative_data_cleaned.csv')

before: 609
after: 548


In [68]:
cleaned_df = remove_outliers_isolation_forest('20250531/20250531_timeslot_3_Derivative_data.csv', output_path='20250531/20250531_timeslot_3_Derivative_data_cleaned.csv')

before: 807
after: 726


In [69]:
cleaned_df = remove_outliers_isolation_forest('20250601/20250601_timeslot_1.csv', output_path='20250601/20250601_timeslot_1_cleaned.csv')

before: 1152
after: 1036


In [70]:
cleaned_df = remove_outliers_isolation_forest('20250601/20250601_timeslot_2.csv', output_path='20250601/20250601_timeslot_2_cleaned.csv')

before: 671
after: 604


In [71]:
cleaned_df = remove_outliers_isolation_forest('20250601/20250601_timeslot_3.csv', output_path='20250601/20250601_timeslot_3_cleaned.csv')

before: 830
after: 747


  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count


In [72]:
cleaned_df = remove_outliers_isolation_forest('20250601/20250601_timeslot_1_Derivative_data.csv', output_path='20250601/20250601_timeslot_1_Derivative_data_cleaned.csv')

before: 1152
after: 1036


In [73]:
cleaned_df = remove_outliers_isolation_forest('20250601/20250601_timeslot_2_Derivative_data.csv', output_path='20250601/20250601_timeslot_2_Derivative_data_cleaned.csv')

before: 671
after: 604


In [74]:
cleaned_df = remove_outliers_isolation_forest('20250601/20250601_timeslot_3_Derivative_data.csv', output_path='20250601/20250601_timeslot_3_Derivative_data_cleaned.csv')

before: 830
after: 747


In [75]:
cleaned_df = remove_outliers_isolation_forest('20250602/20250602_timeslot_1.csv', output_path='20250602/20250602_timeslot_1_cleaned.csv')

before: 1251
after: 1126


  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count


In [76]:
cleaned_df = remove_outliers_isolation_forest('20250602/20250602_timeslot_2.csv', output_path='20250602/20250602_timeslot_2_cleaned.csv')

before: 785
after: 706


  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count


In [77]:
cleaned_df = remove_outliers_isolation_forest('20250602/20250602_timeslot_3.csv', output_path='20250602/20250602_timeslot_3_cleaned.csv')

before: 783
after: 704


  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count


In [78]:
cleaned_df = remove_outliers_isolation_forest('20250602/20250602_timeslot_1_Derivative_data.csv', output_path='20250602/20250602_timeslot_1_Derivative_data_cleaned.csv')

before: 1251
after: 1126


In [79]:
cleaned_df = remove_outliers_isolation_forest('20250602/20250602_timeslot_2_Derivative_data.csv', output_path='20250602/20250602_timeslot_2_Derivative_data_cleaned.csv')

before: 785
after: 706


In [80]:
cleaned_df = remove_outliers_isolation_forest('20250602/20250602_timeslot_3_Derivative_data.csv', output_path='20250602/20250602_timeslot_3_Derivative_data_cleaned.csv')

before: 783
after: 704


In [5]:
cleaned_df = remove_outliers_isolation_forest('20250528/20250528_all_timeslot_selected.csv', output_path='20250528/20250528_all_timeslot_cleaned_selected.csv')

before: 3069


ValueError: Must have equal len keys and value when setting with an ndarray