In [1]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np
import matplotlib.pyplot as plt

# from google.colab import drive
# drive.mount('/content/drive')

# File paths
original_file_path = 'data.csv'
processed_file_path = 'new.csv'


def _fill_gaps_with_neighbour_mean(frame, gap_mask, skip_columns=()):
    """Fill the cells flagged in *gap_mask* from their nearest valid neighbours.

    For each column of *frame* not listed in *skip_columns*:

    * a gap with a valid value on both sides receives the mean of the nearest
      valid value above and the nearest valid value below (every cell of a
      multi-row gap receives that same mean);
    * a gap touching the top or bottom of the column receives the single
      nearest valid value;
    * a column with no valid value at all is left missing instead of raising.

    Non-numeric columns cannot be averaged, so their gaps take the nearest
    valid value above, or the one below when nothing precedes the gap.

    The frame is modified in place; nothing is returned.
    """
    for col in frame.columns:
        if col in skip_columns or not gap_mask[col].any():
            continue

        # Blank out every flagged cell (including '' markers) so that
        # ffill/bfill only ever propagate genuinely valid values.
        known = frame[col].mask(gap_mask[col])
        before = known.ffill()   # nearest valid value above each row
        after = known.bfill()    # nearest valid value below each row

        if pd.api.types.is_numeric_dtype(known):
            # The midpoint is NaN wherever one side is absent; fall back to
            # whichever neighbour exists.
            fill = ((before + after) / 2).fillna(before).fillna(after)
        else:
            fill = before.fillna(after)

        frame[col] = known.fillna(fill)


# Check if the processed file exists
if not os.path.exists(processed_file_path):
    data = pd.read_csv(original_file_path)

    # 'Metric' holds "<date> <time>"; split on the first space only so that a
    # time part containing a further space (e.g. an AM/PM suffix) still yields
    # exactly two columns.
    data[['metric_date', 'metric_time']] = data['Metric'].str.split(' ', n=1, expand=True)

    # A cell counts as missing when it is NaN or an empty string.
    missing_values = data.isna() | (data == '')
    _fill_gaps_with_neighbour_mean(
        data,
        missing_values,
        skip_columns=('metric_date', 'metric_time', 'Metric'),
    )

    data['metric_date'] = pd.to_datetime(data['metric_date'], dayfirst=True)

    # Feature Engineering
    # Express the time of day as seconds since midnight.
    clock = pd.to_datetime(data['metric_time'])
    data['metric_time'] = clock.dt.hour * 3600 + clock.dt.minute * 60 + clock.dt.second
#     data['calculated_power'] = data['AC VOLTAGE-3 (V)'] * data['AC CURRENT-1 (A)']

    # NOTE(review): the target column 'PM2.5 (µg/m³)' is also one of the two
    # features, so the least-squares fit is the identity on PM2.5 (coefficient
    # ~1, intercept and wind-speed coefficient ~0) and 'dust_conc' below ends
    # up as a copy of 'PM2.5 (µg/m³)'. Left unchanged because the intended
    # target column is not visible here -- TODO confirm and replace initial_y.
    initial_X = data[['PM2.5 (µg/m³)', 'WS (m/s)']]
    initial_y = data['PM2.5 (µg/m³)']

    model = LinearRegression()
    model.fit(initial_X, initial_y)

    intercept = model.intercept_
    coef_pm25, coef_ws = model.coef_

    print(intercept, coef_pm25, coef_ws)

    data['dust_conc'] = intercept + coef_pm25 * data['PM2.5 (µg/m³)'] + coef_ws * data['WS (m/s)']

    # Cache the processed frame so later runs skip the work above.
    data.to_csv(processed_file_path, index=False)
else:
    # NOTE: read back without date parsing, so 'metric_date' is a string
    # column here, whereas the first-run branch leaves it as datetime64.
    data = pd.read_csv(processed_file_path)

4.263256414560601e-14 0.9999999999999994 1.5370078578076593e-16
