# In [None]:
import pandas as pd
from sklearn.linear_model import LinearRegression

# Load the raw trip records for 2021 and 2022, one Parquet file per year
df_2021, df_2022 = map(
    pd.read_parquet,
    ("Dataset/2021.parquet", "Dataset/2022.parquet"),
)

def preprocess_data(df, year):
    """Aggregate trip-level records into daily totals for one year.

    Args:
        df: Trip-level DataFrame with a ``pickup_datetime`` column (any
            values ``pd.to_datetime`` can parse) plus ``driver_pay`` and
            ``trip_miles`` columns. The frame is not modified.
        year: Value written to the ``year`` column of every output row.

    Returns:
        A DataFrame with one row per calendar date and the columns
        ``pickup_datetime`` (``datetime.date`` objects), ``driver_pay`` and
        ``trip_miles`` (sums over that date), and ``year``.
    """
    # Parse into a local Series instead of assigning back into ``df`` so the
    # caller's frame keeps its original column and dtype.
    pickup = pd.to_datetime(df['pickup_datetime'])
    daily_data = (
        df.groupby(pickup.dt.date)[['driver_pay', 'trip_miles']]
        .sum()
        .reset_index()
    )
    daily_data['year'] = year
    return daily_data

# Reduce each year's trips to daily totals, then stack both years into a
# single frame with a fresh 0..n-1 index
yearly_frames = [
    preprocess_data(df_2021, 2021),
    preprocess_data(df_2022, 2022),
]
data_2021, data_2022 = yearly_frames
combined_data = pd.concat(yearly_frames, ignore_index=True)

# Columns forecast for 2023
target_cols = ['driver_pay', 'trip_miles']

# combined_data['pickup_datetime'] holds plain date objects; convert them to
# datetime64 so the .dt accessors (month, day, dayofyear) are available.
observed_dates = pd.to_datetime(combined_data['pickup_datetime'])

# Calculate the average for each calendar day across both years.
# Grouping by (month, day) rather than the full date is what lets the same day
# in 2021 and 2022 land in one group; full dates never coincide across years.
average_data = (
    combined_data[target_cols]
    .groupby([
        observed_dates.dt.month.rename('month'),
        observed_dates.dt.day.rename('day'),
    ])
    .mean()
    .reset_index()
)

# Create a date range for the year 2023
date_range_2023 = pd.date_range(start='2023-01-01', end='2023-12-31', freq='D')

# One row per 2023 day, carrying that calendar day's historical average in the
# 'driver_pay' and 'trip_miles' columns. The left merge keeps the date order
# and leaves NaN for any day that has no historical match.
predictions_2023 = pd.DataFrame({
    'pickup_datetime': date_range_2023,
    'month': date_range_2023.month,
    'day': date_range_2023.day,
})
predictions_2023 = (
    predictions_2023
    .merge(average_data, on=['month', 'day'], how='left')
    .drop(columns=['month', 'day'])
)

# Train a Linear Regression model.
# Independent variables: the non-target information in combined_data (the date
# and the year), encoded numerically because the regressor cannot take date
# objects. Targets: driver_pay and trip_miles, fitted jointly.
X = pd.DataFrame({
    'year': combined_data['year'].to_numpy(),
    'day_of_year': observed_dates.dt.dayofyear.to_numpy(),
})
y = combined_data[target_cols]

model = LinearRegression()
model.fit(X, y)

# Predict driver_pay and trip_miles for every day of 2023 using the same
# feature columns, in the same order, that the model was fitted on.
X_2023 = pd.DataFrame({
    'year': date_range_2023.year,
    'day_of_year': date_range_2023.dayofyear,
})
predicted_2023 = model.predict(X_2023)  # shape (n_days, 2), ordered as target_cols
predictions_2023['predicted_driver_pay'] = predicted_2023[:, 0]
predictions_2023['predicted_trip_miles'] = predicted_2023[:, 1]

# Save the predictions to a Parquet file
predictions_2023.to_parquet("Dataset/2023.parquet")