## **Preprocessing Data**

**Handling Missing Values**

In [12]:
import pandas as pd
# Daily price observations. Several calendar days are absent from the list
# (e.g. 2024-01-03), which leaves gaps in the daily series.
observed_dates = [
    "2024-01-01", "2024-01-02", "2024-01-04", "2024-01-05", "2024-01-07",
    "2024-01-08", "2024-01-10", "2024-01-13", "2024-01-14",
]
observed_prices = [100, 101, 103, 104, 106, 107, 109, 112, 113]

# Build the frame directly on a parsed, named DatetimeIndex rather than
# converting a string column and promoting it to the index afterwards.
data = pd.DataFrame(
    {"Price": observed_prices},
    index=pd.DatetimeIndex(observed_dates, name="Date"),
)
data

Unnamed: 0_level_0,Price
Date,Unnamed: 1_level_1
2024-01-01,100
2024-01-02,101
2024-01-04,103
2024-01-05,104
2024-01-07,106
2024-01-08,107
2024-01-10,109
2024-01-13,112
2024-01-14,113


In [13]:
# Build a complete daily date range spanning the first to the last observation.
first_day, last_day = data.index.min(), data.index.max()
complete_index = pd.date_range(start=first_day, end=last_day, freq="D")

# Align the frame to that full calendar: days without an observation become
# rows whose 'Price' is NaN.
data = data.reindex(index=complete_index)
data

Unnamed: 0,Price
2024-01-01,100.0
2024-01-02,101.0
2024-01-03,
2024-01-04,103.0
2024-01-05,104.0
2024-01-06,
2024-01-07,106.0
2024-01-08,107.0
2024-01-09,
2024-01-10,109.0


**Imputation**

In [14]:
# Each section below fills the NaN gaps in `data` with a different strategy and
# prints the result; `data` itself is left unchanged.
print('Original Data:\n', data)

# Mean imputation: every gap receives the column mean.
mean_value = data.mean()
data_mean_imputed = data.fillna(value=mean_value)
print('-----------------\nMean:', mean_value.values[0])
print('-----------------\nMean Imputed Data:\n', data_mean_imputed)

# Median imputation: every gap receives the column median.
median_value = data.median()
data_median_imputed = data.fillna(value=median_value)
print('-----------------\nMedian:', median_value.values[0])
print('-----------------\nMedian Imputed Data:\n', data_median_imputed)

# Mode imputation: mode() returns a DataFrame, so its first row / first column
# entry is taken as the single fill value.
mode_value = data.mode()
first_mode = mode_value.values[0][0]
data_mode_imputed = data.fillna(value=first_mode)
print('-----------------\nMode:', first_mode)
print('-----------------\nMode Imputed Data:\n', data_mode_imputed)

# Forward fill: carry the last observed value into each gap.
data_forward_fill = data.ffill()
print('-----------------\nForward Fill:\n', data_forward_fill)

# Backward fill: pull the next observed value back into each gap.
data_backward_fill = data.bfill()
print('-----------------\nBackward Fill:\n', data_backward_fill)

Original Data:
             Price
2024-01-01  100.0
2024-01-02  101.0
2024-01-03    NaN
2024-01-04  103.0
2024-01-05  104.0
2024-01-06    NaN
2024-01-07  106.0
2024-01-08  107.0
2024-01-09    NaN
2024-01-10  109.0
2024-01-11    NaN
2024-01-12    NaN
2024-01-13  112.0
2024-01-14  113.0
-----------------
Mean: 106.11111111111111
-----------------
Mean Imputed Data:
                  Price
2024-01-01  100.000000
2024-01-02  101.000000
2024-01-03  106.111111
2024-01-04  103.000000
2024-01-05  104.000000
2024-01-06  106.111111
2024-01-07  106.000000
2024-01-08  107.000000
2024-01-09  106.111111
2024-01-10  109.000000
2024-01-11  106.111111
2024-01-12  106.111111
2024-01-13  112.000000
2024-01-14  113.000000
-----------------
Median: 106.0
-----------------
Median Imputed Data:
             Price
2024-01-01  100.0
2024-01-02  101.0
2024-01-03  106.0
2024-01-04  103.0
2024-01-05  104.0
2024-01-06  106.0
2024-01-07  106.0
2024-01-08  107.0
2024-01-09  106.0
2024-01-10  109.0
2024-01-11  106.0


**Interpolation**

In [15]:
# Three ways of estimating the NaN gaps in `data` from neighbouring values.
# Each call returns a new frame; `data` itself is not modified.

# Straight line between the surrounding known points.
data_linear_interpolation = data.interpolate(method="linear")

# Second-order curves: `order=2` is passed to both the polynomial and the
# spline method.
second_order = {"order": 2}
data_polynomial_interpolation = data.interpolate(method="polynomial", **second_order)
data_spline_interpolation = data.interpolate(method="spline", **second_order)

# Print the untouched input first, then each variant under its own heading.
print('Original Data:\n', data)
for heading, frame in (
    ('\nLinear Interpolation:\n', data_linear_interpolation),
    ('\nPolynomial Interpolation:\n', data_polynomial_interpolation),
    ('\nSpline Interpolation:\n', data_spline_interpolation),
):
    print(heading, frame)

Original Data:
             Price
2024-01-01  100.0
2024-01-02  101.0
2024-01-03    NaN
2024-01-04  103.0
2024-01-05  104.0
2024-01-06    NaN
2024-01-07  106.0
2024-01-08  107.0
2024-01-09    NaN
2024-01-10  109.0
2024-01-11    NaN
2024-01-12    NaN
2024-01-13  112.0
2024-01-14  113.0

Linear Interpolation:
             Price
2024-01-01  100.0
2024-01-02  101.0
2024-01-03  102.0
2024-01-04  103.0
2024-01-05  104.0
2024-01-06  105.0
2024-01-07  106.0
2024-01-08  107.0
2024-01-09  108.0
2024-01-10  109.0
2024-01-11  110.0
2024-01-12  111.0
2024-01-13  112.0
2024-01-14  113.0

Polynomial Interpolation:
             Price
2024-01-01  100.0
2024-01-02  101.0
2024-01-03  102.0
2024-01-04  103.0
2024-01-05  104.0
2024-01-06  105.0
2024-01-07  106.0
2024-01-08  107.0
2024-01-09  108.0
2024-01-10  109.0
2024-01-11  110.0
2024-01-12  111.0
2024-01-13  112.0
2024-01-14  113.0

Spline Interpolation:
             Price
2024-01-01  100.0
2024-01-02  101.0
2024-01-03  102.0
2024-01-04  103.0
2024-01-

**Predictive Model**

In [16]:
# Attach a second column to `data`: 14 sales figures, assigned positionally
# (one per row of the frame).
daily_sales = [
    700, 900, 1000, 2000, 1500, 3000, 2300,
    4000, 5900, 6000, 6500, 7000, 5700, 6300,
]
data['Sales'] = daily_sales
data

Unnamed: 0,Price,Sales
2024-01-01,100.0,700
2024-01-02,101.0,900
2024-01-03,,1000
2024-01-04,103.0,2000
2024-01-05,104.0,1500
2024-01-06,,3000
2024-01-07,106.0,2300
2024-01-08,107.0,4000
2024-01-09,,5900
2024-01-10,109.0,6000


In [17]:
from sklearn.linear_model import LinearRegression

# Model-based imputation: learn Price as a linear function of Sales from the
# rows where Price is known, then predict Price for the rows where it is NaN.
price_is_missing = data['Price'].isna()
known_data = data.loc[~price_is_missing]
missing_data = data.loc[price_is_missing]

# Train on the known rows. The target is passed as a 1-D Series so that
# predict() returns a flat array that lines up one-to-one with the rows
# selected by the mask below (a 2-D target would yield an (n, 1) array).
model = LinearRegression()
model.fit(known_data[['Sales']], known_data['Price'])

# This cell writes into `data`, so after one run no NaNs remain. Skip the
# prediction step in that case: predict() rejects an input with zero rows,
# which previously made the cell fail when executed a second time.
if not missing_data.empty:
    predicted_values = model.predict(missing_data[['Sales']])
    data.loc[price_is_missing, 'Price'] = predicted_values
data

Unnamed: 0,Price,Sales
2024-01-01,100.0,700
2024-01-02,101.0,900
2024-01-03,101.723063,1000
2024-01-04,103.0,2000
2024-01-05,104.0,1500
2024-01-06,105.59487,3000
2024-01-07,106.0,2300
2024-01-08,107.0,4000
2024-01-09,111.208991,5900
2024-01-10,109.0,6000


**Resampling**

**1. Downsampling**

In [18]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
import yfinance as yf

In [19]:
# Download AAPL price history starting at 2023-01-01 via yfinance.
# NOTE(review): no `end` is passed, so the last row presumably depends on the
# day the cell is run and the outputs below will drift between runs — pass an
# explicit `end` if the notebook needs to be reproducible.
stock_data = yf.download('AAPL', start='2023-01-01')

[*********************100%***********************]  1 of 1 completed


In [23]:
# Preview the 'Close' prices; the result is a frame with one column per ticker
# (only AAPL here), indexed by trading date.
stock_data['Close']

Ticker,AAPL
Date,Unnamed: 1_level_1
2023-01-03,123.632538
2023-01-04,124.907707
2023-01-05,123.583099
2023-01-06,128.130234
2023-01-09,128.654160
...,...
2025-02-10,227.649994
2025-02-11,232.619995
2025-02-12,236.869995
2025-02-13,241.529999


In [20]:
# Downsample daily closes to one value per calendar month: the mean of that
# month's closes, labelled with the month-end date.
# An offset object is used instead of the 'M' string alias, which pandas
# deprecated in 2.2 in favour of 'ME'. MonthEnd() produces the same bins and
# labels and is accepted by both older and newer pandas versions, so the cell
# no longer depends on the deprecation warning being suppressed.
monthly_data = stock_data['Close'].resample(pd.offsets.MonthEnd()).mean()
print("\nMonthly Downsampled Data:\n", monthly_data)


Monthly Downsampled Data:
 Ticker            AAPL
Date                  
2023-01-31  134.218446
2023-02-28  149.376950
2023-03-31  153.417609
2023-04-30  163.397963
2023-05-31  171.039937
2023-06-30  182.695794
2023-07-31  190.753942
2023-08-31  179.680440
2023-09-30  175.714644
2023-10-31  173.397760
2023-11-30  184.692049
2023-12-31  193.148854
2024-01-31  186.603930
2024-02-29  183.835622
2024-03-31  171.884888
2024-04-30  168.807459
2024-05-31  185.584776
2024-06-30  205.572247
2024-07-31  223.846307
2024-08-31  221.073106
2024-09-30  223.266615
2024-10-31  229.550871
2024-11-30  227.498904
2024-12-31  249.049032
2025-01-31  234.329359
2025-02-28  233.613246


**2. Upsampling**

In [21]:
# Upsample daily closes to an hourly grid and fill the new rows by linear
# interpolation between consecutive daily observations. The intermediate
# values are therefore straight-line estimates, not traded prices, and the
# grid also covers the days that have no observation in the daily data.
# An offset object is used instead of the 'H' string alias, which pandas
# deprecated in 2.2 in favour of 'h'. Hour() is equivalent and is accepted by
# both older and newer pandas versions.
hourly_data = stock_data['Close'].resample(pd.offsets.Hour()).interpolate(method='linear')
print("\nHourly Upsampled Data:\n", hourly_data)


Hourly Upsampled Data:
 Ticker                     AAPL
Date                           
2023-01-03 00:00:00  123.632538
2023-01-03 01:00:00  123.685670
2023-01-03 02:00:00  123.738802
2023-01-03 03:00:00  123.791934
2023-01-03 04:00:00  123.845066
...                         ...
2025-02-13 20:00:00  244.088338
2025-02-13 21:00:00  244.216255
2025-02-13 22:00:00  244.344172
2025-02-13 23:00:00  244.472089
2025-02-14 00:00:00  244.600006

[18553 rows x 1 columns]
