In [None]:
import polars as pl
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score

In [163]:
df = pl.read_json('../dataset/untitled.json', infer_schema_length=None)

In [164]:
#display width and height of the dataset
df.shape

(100000, 85)

In [165]:
#select features like meterCategory, status, paymentChannel, meterDistrict, meterRegion, amount, accountNumber
#for analysis
df = df.select(pl.col('meterCategory', 'status', 'paymentChannel', 'meterRegion', 'meterDistrict', 'amount', 'accountNumber', 'createdAt')).filter(pl.col('status') == 'success')

In [166]:
df = df.drop_nulls()

In [167]:
#display number of empty cells in the dataset
df.null_count()

meterCategory,status,paymentChannel,meterRegion,meterDistrict,amount,accountNumber,createdAt
u32,u32,u32,u32,u32,u32,u32,u32
0,0,0,0,0,0,0,0


In [168]:
#drop rows with no data values
df = df.filter(~pl.all_horizontal(pl.all().is_null()))

In [169]:
#convert createdAt str to datetime
df = df.with_columns(pl.col("createdAt")
    .str.slice(0, 26)  # Keep up to microseconds: "DD-MM-YYYY HH:MM:SS.ssssss"
    .str.strptime(pl.Datetime, "%d-%m-%Y %H:%M:%S%.f"),  # Parse as datetime
)


In [170]:
#display least amount of transaction 
df.select(pl.col('amount')).min()

amount
f64
0.1


In [171]:
#display total revenue for the dataset
df.select(pl.col('amount').sum())

amount
f64
10510000.0


In [172]:
#create a new field for hour from createdAt
df = df.with_columns(pl.col('createdAt').dt.hour().alias('h'))

#get the total amount for each hour
df = df.group_by('h').agg(
    pl.col("amount").sum().alias("y")  # Sum amounts for each hour
).sort(['h', 'y'])

In [None]:
df = df.with_columns(pl.col('y').shift(1).alias('lag1'), 
                     pl.col('y').shift(1).alias('lag2'),
                     pl.col('y').shift(1).alias('lag3'))
df

h,y,lag1,lag2,lag3
i8,f64,f64,f64,f64
0,103082.32,,,
1,121367.0,103082.32,103082.32,103082.32
2,83712.910004,121367.0,121367.0,121367.0
3,58158.649997,83712.910004,83712.910004,83712.910004
4,57942.110054,58158.649997,58158.649997,58158.649997
…,…,…,…,…
17,192255.500009,184529.5,184529.5,184529.5
18,231874.600006,192255.500009,192255.500009,192255.500009
19,193449.309998,231874.600006,231874.600006,231874.600006
20,167228.1,193449.309998,193449.309998,193449.309998


In [185]:
X = df.select(['lag1', 'lag2', 'lag3']).to_numpy()
Y = df.select('y').to_numpy()

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.5, shuffle=False)

In [None]:
#train model
model1 = RandomForestRegressor(42)
model1.fit(X_train, Y_train)

In [190]:
#prediction
Y_prediction = model1.predict(X_test)
Y_prediction

array([1181344.4446448 , 1181344.4446448 , 1014855.0890596 ,
        179704.27362924,  288763.07025375,  288763.07025375,
        288763.07025375,  288763.07025375,  179704.27362924,
        288763.07025375,  279687.60454356])