In [17]:
import pandas as pd
import numpy as np
import warnings
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error

In [18]:
# Preprocess the dataset
df = pd.read_csv("31-40.csv") 
df["cit_2017"] = df["cit_2017"].astype(int)
df_lin = df.copy()
df_log = df.copy()

In [19]:
#classify a numerical value into categories ( Low: 0, Medium: 1, High: 2 )

label_values = []
for i in range(len(df)):
    value = df_log['cit_2022'][i]/df_log['cit_2021'][i]
    if value < 1.05:
        label_values.append(0)
    elif value > 1.06 and value < 1.15:
        label_values.append(1)
    else:
        label_values.append(2)

df_log['Label'] = label_values

data_log = df_log[['cit_2017', 'cit_2018', 'cit_2019', 'cit_2020', 'cit_2021','cit_2022']].copy()
label_log = df_log['Label'].copy()


# For Regression
data_lin = df_lin[['cit_2017', 'cit_2018', 'cit_2019', 'cit_2020', 'cit_2021']].copy()
label_lin = df_lin['cit_2022'].copy()

In [20]:
# Scaling


# Train-Test Split (80-20)
# Classification
X_train_c, X_test_c, Y_train_c, Y_test_c = train_test_split(data_log, label_log, test_size=0.2, random_state=9)
scaler_x_c = MinMaxScaler()
X_train_scaled_c = scaler_x_c.fit_transform(X_train_c)
X_test_scaled_c = scaler_x_c.transform(X_test_c)  



In [21]:
# Regression
X_train_r, X_test_r, Y_train_r, Y_test_r = train_test_split(data_lin, label_lin, test_size=0.2, random_state=9)
scaler_x_r = MinMaxScaler()
X_train_scaled_r = scaler_x_r.fit_transform(X_train_r)  
X_test_scaled_r = scaler_x_r.transform(X_test_r) 

In [22]:
# Performance
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import mean_absolute_error, accuracy_score

# Train the model on the dataset ( Logistic Regression )
LogR = LogisticRegression(max_iter=100) 
LogR.fit(X_train_scaled_c, Y_train_c)
pred_log = LogR.predict(X_test_scaled_c)
accuracy_log = accuracy_score(Y_test_c, pred_log)
print(f'Logistic Regression Accuracy: {accuracy_log*100:.2f}%')

# Train the model on the dataset ( Linear Regression )
LinR = LinearRegression()
LinR.fit(X_train_scaled_r, Y_train_r)
pred_lin = LinR.predict(X_test_scaled_r)
mse_lin = mean_absolute_error(Y_test_r, pred_lin)
print(f'Linear Regression MAE: {mse_lin:.2f}')

Logistic Regression Accuracy: 60.00%
Linear Regression MAE: 81.12


In [23]:
# function Truncate prediction values
def trunc(values, decs=0):
    return np.trunc(values*10**decs)/(10**decs)

pred = trunc(pred_lin, decs=2)

In [24]:
# Create output DataFrames
linear_reg_op = X_test_r.copy()
linear_reg_op["Actual_cit_2022"] = Y_test_r
linear_reg_op["Predicted_cit_2022"] = pred
linear_reg_op

Unnamed: 0,cit_2017,cit_2018,cit_2019,cit_2020,cit_2021,Actual_cit_2022,Predicted_cit_2022
75,105,127,80,95,91,84,96.89
42,292,510,672,928,1289,1456,1587.74
46,125,143,190,234,291,275,358.8
68,192,206,189,208,213,253,229.31
3,46,76,75,67,59,58,59.47
39,92,107,137,92,98,121,101.2
23,701,683,773,813,1023,1039,1188.09
20,259,248,177,149,167,151,173.8
70,255,238,204,194,184,155,184.56
73,100,109,170,169,148,184,153.51


In [25]:
# Create output DataFrames
logistic_reg_op = X_test_c.copy()
logistic_reg_op["Actual_label"] = Y_test_c
logistic_reg_op["Predicted_label"] = pred_log
logistic_reg_op

Unnamed: 0,cit_2017,cit_2018,cit_2019,cit_2020,cit_2021,cit_2022,Actual_label,Predicted_label
75,105,127,80,95,91,84,0,0
42,292,510,672,928,1289,1456,1,0
46,125,143,190,234,291,275,0,0
68,192,206,189,208,213,253,2,0
3,46,76,75,67,59,58,0,0
39,92,107,137,92,98,121,2,0
23,701,683,773,813,1023,1039,0,0
20,259,248,177,149,167,151,0,0
70,255,238,204,194,184,155,0,0
73,100,109,170,169,148,184,2,0


In [26]:
# Save to CSV files
linear_reg_op.to_csv('./linear_reg_op.csv', index=False)
logistic_reg_op.to_csv('./logistic_reg_op.csv', index=False)