In [1]:
from pymongo import MongoClient
import pandas as pd
import warnings; warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
import hvplot.pandas
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, accuracy_score
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF
from sklearn.svm import SVR
from sklearn.preprocessing import LabelEncoder, PolynomialFeatures, StandardScaler
from sklearn.pipeline import make_pipeline
import numpy as np
import pickle
from sklearn.gaussian_process.kernels import ConstantKernel as C, RBF

# EXPENSE

In [2]:
client_ak = MongoClient('mongodb+srv://city_toronto:project3@cluster0.gt72z8e.mongodb.net/')
db_ak = client_ak['city_toronto']

collections_ak = db_ak['cat_expense_2014_2023']

# Convert MongoDB cursor to DataFrame
cursor = collections_ak.find()  # Assuming you want to retrieve all documents
merged_df_ak_final_exp = pd.DataFrame(list(cursor))

# Display the first few rows of the DataFrame
merged_df_ak_final_exp.drop(columns='_id', inplace=True)
merged_df_ak_final_exp = merged_df_ak_final_exp[['Category Name'] + [col for col in merged_df_ak_final_exp.columns if col != 'Category Name']]
merged_df_ak_final_exp

Unnamed: 0,Category Name,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023
0,Salaries And Benefits,5088.15,5306.23,5442.21,5526.44,5688.31,5928.05,6092.48,6232.59,6596.59,6899.58
1,Other Expenditures,2171.3,2180.79,2224.25,2378.67,2537.57,2711.14,2611.53,2835.06,2776.24,3063.18
2,Service And Rent,1703.93,1749.06,1756.78,1833.03,2026.52,2082.85,2178.26,2342.04,2632.93,3071.14
3,Contribution To Reserves/Reserve Funds,902.51,1028.17,1068.73,1265.31,1339.39,1374.33,1492.21,1569.5,679.06,817.74
4,Materials & Supplies,690.83,718.97,718.52,739.26,745.26,735.02,734.46,813.28,810.37,839.36
5,Contribution To Capital,245.46,245.83,220.98,296.06,354.83,343.45,343.46,349.54,347.76,345.49
6,Inter-Divisional Charges,332.35,296.07,298.82,301.55,308.22,316.15,156.99,0.0,312.48,319.73
7,Equipment,61.02,64.64,64.92,61.37,60.04,62.57,63.06,63.06,69.63,74.92


In [3]:
# Calculate year-wise total
sub_cat_exp_year = merged_df_ak_final_exp.drop(columns='Category Name').sum().reset_index()

# Rename columns
sub_cat_exp_year.columns = ['Year', 'Total']

# Display the result
sub_cat_exp_year_df = pd.DataFrame(sub_cat_exp_year)
#sub_cat_exp_year_df.to_csv('sub_cat_exp_year_data.csv', index=False)

In [4]:
sub_cat_exp_year_df

Unnamed: 0,Year,Total
0,2014,11195.55
1,2015,11589.76
2,2016,11795.21
3,2017,12401.69
4,2018,13060.14
5,2019,13553.56
6,2020,13672.45
7,2021,14205.07
8,2022,14225.06
9,2023,15431.14


# APPLYING REGRESSION ALGORITHMS FOR YEARLY DATA --> sub_cat_exp_year_df

# LINEAR REGRESSION - ACCURACY 90% APPROVED

In [5]:
# Prepare the data for linear regression
X = sub_cat_exp_year_df[['Year']]  # Features (year)
y = sub_cat_exp_year_df['Total']  # Target variable (millions)

# Split the data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the linear regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Save the trained model using pickle
with open('trained_models/linear_regression_model.pkl', 'wb') as f:
    pickle.dump(model, f)

print("Linear regression model saved successfully.")
# Make predictions on the test set
y_pred = model.predict(X_test)

# Calculate Mean Absolute Error (MAE)
mae = mean_absolute_error(y_test, y_pred)

# Calculate Mean Squared Error (MSE)
mse = mean_squared_error(y_test, y_pred)

# Calculate R-squared
r2 = r2_score(y_test, y_pred)

print("Mean Absolute Error (MAE):", mae)
print("Mean Squared Error (MSE):", mse)
print("R-squared:", r2)

# Load the trained linear regression model from the pickle file
with open('trained_models/linear_regression_model.pkl', 'rb') as f:
    model = pickle.load(f)

# Generate data for the next 5 years
next_years = pd.DataFrame({'Year': range(2024, 2029)})

# Make predictions for the next 5 years
predictions = model.predict(next_years)

# Print or return the predictions
for year, prediction in zip(next_years['Year'], predictions):
    print(f"Year {year}: Expense Prediction {prediction:.2f} millions")

Linear regression model saved successfully.
Mean Absolute Error (MAE): 326.00159482759227
Mean Squared Error (MSE): 172295.84578163998
R-squared: 0.9007627608035115
Year 2024: Expense Prediction 15747.23 millions
Year 2025: Expense Prediction 16216.85 millions
Year 2026: Expense Prediction 16686.46 millions
Year 2027: Expense Prediction 17156.08 millions
Year 2028: Expense Prediction 17625.69 millions


# POLYNOMIAL REGRESSION - ACCURACY 95% APPROVED

In [6]:
from sklearn.linear_model import Ridge  # Add this import statement

# Load your data (Assuming you have loaded sub_cat_exp_year_df)

# Prepare the data for polynomial regression
X = sub_cat_exp_year_df[['Year']]  # Features (year)
y = sub_cat_exp_year_df['Total']  # Target variable (millions)

# Split the data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the pipeline including Polynomial Features and Ridge Regression
model = make_pipeline(StandardScaler(), PolynomialFeatures(degree=3), Ridge(alpha=2.2))

# Train the model
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Calculate Mean Absolute Error (MAE)
mae = mean_absolute_error(y_test, y_pred)

# Calculate Mean Squared Error (MSE)
mse = mean_squared_error(y_test, y_pred)

# Calculate R-squared
r2 = r2_score(y_test, y_pred)

print("Mean Absolute Error (MAE):", mae)
print("Mean Squared Error (MSE):", mse)
print("R-squared:", r2)

# Save the trained model using pickle
with open('trained_models/polynomial_regression_model.pkl', 'wb') as f:
    pickle.dump(model, f)

print("Polynomial regression model saved successfully.")

# Load the trained polynomial regression model from the pickle file
with open('trained_models/polynomial_regression_model.pkl', 'rb') as f:
    model = pickle.load(f)

# Generate data for the next 5 years starting from 2025
next_years = pd.DataFrame({'Year': range(2025, 2030)})

# Make predictions for the next 5 years
predictions = model.predict(next_years)

# Print or return the predictions
for year, prediction in zip(next_years['Year'], predictions):
    print(f"Year {year}: Expense Prediction {prediction:.2f} millions")




Mean Absolute Error (MAE): 285.7672424209477
Mean Squared Error (MSE): 82575.55870116423
R-squared: 0.9524389550227662
Polynomial regression model saved successfully.
Year 2025: Expense Prediction 18435.68 millions
Year 2026: Expense Prediction 20738.64 millions
Year 2027: Expense Prediction 23671.27 millions
Year 2028: Expense Prediction 27315.98 millions
Year 2029: Expense Prediction 31755.19 millions


# Gaussian Process Regression--ACCURACY 82% NOT APPROVED

In [7]:

# Prepare the data for Gaussian Process Regression
X = sub_cat_exp_year_df[['Year']]  # Features (year)
y = sub_cat_exp_year_df['Total']  # Target variable (millions)

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Define the kernel for Gaussian Process Regression
kernel = C(1.0, (1e-3, 1e3)) * RBF(length_scale=1.0, length_scale_bounds=(1e-2, 1e2))

# Initialize Gaussian Process Regression model
model = GaussianProcessRegressor(kernel=kernel, n_restarts_optimizer=20, random_state=42)

# Fit the model to the training data
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred, y_std = model.predict(X_test, return_std=True)

# Calculate metrics
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Print metrics
print("Mean Absolute Error (MAE):", mae)
print("Mean Squared Error (MSE):", mse)
print("R-squared:", r2)


# Save the trained model using pickle
with open('GaussianProcessRegressor_model.pkl', 'wb') as f:
    pickle.dump(model, f)

print("GaussianProcess regression model saved successfully.")


# Load the trained linear regression model from the pickle file
with open('GaussianProcessRegressor_model.pkl', 'rb') as f:
    model = pickle.load(f)

# Generate data for the next 5 years
next_years = pd.DataFrame({'Year': range(2024, 2029)})

# Make predictions for the next 5 years
predictions = model.predict(next_years)

# Print or return the predictions
for year, prediction in zip(next_years['Year'], predictions):
    print(f"Year {year}: Expense Prediction {prediction:.2f} millions")


Mean Absolute Error (MAE): 340.6940414323981
Mean Squared Error (MSE): 225863.9745037624
R-squared: 0.8193446107186365
GaussianProcess regression model saved successfully.
Year 2024: Expense Prediction 14521.77 millions
Year 2025: Expense Prediction 12090.44 millions
Year 2026: Expense Prediction 8762.37 millions
Year 2027: Expense Prediction 5498.34 millions
Year 2028: Expense Prediction 2985.44 millions


# SVM Regression--ACCURACY 98% NOT APPROVED

In [8]:

# Prepare the data for SVM Regression
X = sub_cat_exp_year_df[['Year']]  # Features (year)
y = sub_cat_exp_year_df['Total']  # Target variable (millions)

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Initialize and train the SVM Regression model
model = SVR(kernel='rbf', C=1000, epsilon=0.1, gamma='scale')  # You can adjust these hyperparameters
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Calculate metrics
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Print metrics
print("Mean Absolute Error (MAE):", mae)
print("Mean Squared Error (MSE):", mse)
print("R-squared:", r2)


# Save the trained model using pickle
with open('SVM_regression_model.pkl', 'wb') as f:
    pickle.dump(model, f)

print("SVM Regression model saved successfully.")

# Load the trained linear regression model from the pickle file
with open('SVM_regression_model.pkl', 'rb') as f:
    model = pickle.load(f)

# Generate data for the next 5 years
next_years = pd.DataFrame({'Year': range(2024, 2029)})

# Make predictions for the next 5 years
predictions = model.predict(next_years)

# Print or return the predictions
for year, prediction in zip(next_years['Year'], predictions):
    print(f"Year {year}: Expense Prediction {prediction:.2f} millions")

Mean Absolute Error (MAE): 131.59974004369965
Mean Squared Error (MSE): 21659.258140535403
R-squared: 0.9826760256055851
SVM Regression model saved successfully.
Year 2024: Expense Prediction 14052.39 millions
Year 2025: Expense Prediction 13670.60 millions
Year 2026: Expense Prediction 13326.65 millions
Year 2027: Expense Prediction 13108.21 millions
Year 2028: Expense Prediction 13004.76 millions


# APPLYING REGRESSION ALGORITHMS FOR CATEGORICAL DATA --> merged_df_ak_final_exp

Final 10 years expense data ---> merged_df_ak_final_exp

Calculationg the MAE/RMSE/R-squared to check model reliability
Considerign each category as a separate unit to do the calculations
And Calculating which model works best on each category

In [9]:
merged_df_ak_final_exp.columns.to_list()

['Category Name',
 '2014',
 '2015',
 '2016',
 '2017',
 '2018',
 '2019',
 '2020',
 '2021',
 '2022',
 '2023']

In [10]:
#Linear Regression___APPROVED FOR Materials & Supplies,Service And Rent,Other Expenditures,Salaries And Benefits

In [11]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Create empty list to store reliability scores for each category
reliability_scores_lr = []
# Create an empty DataFrame to store predictions
predictions_df_lr = pd.DataFrame(columns=['Category', 2024, 2025, 2026])

# Loop through each category
for category in merged_df_ak_final_exp['Category Name']:
    # Step 1: Extract data for the current category
    category_data = merged_df_ak_final_exp[merged_df_ak_final_exp['Category Name'] == category]
    
    # Step 2: Prepare data for linear regression
    years = category_data.columns[1:].astype(int).values
    expenses = category_data.iloc[0, 1:].values.astype(float)
    
    # Reshape the data for fitting into a linear regression model
    X = years.reshape(-1, 1)  # Features (years)
    y = expenses.reshape(-1, 1)  # Target variable (expenses)
    
    # Step 3: Train the linear regression model
    model = LinearRegression()
    model.fit(X, y)
    
    # Step 4: Make predictions for future years (2024 and beyond)
    future_years = np.array([2024, 2025, 2026]).reshape(-1, 1)
    predicted_expenses = model.predict(future_years)
    
    # Store predictions in DataFrame
    category_predictions_df = pd.DataFrame({
        'Category': [category],
        2024: predicted_expenses[0],
        2025: predicted_expenses[1],
        2026: predicted_expenses[2]
    })
    predictions_df_lr = pd.concat([predictions_df_lr, category_predictions_df], ignore_index=True)

    # Calculate reliability metrics
    y_pred = model.predict(X)
    mae = mean_absolute_error(y, y_pred)
    rmse = mean_squared_error(y, y_pred, squared=False)
    r2 = r2_score(y, y_pred)
    
    # Append reliability scores to the list
    reliability_scores_lr.append({
        'Algorithm': 'Linear Regression',
        'Category': category,
        'MAE': mae,
        'RMSE': rmse,
        'R-squared': r2
        
    })

# Convert the list of dictionaries to a DataFrame
reliability_scores_lr_df = pd.DataFrame(reliability_scores_lr)



In [12]:
reliability_scores_lr_df


Unnamed: 0,Algorithm,Category,MAE,RMSE,R-squared
0,Linear Regression,Salaries And Benefits,81.110667,89.383075,0.9736714
1,Linear Regression,Other Expenditures,65.824667,73.026393,0.9365809
2,Linear Regression,Service And Rent,121.2888,147.294932,0.8761976
3,Linear Regression,Contribution To Reserves/Reserve Funds,254.414242,284.133715,6.140183e-07
4,Linear Regression,Materials & Supplies,14.182218,19.121886,0.8298311
5,Linear Regression,Contribution To Capital,21.204436,27.608341,0.6938131
6,Linear Regression,Inter-Divisional Charges,66.766836,94.045861,0.1119891
7,Linear Regression,Equipment,2.982788,3.298336,0.4111651


In [13]:
predictions_df_lr

Unnamed: 0,Category,2024,2025,2026
0,Salaries And Benefits,6920.901333,7110.144667,7299.388
1,Other Expenditures,3086.348667,3184.053333,3281.758
2,Service And Rent,2887.997333,3024.423394,3160.849455
3,Contribution To Reserves/Reserve Funds,1154.121333,1154.198848,1154.276364
4,Materials & Supplies,835.390667,850.092061,864.793455
5,Contribution To Capital,388.866,403.335091,417.804182
6,Inter-Divisional Charges,200.284,188.656364,177.028727
7,Equipment,69.800667,70.760242,71.719818


In [14]:
#GaussianProcessRegressor___APPROVED FOR Salaries And Benefits,Service And Rent,Contribution To Capital,Other Expenditures,Materials & Supplies

In [15]:
# Create empty DataFrames to store predictions and reliability scores
predictions_df_gpr = pd.DataFrame(columns=['Category'])
reliability_scores_gpr = []

# Loop through each category
for category in merged_df_ak_final_exp['Category Name']:
    # Step 1: Extract data for the current category
    category_data = merged_df_ak_final_exp[merged_df_ak_final_exp['Category Name'] == category]
    
    # Step 2: Prepare data for Gaussian Process Regression
    years = category_data.columns[1:].astype(int).values
    expenses = category_data.iloc[0, 1:].values.astype(float)
    
    # Reshape the data for fitting into the Gaussian Process Regression model
    X = years.reshape(-1, 1)  # Features (years)
    y = expenses  # Target variable (expenses) - no need to reshape
    
    # Step 3: Define the kernel for Gaussian Process Regression
    kernel = 1.0 * RBF(length_scale=1.0)
    
    # Step 4: Train the Gaussian Process Regression model
    model_gpr = GaussianProcessRegressor(kernel=kernel, alpha=0.05, normalize_y=True)
    model_gpr.fit(X, y)
    
    # Step 5: Make predictions for future years (2024 and beyond) using the Gaussian Process Regression model
    future_years = np.array([2024, 2025, 2026]).reshape(-1, 1)
    predicted_expenses_gpr, _ = model_gpr.predict(future_years, return_std=True)
    
    # Calculate reliability scores
    mae = mean_absolute_error(expenses, model_gpr.predict(X))
    rmse = mean_squared_error(expenses, model_gpr.predict(X), squared=False)
    r2 = r2_score(expenses, model_gpr.predict(X))
    
    # Append reliability scores to the list
    reliability_scores_gpr.append({
        'Algorithm': 'Gaussian Process Regressor','Category': category, 'MAE': mae, 'RMSE': rmse, 'R-squared': r2})
    
    # Create a DataFrame to store predictions for the current category
    category_predictions_df = pd.DataFrame({
        'Category': [category],
        '2024': predicted_expenses_gpr[0],
        '2025': predicted_expenses_gpr[1],
        '2026': predicted_expenses_gpr[2]
    })
    
    # Append the predictions to the overall predictions DataFrame
    predictions_df_gpr = pd.concat([predictions_df_gpr, category_predictions_df], ignore_index=True)

# Convert the list of reliability scores into a DataFrame
reliability_scores_gpr_df = pd.DataFrame(reliability_scores_gpr)

# Print the predictions DataFrame
predictions_df_gpr




Unnamed: 0,Category,2024,2025,2026
0,Salaries And Benefits,7004.800391,7199.717177,7372.706126
1,Other Expenditures,3051.203122,3112.421138,3159.840872
2,Service And Rent,3219.556801,3446.216374,3609.25433
3,Contribution To Reserves/Reserve Funds,1106.459278,1151.480443,1153.668087
4,Materials & Supplies,812.727446,769.505613,755.903922
5,Contribution To Capital,332.250888,318.788153,311.511685
6,Inter-Divisional Charges,269.104905,264.362775,264.236756
7,Equipment,73.084922,68.505916,65.612151


In [16]:
reliability_scores_gpr_df = reliability_scores_gpr_df.sort_values(by='R-squared', ascending=False)
reliability_scores_gpr_df

Unnamed: 0,Algorithm,Category,MAE,RMSE,R-squared
6,Gaussian Process Regressor,Inter-Divisional Charges,3.327759,5.558897,0.996897
3,Gaussian Process Regressor,Contribution To Reserves/Reserve Funds,11.677379,17.260754,0.99631
7,Gaussian Process Regressor,Equipment,0.245632,0.275067,0.995905
4,Gaussian Process Regressor,Materials & Supplies,4.356209,4.882896,0.988904
0,Gaussian Process Regressor,Salaries And Benefits,50.717875,61.23764,0.987642
5,Gaussian Process Regressor,Contribution To Capital,4.610174,5.82222,0.986383
2,Gaussian Process Regressor,Service And Rent,48.031897,60.789769,0.978913
1,Gaussian Process Regressor,Other Expenditures,63.789283,72.704672,0.937138


In [17]:
#SVM regression

In [18]:
# Create empty DataFrames to store predictions and reliability scores
predictions_df_svm = pd.DataFrame(columns=['Category'])
reliability_scores_svm = []

# Loop through each category
for category in merged_df_ak_final_exp['Category Name']:
    # Step 1: Extract data for the current category
    category_data = merged_df_ak_final_exp[merged_df_ak_final_exp['Category Name'] == category]
    
    # Step 2: Prepare data for SVM regression
    years = category_data.columns[1:].astype(int).values
    expenses = category_data.iloc[0, 1:].values.astype(float)
    
    # Reshape the data for fitting into the SVM model
    X = years.reshape(-1, 1)  # Features (years)
    y = expenses.reshape(-1, 1)  # Target variable (expenses)
    
    # Step 3: Train the SVM regression model
    model_svm = SVR(kernel='rbf', C=500, gamma='auto')  # You can adjust parameters like C and gamma
    model_svm.fit(X, y.ravel())
    
    # Step 4: Make predictions for the training data
    y_pred = model_svm.predict(X)
    
    # Calculate performance metrics
    mae = mean_absolute_error(y, y_pred)
    rmse = mean_squared_error(y, y_pred, squared=False)
    r2 = r2_score(y, y_pred)
    
    # Append reliability scores to the list
    reliability_scores_svm.append({'Algorithm': 'SVM regression','Category': category, 'MAE': mae, 'RMSE': rmse, 'R-squared': r2})

    # Convert the list of dictionaries to a DataFrame
    reliability_scores_df_svm = pd.DataFrame(reliability_scores_svm)
    
    # Step 5: Make predictions for future years (2024 and beyond) using the SVM model
    future_years = np.array([2024, 2025, 2026]).reshape(-1, 1)
    predicted_expenses_svm = model_svm.predict(future_years)
    
    # Print predicted expenses for future years using the SVM model
    #for year, expense in zip(future_years.flatten(), predicted_expenses_svm):
        #print(f"Predicted expense for year {year} in category {category}: {expense:.2f}")
    
    # Append the predictions to the overall predictions DataFrame
    category_predictions_df = pd.DataFrame({
        'Category': [category],
        '2024': predicted_expenses_svm[0],
        '2025': predicted_expenses_svm[1],
        '2026': predicted_expenses_svm[2]
    })
    predictions_df_svm = pd.concat([predictions_df_svm, category_predictions_df], ignore_index=True)



In [19]:
# Print the predictions DataFrame
predictions_df_svm



Unnamed: 0,Category,2024,2025,2026
0,Salaries And Benefits,6054.304746,5870.412202,5861.254426
1,Other Expenditures,2734.310199,2561.107581,2552.02243
2,Service And Rent,2298.51647,2118.201979,2109.068438
3,Contribution To Reserves/Reserve Funds,1076.823982,1137.460985,1140.208111
4,Materials & Supplies,785.955787,756.847666,755.331536
5,Contribution To Capital,319.555796,308.752765,308.205856
6,Inter-Divisional Charges,269.221937,266.981814,266.999379
7,Equipment,68.207753,64.921862,64.752695


In [20]:
# Print the reliability scores DataFrame
reliability_scores_df_svm

Unnamed: 0,Algorithm,Category,MAE,RMSE,R-squared
0,SVM regression,Salaries And Benefits,52.284006,122.979971,0.950159
1,SVM regression,Other Expenditures,3.837483,11.850501,0.99833
2,SVM regression,Service And Rent,34.961522,110.273991,0.93061
3,SVM regression,Contribution To Reserves/Reserve Funds,16.176244,36.430811,0.98356
4,SVM regression,Materials & Supplies,0.094281,0.095775,0.999996
5,SVM regression,Contribution To Capital,0.100106,0.100107,0.999996
6,SVM regression,Inter-Divisional Charges,0.100028,0.100028,0.999999
7,SVM regression,Equipment,0.100082,0.100082,0.999458


# REVENUE

In [21]:
client_ak = MongoClient('mongodb+srv://city_toronto:project3@cluster0.gt72z8e.mongodb.net/')
db_ak = client_ak['city_toronto']

collections_ak_rev = db_ak['cat_revenue_2014_2023']

# Convert MongoDB cursor to DataFrame
cursor = collections_ak_rev.find()  # Assuming you want to retrieve all documents
merged_df_ak_final_rev = pd.DataFrame(list(cursor))

# Display the first few rows of the DataFrame
merged_df_ak_final_rev.drop(columns='_id', inplace=True)
merged_df_ak_final_rev = merged_df_ak_final_rev[['Category Name'] + [col for col in merged_df_ak_final_rev.columns if col != 'Category Name']]


In [22]:
# Calculate year-wise total
cat_rev_year = merged_df_ak_final_rev.drop(columns='Category Name').sum().reset_index()

# Rename columns
cat_rev_year.columns = ['Year', 'Total']

# Display the result
cat_rev_year_df = pd.DataFrame(cat_rev_year)
cat_rev_year_df.to_csv('cat_rev_year_data.csv', index=False)
cat_rev_year_df

Unnamed: 0,Year,Total
0,2014,10993.58
1,2015,11463.67
2,2016,11683.36
3,2017,12286.6
4,2018,12933.14
5,2019,13414.57
6,2020,13530.82
7,2021,14205.05
8,2022,15228.41
9,2023,16467.96


# # APPLYING REGRESSION ALGORITHMS FOR YEARLY DATA --> cat_rev_year_df

# Linear Regression- Accuracy 98%--APPROVED

In [23]:
# Prepare the data for linear regression
X = cat_rev_year_df[['Year']]  # Features (year)
y = cat_rev_year_df['Total']  # Target variable (millions)

# Split the data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the linear regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Save the trained model using pickle
with open('trained_models/linear_regression_model_rev.pkl', 'wb') as f:
    pickle.dump(model, f)

print("Linear regression model saved successfully.")

# Make predictions on the test set
y_pred = model.predict(X_test)

# Calculate Mean Absolute Error (MAE)
mae = mean_absolute_error(y_test, y_pred)

# Calculate Mean Squared Error (MSE)
mse = mean_squared_error(y_test, y_pred)

# Calculate R-squared
r2 = r2_score(y_test, y_pred)

print("Mean Absolute Error (MAE):", mae)
print("Mean Squared Error (MSE):", mse)
print("R-squared:", r2)



Linear regression model saved successfully.
Mean Absolute Error (MAE): 156.65499999987514
Mean Squared Error (MSE): 37036.44397173323
R-squared: 0.9895475211826709


In [24]:
# Load the trained linear regression model from the pickle file
with open('trained_models/linear_regression_model_rev.pkl', 'rb') as f:
    model = pickle.load(f)

# Generate data for the next 5 years
next_years = pd.DataFrame({'Year': range(2024, 2029)})

# Make predictions for the next 5 years
predictions = model.predict(next_years)

# Print or return the predictions
for year, prediction in zip(next_years['Year'], predictions):
    print(f"Year {year}: Revenue Prediction {prediction:.2f} millions")

Year 2024: Revenue Prediction 16323.06 millions
Year 2025: Revenue Prediction 16892.81 millions
Year 2026: Revenue Prediction 17462.57 millions
Year 2027: Revenue Prediction 18032.33 millions
Year 2028: Revenue Prediction 18602.09 millions


# GaussianProcessRegressor- Accuracy 98%-NOT APPROVED

In [25]:

# Prepare the data for Gaussian Process Regression
X = cat_rev_year_df[['Year']]  # Features (year)
y = cat_rev_year_df['Total']  # Target variable (millions)

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Define the kernel for Gaussian Process Regression
kernel = C(1.0, (1e-3, 1e3)) * RBF(length_scale=1.0, length_scale_bounds=(1e-2, 1e2))

# Initialize Gaussian Process Regression model
model = GaussianProcessRegressor(kernel=kernel, n_restarts_optimizer=20, random_state=42)

# Fit the model to the training data
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred, y_std = model.predict(X_test, return_std=True)

# Calculate metrics
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Print metrics
print("Mean Absolute Error (MAE):", mae)
print("Mean Squared Error (MSE):", mse)
print("R-squared:", r2)


# Save the trained model using pickle
with open('GaussianProcessRegressor_model_rev.pkl', 'wb') as f:
    pickle.dump(model, f)

print("GaussianProcess regression model saved successfully.")


Mean Absolute Error (MAE): 148.26295168010742
Mean Squared Error (MSE): 24128.0223210164
R-squared: 0.9897903426067576
GaussianProcess regression model saved successfully.


In [26]:
# Load the trained linear regression model from the pickle file
with open('GaussianProcessRegressor_model_rev.pkl', 'rb') as f:
    model = pickle.load(f)

# Generate data for the next 5 years
next_years = pd.DataFrame({'Year': range(2024, 2029)})

# Make predictions for the next 5 years
predictions = model.predict(next_years)

# Print or return the predictions
for year, prediction in zip(next_years['Year'], predictions):
    print(f"Year {year}: Revenue Prediction {prediction:.2f} millions")

Year 2024: Revenue Prediction 16143.58 millions
Year 2025: Revenue Prediction 13957.02 millions
Year 2026: Revenue Prediction 10445.92 millions
Year 2027: Revenue Prediction 6726.58 millions
Year 2028: Revenue Prediction 3723.86 millions


# SVM Regression  Accuracy 98%--NOT APPROVED

In [27]:

# Prepare the data for SVM Regression
X = cat_rev_year_df[['Year']]  # Features (year)
y = cat_rev_year_df['Total']  # Target variable (millions)

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Initialize and train the SVM Regression model
model = SVR(kernel='rbf', C=10000, epsilon=0.1, gamma='scale')  # You can adjust these hyperparameters
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Calculate metrics
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Print metrics
print("Mean Absolute Error (MAE):", mae)
print("Mean Squared Error (MSE):", mse)
print("R-squared:", r2)


# Save the trained model using pickle
with open('SVM_regression_model_rev.pkl', 'wb') as f:
    pickle.dump(model, f)

print("SVM Regression model saved successfully.")



Mean Absolute Error (MAE): 193.66986624343068
Mean Squared Error (MSE): 40460.76012457211
R-squared: 0.9828792226214815
SVM Regression model saved successfully.


In [28]:
# Load the trained linear regression model from the pickle file
with open('SVM_regression_model_rev.pkl', 'rb') as f:
    model = pickle.load(f)

# Generate data for the next 5 years
next_years = pd.DataFrame({'Year': range(2024, 2029)})

# Make predictions for the next 5 years
predictions = model.predict(next_years)

# Print or return the predictions
for year, prediction in zip(next_years['Year'], predictions):
    print(f"Year {year}: Revenue Prediction {prediction:.2f} millions")

Year 2024: Revenue Prediction 16625.68 millions
Year 2025: Revenue Prediction 15886.07 millions
Year 2026: Revenue Prediction 14857.15 millions
Year 2027: Revenue Prediction 14072.92 millions
Year 2028: Revenue Prediction 13663.46 millions


# POLYNOMIAL REGRESSION-99% APPROVED

In [29]:

# Prepare the data for polynomial regression
X = cat_rev_year_df[['Year']]  # Features (year)
y = cat_rev_year_df['Total']  # Target variable (millions)

# Split the data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the pipeline including Polynomial Features and Ridge Regression
model_poly_rev = make_pipeline(StandardScaler(), PolynomialFeatures(degree=2), Ridge(alpha=0.25))

# Train the model
model_poly_rev.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model_poly_rev.predict(X_test)

# Calculate Mean Absolute Error (MAE)
mae = mean_absolute_error(y_test, y_pred)

# Calculate Mean Squared Error (MSE)
mse = mean_squared_error(y_test, y_pred)

# Calculate R-squared
r2 = r2_score(y_test, y_pred)

print("Mean Absolute Error (MAE):", mae)
print("Mean Squared Error (MSE):", mse)
print("R-squared:", r2)

# Save the trained model using pickle
with open('trained_models/polynomial_regression_model_rev.pkl', 'wb') as f:
    pickle.dump(model_poly_rev, f)

print("Polynomial regression model saved successfully.")

# Load the trained polynomial regression model from the pickle file
with open('trained_models/polynomial_regression_model.pkl', 'rb') as f:
    model_poly_rev = pickle.load(f)

# Generate data for the next 5 years starting from 2025
next_years = pd.DataFrame({'Year': range(2025, 2030)})

# Make predictions for the next 5 years
predictions = model_poly_rev.predict(next_years)

# Print or return the predictions
for year, prediction in zip(next_years['Year'], predictions):
    print(f"Year {year}: Expense Prediction {prediction:.2f} millions")




Mean Absolute Error (MAE): 51.355057471265354
Mean Squared Error (MSE): 2992.66552691749
R-squared: 0.9991554056039686
Polynomial regression model saved successfully.
Year 2025: Expense Prediction 18435.68 millions
Year 2026: Expense Prediction 20738.64 millions
Year 2027: Expense Prediction 23671.27 millions
Year 2028: Expense Prediction 27315.98 millions
Year 2029: Expense Prediction 31755.19 millions
