In [1]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, RobustScaler
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
import numpy as np

In [2]:
# Load the preprocessed dataset from the CSV file in the working directory.
data = pd.read_csv('processed_dataset.csv')

# Keep the frame as the last expression so the notebook renders it
# (pandas truncates long frames to their head and tail on display).
data

Unnamed: 0,city,population,total_cases,carried_over_from,opened_inside_year,finished,carried_over_to,time_since_event,population_density
0,İstanbul,15099946.0,2378387,1536743,841644,836525,1541862,3,2765.051456
1,Tekirdağ,1105759.0,127163,59444,67719,62603,64560,3,178.636349
2,Edirne,406215.0,50363,24888,25475,27064,23299,3,66.104963
3,Kırklareli,363245.0,31716,12732,18984,18939,12777,3,56.238582
4,Balıkesir,1238619.0,139589,66047,73542,74267,65322,3,84.935816
...,...,...,...,...,...,...,...,...,...
1048,Diyarbakır,1514793.0,89211,47222,41989,38964,50247,15,100.310774
1049,Mardin,736455.0,24841,7067,17774,15843,8998,15,83.878702
1050,Batman,497841.0,22390,8826,13564,13984,8406,15,111.199687
1051,Şırnak,430006.0,13615,3692,9923,9707,3908,15,60.752472


In [3]:
# Build the feature matrix and target, then split them into train and test sets.

# Name of the column the models are trained to predict.
TARGET_COLUMN = 'opened_inside_year'

# Columns removed from the feature matrix: the target itself plus the other
# case-count columns.
# NOTE(review): presumably these are dropped because they are recorded together
# with the target and would leak it into the features -- confirm.
NON_FEATURE_COLUMNS = [
    'opened_inside_year',
    'carried_over_from',
    'finished',
    'carried_over_to',
    'total_cases',
]

# Number of leading rows held out for testing.
# The first 81 rows are the year 2021.
N_TEST_ROWS = 81

# Fail early with a clear message instead of silently producing an empty
# training set that would only blow up later inside the grid search.
assert len(data) > N_TEST_ROWS, (
    f"expected more than {N_TEST_ROWS} rows, got {len(data)}"
)

# Drop non-feature columns from the DataFrame
X = data.drop(columns=NON_FEATURE_COLUMNS)

# Separate the target variable
y = data[TARGET_COLUMN]

print(X)
print(y)

# Split data into training and testing sets:
# the leading block is the test set, everything after it is the training set.
X_test, y_test = X.iloc[:N_TEST_ROWS], y.iloc[:N_TEST_ROWS]
X_train, y_train = X.iloc[N_TEST_ROWS:], y.iloc[N_TEST_ROWS:]

# The two parts must cover every row exactly once.
assert len(X_train) + len(X_test) == len(X)

            city  population  time_since_event  population_density
0       İstanbul  15099946.0                 3         2765.051456
1       Tekirdağ   1105759.0                 3          178.636349
2         Edirne    406215.0                 3           66.104963
3     Kırklareli    363245.0                 3           56.238582
4      Balıkesir   1238619.0                 3           84.935816
...          ...         ...               ...                 ...
1048  Diyarbakır   1514793.0                15          100.310774
1049      Mardin    736455.0                15           83.878702
1050      Batman    497841.0                15          111.199687
1051      Şırnak    430006.0                15           60.752472
1052       Siirt    303599.0                15           53.104600

[1053 rows x 4 columns]
0       841644
1        67719
2        25475
3        18984
4        73542
         ...  
1048     41989
1049     17774
1050     13564
1051      9923
1052      7862
Name: 

In [4]:
# Define the column transformer shared by every model pipeline below.
#   - 'population' and 'population_density' go through RobustScaler.
#   - 'time_since_event' goes through StandardScaler.
#   - 'city' is one-hot encoded.
# handle_unknown='ignore' makes the encoder emit an all-zero city vector for a
# city it did not see during fit, instead of raising. Without it, a single
# unseen city in a cross-validation fold or at predict time aborts the run.
# NOTE: the transformer name 'standart_scaler' (sic) is kept as-is because it
# is part of the pipeline's parameter paths.
preprocessor = ColumnTransformer(
    transformers=[
        ('robust_scaler', RobustScaler(), ['population', 'population_density']),
        ('standart_scaler', StandardScaler(), ['time_since_event']),
        ('cat', OneHotEncoder(handle_unknown='ignore'), ['city'])
    ]
)

In [5]:
# Decision tree: tune with a grid search on the training rows, then score the
# best pipeline on the held-out test rows.

# Define the pipeline.
# random_state is fixed so that repeated runs build the same trees; without it
# tie-breaking between equally good splits is random, so the grid-search winner
# and the reported score can change from run to run.
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', DecisionTreeRegressor(random_state=42))
])

# Define grid search parameters (the 'regressor__' prefix routes each entry to
# the pipeline step named 'regressor').
param_grid = {
    'regressor__max_depth': [10, 20, 30, None],
    'regressor__min_samples_split': [2, 5, 10],
    'regressor__min_samples_leaf': [1, 2, 4]
}

# Perform grid search with 5-fold cross-validation on the training set.
grid_search = GridSearchCV(model, param_grid, cv=5, verbose=2)
grid_search.fit(X_train, y_train)
best_model_decision_tree = grid_search.best_estimator_

# Predict on test set
y_pred = best_model_decision_tree.predict(X_test)

# Print predicted and real y-values for the first rows (at most 10, so a test
# set shorter than 10 rows does not raise an IndexError).
for i in range(min(10, len(y_pred))):
    print("Predicted:", y_pred[i], "\tReal:", y_test.iloc[i])

# Calculate absolute percentage error for each prediction.
# NOTE: a real value of 0 in y_test makes its term infinite (division by zero).
absolute_percentage_errors = np.abs((y_test - y_pred) / y_test)

# Calculate mean absolute percentage error
mape = np.mean(absolute_percentage_errors)

# Convert MAPE to accuracy (accuracy = 1 - MAPE); this goes negative when the
# average relative error exceeds 100%.
accuracy = 1 - mape

# Convert accuracy to percentage
percentage_accuracy = accuracy * 100

print("Average Percentage Accuracy:", percentage_accuracy)

Fitting 5 folds for each of 36 candidates, totalling 180 fits
[CV] END regressor__max_depth=10, regressor__min_samples_leaf=1, regressor__min_samples_split=2; total time=   0.0s
[CV] END regressor__max_depth=10, regressor__min_samples_leaf=1, regressor__min_samples_split=2; total time=   0.0s
[CV] END regressor__max_depth=10, regressor__min_samples_leaf=1, regressor__min_samples_split=2; total time=   0.0s
[CV] END regressor__max_depth=10, regressor__min_samples_leaf=1, regressor__min_samples_split=2; total time=   0.0s
[CV] END regressor__max_depth=10, regressor__min_samples_leaf=1, regressor__min_samples_split=2; total time=   0.0s
[CV] END regressor__max_depth=10, regressor__min_samples_leaf=1, regressor__min_samples_split=5; total time=   0.0s
[CV] END regressor__max_depth=10, regressor__min_samples_leaf=1, regressor__min_samples_split=5; total time=   0.0s
[CV] END regressor__max_depth=10, regressor__min_samples_leaf=1, regressor__min_samples_split=5; total time=   0.0s
[CV] END r

In [6]:
# Random forest: tune with a grid search on the training rows, then score the
# best pipeline on the held-out test rows.

# Pipeline = shared preprocessing followed by a seeded random forest.
model = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('regressor', RandomForestRegressor(random_state=42)),
    ]
)

# Hyper-parameter candidates; the 'regressor__' prefix targets the pipeline
# step named 'regressor'.
param_grid = dict(
    regressor__n_estimators=[100, 200, 300],
    regressor__max_depth=[None, 10, 20, 30],
    regressor__min_samples_split=[2, 5, 10],
)

# 5-fold cross-validated search over every combination in the grid.
grid_search = GridSearchCV(model, param_grid, cv=5, verbose=2)
grid_search.fit(X_train, y_train)
best_model_random_forest = grid_search.best_estimator_

# Score the winning pipeline on the held-out rows.
y_pred = best_model_random_forest.predict(X_test)

# Show the first 10 predictions next to the real values.
for row_idx in range(10):
    print("Predicted:", y_pred[row_idx], "\tReal:", y_test.iloc[row_idx])

# Per-row absolute percentage error, then its mean (MAPE).
absolute_percentage_errors = ((y_test - y_pred) / y_test).abs()
mape = absolute_percentage_errors.mean()

# "Accuracy" here is defined as 1 - MAPE, reported as a percentage.
accuracy = 1 - mape
percentage_accuracy = 100 * accuracy

print("Average Percentage Accuracy:", percentage_accuracy)

Fitting 5 folds for each of 36 candidates, totalling 180 fits
[CV] END regressor__max_depth=None, regressor__min_samples_split=2, regressor__n_estimators=100; total time=   1.0s
[CV] END regressor__max_depth=None, regressor__min_samples_split=2, regressor__n_estimators=100; total time=   1.0s
[CV] END regressor__max_depth=None, regressor__min_samples_split=2, regressor__n_estimators=100; total time=   1.0s
[CV] END regressor__max_depth=None, regressor__min_samples_split=2, regressor__n_estimators=100; total time=   1.0s
[CV] END regressor__max_depth=None, regressor__min_samples_split=2, regressor__n_estimators=100; total time=   1.0s
[CV] END regressor__max_depth=None, regressor__min_samples_split=2, regressor__n_estimators=200; total time=   2.1s
[CV] END regressor__max_depth=None, regressor__min_samples_split=2, regressor__n_estimators=200; total time=   2.1s
[CV] END regressor__max_depth=None, regressor__min_samples_split=2, regressor__n_estimators=200; total time=   2.1s
[CV] END r

In [7]:
# Gradient boosting: tune with a grid search on the training rows, then score
# the best pipeline on the held-out test rows.

# Define the pipeline.
# random_state is fixed so that repeated runs fit the same ensemble, which
# keeps the grid-search winner and the reported score reproducible (and
# matches the seeded random-forest pipeline in this notebook).
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', GradientBoostingRegressor(random_state=42))
])

# Define grid search parameters (the 'regressor__' prefix routes each entry to
# the pipeline step named 'regressor').
param_grid = {
    'regressor__n_estimators': [100, 200, 300],
    'regressor__learning_rate': [0.05, 0.1, 0.2],
    'regressor__max_depth': [3, 4, 5]
}

# Perform grid search with 5-fold cross-validation on the training set.
grid_search = GridSearchCV(model, param_grid, cv=5, verbose=2)
grid_search.fit(X_train, y_train)
best_model_gradient_boosting = grid_search.best_estimator_

# Predict on test set
y_pred = best_model_gradient_boosting.predict(X_test)

# Print predicted and real y-values for the first rows (at most 10, so a test
# set shorter than 10 rows does not raise an IndexError).
for i in range(min(10, len(y_pred))):
    print("Predicted:", y_pred[i], "\tReal:", y_test.iloc[i])

# Calculate absolute percentage error for each prediction.
# NOTE: a real value of 0 in y_test makes its term infinite (division by zero).
absolute_percentage_errors = np.abs((y_test - y_pred) / y_test)

# Calculate mean absolute percentage error
mape = np.mean(absolute_percentage_errors)

# Convert MAPE to accuracy (accuracy = 1 - MAPE); this goes negative when the
# average relative error exceeds 100%.
accuracy = 1 - mape

# Convert accuracy to percentage
percentage_accuracy = accuracy * 100

print("Average Percentage Accuracy:", percentage_accuracy)

Fitting 5 folds for each of 27 candidates, totalling 135 fits
[CV] END regressor__learning_rate=0.05, regressor__max_depth=3, regressor__n_estimators=100; total time=   0.1s
[CV] END regressor__learning_rate=0.05, regressor__max_depth=3, regressor__n_estimators=100; total time=   0.2s
[CV] END regressor__learning_rate=0.05, regressor__max_depth=3, regressor__n_estimators=100; total time=   0.2s
[CV] END regressor__learning_rate=0.05, regressor__max_depth=3, regressor__n_estimators=100; total time=   0.1s
[CV] END regressor__learning_rate=0.05, regressor__max_depth=3, regressor__n_estimators=100; total time=   0.1s
[CV] END regressor__learning_rate=0.05, regressor__max_depth=3, regressor__n_estimators=200; total time=   0.4s
[CV] END regressor__learning_rate=0.05, regressor__max_depth=3, regressor__n_estimators=200; total time=   0.4s
[CV] END regressor__learning_rate=0.05, regressor__max_depth=3, regressor__n_estimators=200; total time=   0.4s
[CV] END regressor__learning_rate=0.05, re

In [8]:
# # Define the pipeline
# model = Pipeline(steps=[
#     ('preprocessor', preprocessor),
#     ('regressor', LinearRegression())
# ])

# # Define grid search parameters
# param_grid = {}

# # Perform grid search
# grid_search = GridSearchCV(model, param_grid, cv=5, verbose=2)
# grid_search.fit(X_train, y_train)
# best_model_linear_regression = grid_search.best_estimator_

# # Predict on test set
# y_pred = best_model_linear_regression.predict(X_test)

# # Print predicted and real y-values for the first 10 rows
# for i in range(10):
#     print("Predicted:", y_pred[i], "\tReal:", y_test.iloc[i])

# # Calculate absolute percentage error for each prediction
# absolute_percentage_errors = np.abs((y_test - y_pred) / y_test)

# # Calculate mean absolute percentage error
# mape = np.mean(absolute_percentage_errors)

# # Convert MAPE to accuracy (accuracy = 1 - MAPE)
# accuracy = 1 - mape

# # Convert accuracy to percentage
# percentage_accuracy = accuracy * 100

# print("Average Percentage Accuracy:", percentage_accuracy)