In this project, we have a set of pictures, each containing some number of people. Every picture is evaluated by two different algorithms that count the people in it. We want to build a model that estimates the actual number of people, given the results of the two algorithms and the time at which each picture was taken.

# Importing libraries and data

In [None]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import BayesianRidge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import Normalizer
from sklearn.model_selection import cross_val_score, KFold
from xgboost import XGBRegressor
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
%matplotlib inline

In [None]:
# Column labels, in file order: the two algorithm counts, the time the picture
# was taken, and the real number of people (the regression target).
# NOTE: 'second_algoritm' is spelled this way on purpose -- other cells index
# the frame with exactly this key.
COLUMN_NAMES = ['first_algorithm', 'second_algoritm', 'day_time', 'people_count']

# read_csv takes the first line of the file as the header row; those labels are
# then replaced by the names above.
dataset = pd.read_csv('Relevant.txt')
dataset.columns = COLUMN_NAMES

# Data evaluation

In [None]:
# Comparison of each algorithm's result with the real data.
# Both comparisons are drawn as stacked panels of a single figure so they can
# be read against each other (one figure per panel would leave half of every
# canvas empty in a 2x1 grid).

# Select the style before the figure exists so the whole figure picks it up.
plt.style.use('fivethirtyeight')
fig, (ax_first, ax_second) = plt.subplots(2, 1, figsize=(20, 15))

# Top panel: first algorithm vs. real data.
ax_first.plot(dataset.index, dataset['first_algorithm'], label='First Algorithm', linewidth=1)
ax_first.plot(dataset.index, dataset['people_count'], label='Real Data', c='magenta', linewidth=1)
ax_first.set_ylabel('People Count')
ax_first.legend(loc=(0.05, 0.8))

# Bottom panel: second algorithm vs. real data.
ax_second.plot(dataset.index, dataset['second_algoritm'], label='Second Algorithm', c='red', linewidth=1)
ax_second.plot(dataset.index, dataset['people_count'], label='Real_Data', c='green', linewidth=1)
ax_second.set_ylabel('People Count')
ax_second.legend(loc=(0.05, 0.8))

plt.show()

In [None]:
# Display summary statistics (count, mean, std, min, quartiles, max) of the
# numeric columns, to compare the two algorithms' counts with the real count.
dataset.describe()

We can conclude that both algorithms tend to count more people than are actually present.

# Testing different regression models

In [None]:
# Hold out a fixed fraction of the rows for testing; the seed keeps the split
# identical between runs.
HOLDOUT_FRACTION = 0.2
SPLIT_SEED = 1

# Features are every column but the last; the last column is the target.
X, y = dataset.iloc[:, :-1], dataset.iloc[:, -1]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=HOLDOUT_FRACTION, random_state=SPLIT_SEED
)

In [None]:
# Candidate regressors. The short names (BRR, XGB, ...) stay bound at top level
# because later cells refer to them directly.
LR = LinearRegression()
# Minkowski distance with p=2 is the ordinary Euclidean distance.
KNN = KNeighborsRegressor(n_neighbors=5, metric='minkowski', p=2)
BRR = BayesianRidge()
DT = DecisionTreeRegressor(random_state=0)
RF = RandomForestRegressor(n_estimators=10, random_state=1)
XGB = XGBRegressor(n_estimators=500, max_depth=5, gamma=0, objective='reg:pseudohubererror')

# One table of (display name, estimator) pairs; the two parallel lists used by
# the following cells are derived from it so they cannot drift out of step.
named_models = [
    ('Linear Regression', LR),
    ('K Nearest Neighbor', KNN),
    ('Bayesian Ridge Regressor', BRR),
    ('Decision Tree Regressor', DT),
    ('Random Forest Regressor', RF),
    ('XGBoost Regressor', XGB),
]
models = [estimator for _, estimator in named_models]
models_name = [label for label, _ in named_models]

In [None]:
# 10-fold cross-validation of every candidate model on the full dataset.
# With shuffle=True and no seed, each call to cross_val_score would draw a
# different shuffle, so models would be scored on different folds and the
# numbers would change from run to run. A fixed random_state gives every model
# exactly the same splits and makes the comparison reproducible.
kfold = KFold(n_splits=10, shuffle=True, random_state=1)

# Per-model arrays of fold scores (the estimators' default score, i.e. R^2 for
# these regressors); kept for the box plot in the next cell.
scores = []
for name, estimator in zip(models_name, models):
    score = cross_val_score(estimator, X, y, cv=kfold)
    scores.append(score)
    # Report mean and standard deviation over the folds.
    print(name, ':', round(score.mean(), 3), ',', round(score.std(), 3))


In [None]:
# Box plot of the per-fold cross-validation scores, one box per model.
plt.figure(figsize=(30, 10))
# NOTE(review): recent Matplotlib releases prefer `tick_labels=` over
# `labels=` -- confirm against the installed version before renaming.
plt.boxplot(scores, labels=models_name)
# Actually call show(); a bare `plt.show` only evaluates to the function object.
plt.show()

In [None]:
# Fit every candidate on the training split and report its scores.
# y_pred      -> predictions over the whole dataset (training rows included),
#                keyed by model name; used by the plotting cell below.
# y_pred_test -> predictions over the held-out test rows only.
y_pred = {}
y_pred_test = {}
for name, estimator in zip(models_name, models):
    estimator.fit(X_train, y_train)
    print(name)
    print('Train Set Score', estimator.score(X_train, y_train))
    print('Test Set Score', estimator.score(X_test, y_test))

    full_pred = estimator.predict(X)
    test_pred = estimator.predict(X_test)
    y_pred[name] = full_pred
    y_pred_test[name] = test_pred

    # Mean squared error on the held-out rows.
    print('MSE', mean_squared_error(y_test, test_pred))

In [None]:
# Plot every model's predictions (red line) over the real counts (green dots),
# one panel per model, all stacked in a single 6-row figure.
# Drawing the panels from one loop gives each model its own row; hand-numbered
# subplot slots are easy to duplicate by copy-paste.

# Select the style before the figure exists so the whole figure picks it up.
plt.style.use('fivethirtyeight')

# Top-to-bottom panel order.
plot_order = [
    'Linear Regression',
    'Bayesian Ridge Regressor',
    'Decision Tree Regressor',
    'Random Forest Regressor',
    'K Nearest Neighbor',
    'XGBoost Regressor',
]

fig, axes = plt.subplots(len(plot_order), 1, figsize=(20, 25))
for ax, name in zip(axes, plot_order):
    ax.scatter(dataset.index, dataset['people_count'], label='Real_Data', c='green', s=12)
    # y_pred[name] holds the model's predictions for every row of the dataset.
    ax.plot(dataset.index, y_pred[name], label=name, c='red', linewidth=1)
    ax.set_ylabel('People Count')
    ax.legend(loc=(0.05, 0.8))

# A single heading above the topmost panel.
axes[0].set_title('Regression Models Prediction')

plt.show()

# Hyperparameter Tuning of 2 algorithms

In [None]:
import optuna

In [None]:
# Show the constructor parameters of the Bayesian ridge model (BRR was created
# with default arguments) to see which hyperparameters are available for tuning.
model = BRR
model.get_params()

In [None]:
def BRR_objective(BRR_trial):
    """Optuna objective for BayesianRidge: mean cross-validated R^2.

    Samples the four gamma-prior hyperparameters and the iteration limit,
    then scores the resulting model with 5-fold cross-validation on the
    training split only. The held-out test split is deliberately not touched
    here: selecting hyperparameters on it would make the test score reported
    afterwards optimistically biased.

    Parameters
    ----------
    BRR_trial : optuna.trial.Trial
        Trial object used to sample the hyperparameters.

    Returns
    -------
    float
        Mean R^2 over the cross-validation folds (higher is better).
    """
    alpha1 = BRR_trial.suggest_float('alpha_1', 1e-06, 1000)
    lambda1 = BRR_trial.suggest_float('lambda_1', 1e-06, 1000)
    alpha2 = BRR_trial.suggest_float('alpha_2', 1e-06, 1000)
    lambda2 = BRR_trial.suggest_float('lambda_2', 1e-06, 1000)
    iter_count = BRR_trial.suggest_int('n_iter', 300, 1000)
    # NOTE(review): newer scikit-learn releases rename `n_iter` to `max_iter`
    # -- confirm against the installed version before changing the keyword.
    model = BayesianRidge(alpha_1=alpha1, lambda_1=lambda1,
                          alpha_2=alpha2, lambda_2=lambda2,
                          n_iter=iter_count)
    # Default scoring for a regressor is its R^2 score.
    fold_scores = cross_val_score(model, X_train, y_train, cv=5)
    return float(fold_scores.mean())


# Maximise the cross-validated R^2 over 100 trials, using all available cores.
BRR_study = optuna.create_study(direction='maximize')
BRR_study.optimize(BRR_objective, n_trials=100, n_jobs=-1)
BRR_trial = BRR_study.best_trial

# The objective value is an R^2 score, not a classification accuracy.
print('Best cross-validated R^2: {}'.format(BRR_trial.value))
print("Best hyperparameters: {}".format(BRR_trial.params))

In [None]:
# Rebuild the Bayesian ridge model from the best trial's hyperparameters and
# evaluate it on the train/test split.
best_brr = BRR_trial.params
brr_keys = ('alpha_1', 'alpha_2', 'lambda_1', 'lambda_2', 'n_iter')
# The trial's parameter names coincide with the constructor keywords, so the
# selected entries can be passed straight through.
model = BayesianRidge(**{key: best_brr[key] for key in brr_keys})
model.fit(X_train, y_train)

for label, features, target in (
    ('Train Set Score', X_train, y_train),
    ('Test Set Score', X_test, y_test),
):
    print(label, round(model.score(features, target), 4))

# Mean squared error on the held-out rows.
Y_pred = model.predict(X_test)
print('MSE', round(mean_squared_error(y_test, Y_pred), 4))


Because tuning changed the BRR model's results only slightly, I decided to tune the hyperparameters of the XGB model as well, in the hope of getting better accuracy.

In [None]:
# Show the parameters of the XGBoost regressor defined earlier, to see which
# hyperparameters are available for tuning.
model = XGB
model.get_params()

In [None]:
def XGB_objective(XGB_trial):
    """Optuna objective for XGBRegressor: mean cross-validated (clipped) MSE.

    Samples tree depth, minimum child weight, L1/L2 regularisation and the
    number of trees, then evaluates the configuration with 5-fold
    cross-validation on the training split only. The held-out test split is
    deliberately not used here: selecting hyperparameters on it would make
    the test metrics reported afterwards optimistically biased.

    Parameters
    ----------
    XGB_trial : optuna.trial.Trial
        Trial object used to sample the hyperparameters.

    Returns
    -------
    float
        Mean over the folds of the MSE computed on predictions capped at 50
        (lower is better).
    """
    params = {
        'max_depth': XGB_trial.suggest_int('max_depth', 2, 20),
        'min_child_weight': XGB_trial.suggest_int('min_child_weight', 1, 100),
        'reg_alpha': XGB_trial.suggest_float('reg_alpha', 0, 10),
        'reg_lambda': XGB_trial.suggest_float('reg_lambda', 1, 10),
        # The trial key stays 'n_estimator' because the final-model cell reads
        # the best value back under that name.
        'n_estimators': XGB_trial.suggest_int('n_estimator', 20, 1000),
        'objective': 'reg:pseudohubererror',
    }

    fold_errors = []
    # Unshuffled folds are deterministic, so every trial sees the same splits.
    for fit_idx, valid_idx in KFold(n_splits=5).split(X_train):
        model = XGBRegressor(**params)
        model.fit(X_train.iloc[fit_idx], y_train.iloc[fit_idx])
        fold_pred = model.predict(X_train.iloc[valid_idx])
        # Cap predictions at 50 -- presumably the largest plausible people
        # count; TODO confirm the origin of this limit.
        fold_pred[fold_pred > 50] = 50
        fold_errors.append(mean_squared_error(y_train.iloc[valid_idx], fold_pred))
    return sum(fold_errors) / len(fold_errors)


# Minimise the cross-validated MSE over 100 trials, using all available cores.
XGB_study = optuna.create_study(direction='minimize')
XGB_study.optimize(XGB_objective, n_trials=100, n_jobs=-1)
XGB_trial = XGB_study.best_trial

print('MSE: {}'.format(XGB_trial.value))
print("Best hyperparameters: {}".format(XGB_trial.params))

In [None]:
# Rebuild the XGBoost regressor from the best trial's hyperparameters and
# evaluate it on the train/test split.
tuned_params = dict(XGB_trial.params)  # copy, so the trial's own dict is untouched
# The study stores the tree count under 'n_estimator', whereas the constructor
# keyword is 'n_estimators'.
tuned_params['n_estimators'] = tuned_params.pop('n_estimator')
model = XGBRegressor(objective='reg:pseudohubererror', **tuned_params)

model.fit(X_train, y_train)
for label, features, target in (
    ('Train Set Score', X_train, y_train),
    ('Test Set Score', X_test, y_test),
):
    print(label, round(model.score(features, target), 4))

# NOTE(review): the tuning objective caps predictions at 50 before computing
# the MSE, while this evaluation uses the raw predictions -- the two MSE values
# are therefore not directly comparable; confirm which one is intended.
Y_pred = model.predict(X_test)
print('MSE', round(mean_squared_error(y_test, Y_pred), 4))