# Regression models: Linear Regression, Random Forest and Decision Tree

## Load libraries

In [1]:
# Importar librerías que vamos a estar usando en general
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from tqdm import tqdm, trange

# Importar funcions per evaluar els models
from utils import evaluate_models
from utils import plot_model_evaluation
from utils import plot_linear_model_feature_importance
from sklearn.metrics import confusion_matrix

## Load data

In [2]:
# Folder that holds the per-year formatted CSV files (absolute, machine-specific path)
pth = "C:/Users/34627/Documents/GitHub/Gorrapiedra-proyecto/data/formated data"

# Parsing options shared by every yearly file
csv_options = dict(index_col=False, skipinitialspace=True, skip_blank_lines=True)

# One frame per year, read in chronological order
df_20, df_21, df_22, df_23 = (
    pd.read_csv(os.path.join(pth, file_name), **csv_options)
    for file_name in ("data_2020.csv", "data_2021.csv", "data_2022.csv", "data_2023.csv")
)

# Stack the four years into a single frame with a fresh 0..n-1 index
df = pd.concat([df_20, df_21, df_22, df_23], ignore_index=True)

# Drop the 'Unnamed: 0' column when the CSVs carry one
if 'Unnamed: 0' in df.columns:
    df = df.drop(columns=['Unnamed: 0'])

# Preview the first rows of the combined frame
df.head()

Unnamed: 0,station_id,year,month,day,hour,ctx-4,ctx-3,ctx-2,ctx-1,percentage_docks_available
0,1.0,2020.0,1.0,1.0,2.0,0.522727,0.503788,0.469697,0.403409,0.354167
1,1.0,2020.0,1.0,1.0,7.0,0.289773,0.373106,0.304924,0.238636,0.24053
2,1.0,2020.0,1.0,1.0,12.0,0.253788,0.268939,0.350379,0.344697,0.393939
3,1.0,2020.0,1.0,1.0,17.0,0.390152,0.346591,0.255814,0.22093,0.186047
4,1.0,2020.0,1.0,1.0,22.0,0.083333,0.218992,0.437984,0.515504,0.47093


In [3]:
# Target (y) is the 'percentage_docks_available' column; the features (X) are
# every remaining column of df
y = df['percentage_docks_available']
X = df.drop(columns=['percentage_docks_available'])

In [4]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the partition
# identical between runs
splits = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = splits

## Simple Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Build the linear regression model and fit it on the training split
# (fit() returns the estimator itself, so the calls can be chained)
model_linear = LinearRegression().fit(X_train, y_train)

# Predict the target for the held-out test rows
y_pred_linear = model_linear.predict(X_test)

In [None]:
# Evaluate the linear regression predictions against the true test targets.
# evaluate_models is a project helper imported from utils; whatever it returns
# is printed as-is.
results = evaluate_models(y_test, y_pred_linear)
print(results)

In [None]:
# Plot the evaluation of the linear regression predictions against the true
# test targets (plot_model_evaluation is a project helper imported from utils)
plot_model_evaluation(y_test, y_pred_linear)

In [None]:
# Plot the fitted linear model's coefficients; X is passed alongside them
# (presumably so the helper can label each coefficient with its column - confirm in utils)
plot_linear_model_feature_importance(model_linear.coef_, X)

In [None]:
# Turn the regression problem into a binary one to get a confusion matrix:
# a value above the threshold becomes class 1, anything else class 0.
threshold = 0.5

# Binarise the model's predictions. These must come from y_pred_linear:
# iterating over the X_test DataFrame yields its column names (strings), which
# cannot be compared with a float and never involved the model at all.
y_pred_labels = [1 if p > threshold else 0 for p in y_pred_linear]

# Binarise the true targets with the same rule
y_test_labels = [1 if p > threshold else 0 for p in y_test.values]

# Confusion matrix of true labels (rows) against predicted labels (columns)
cm = confusion_matrix(y_test_labels, y_pred_labels)
print(cm)

In [None]:
# Use the fitted linear model's coefficients as per-feature scores
importance = model_linear.coef_

# Print every score next to the positional index of its feature
for idx, score in enumerate(importance):
    print('Feature: %0d, Score: %.5f' % (idx, score))

# Bar chart of the scores, one bar per feature position
positions = range(len(importance))
plt.bar(positions, importance)
plt.show()

## Random forest

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Build a 7-tree random forest regressor with a fixed seed and fit it on the
# training split (fit() returns the estimator itself, so the calls can be chained)
model_random = RandomForestRegressor(random_state=42, n_estimators=7).fit(X_train, y_train)

# Predict the target for the held-out test rows
y_pred_random = model_random.predict(X_test)

In [None]:
# Evaluate the random forest predictions against the true test targets.
# evaluate_models is a project helper imported from utils; whatever it returns
# is printed as-is.
results = evaluate_models(y_test, y_pred_random)
print(results)

In [None]:
# Plot the evaluation of the random forest predictions against the true test
# targets (plot_model_evaluation is a project helper imported from utils)
plot_model_evaluation(y_test, y_pred_random)

In [None]:
# Plot the random forest's own per-feature importances. This cell previously
# passed model_linear.coef_, so the Random Forest section re-plotted the linear
# model's coefficients instead of anything learned by the forest.
# NOTE(review): the helper is named for linear models; this assumes it only
# needs one score per column of X - confirm against utils.
plot_linear_model_feature_importance(model_random.feature_importances_, X)

In [None]:
# Per-feature importances of the fitted random forest
importance = model_random.feature_importances_

# Print every score next to the positional index of its feature
for idx, score in enumerate(importance):
    print('Feature: %0d, Score: %.5f' % (idx, score))

# Bar chart of the scores, one bar per feature position
positions = range(len(importance))
plt.bar(positions, importance)
plt.show()


## Decision Tree

In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Regression tree limited to depth 7. random_state is pinned to the same seed
# used for the train/test split and the random forest: scikit-learn permutes
# the candidate features at each split, so equally good splits could otherwise
# be chosen differently from one run to the next.
model_tree = DecisionTreeRegressor(max_depth=7, random_state=42)

# Fit the tree on the training split
model_tree.fit(X_train, y_train)

# Predict the target for the held-out test rows
y_pred_tree = model_tree.predict(X_test)

In [None]:
# Evaluate the decision tree predictions against the true test targets.
# This cell previously passed y_pred_random, so it re-reported the random
# forest's metrics under the Decision Tree heading.
# evaluate_models is a project helper imported from utils; whatever it returns
# is printed as-is.
results = evaluate_models(y_test, y_pred_tree)
print(results)

In [None]:
# Plot the evaluation of the decision tree predictions against the true test
# targets. This cell previously plotted y_pred_random (the random forest's
# predictions) instead of the tree's.
plot_model_evaluation(y_test, y_pred_tree)

In [None]:
# Plot the decision tree's own per-feature importances. This cell previously
# passed model_linear.coef_, so the Decision Tree section re-plotted the linear
# model's coefficients instead of anything learned by the tree.
# NOTE(review): the helper is named for linear models; this assumes it only
# needs one score per column of X - confirm against utils.
plot_linear_model_feature_importance(model_tree.feature_importances_, X)

In [None]:
# Per-feature importances of the fitted decision tree
importance = model_tree.feature_importances_

# Print every score next to the positional index of its feature
for idx, score in enumerate(importance):
    print('Feature: %0d, Score: %.5f' % (idx, score))

# Bar chart of the scores, one bar per feature position
positions = range(len(importance))
plt.bar(positions, importance)
plt.show()