<a href="https://colab.research.google.com/github/andreidinca98/tric_project/blob/grid/grid.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import mean_squared_error, r2_score

# Load the dataset
file_path = 'terrorist-attacks new.csv'
data = pd.read_csv(file_path)

# Prepare the data
data = data.rename(columns={'Entity': 'Country', 'Terrorist attacks': 'Attacks'})
data = data[['Country', 'Year', 'Attacks']]  # Remove unnecessary columns

# Encode the country names as numerical labels
label_encoder = LabelEncoder()
data['Country_encoded'] = label_encoder.fit_transform(data['Country'])

# Define features (Country and Year) and target (Attacks)
X = data[['Country_encoded', 'Year']]
y = data['Attacks']

# Manually normalize the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.1, random_state=42)

# Define the Random Forest Regressor model
model = RandomForestRegressor(random_state=42)

# Define the hyperparameter grid
param_grid = {
    'n_estimators': [10, 20, 50, 100, 200, 500, 1000],           # Number of trees in the forest
    'max_depth': [None, 10, 20, 30, 40, 50, 60],         # Maximum depth of the trees
    'min_samples_split': [2, 5, 10, 15, 20, 25],         # Minimum number of samples required to split a node
    'min_samples_leaf': [1, 2, 4, 6, 8, 10],           # Minimum number of samples required at each leaf node
    'max_features': ['auto', 'sqrt'],        # Number of features to consider for the best split
    'bootstrap': [True, False],              # Whether bootstrap samples are used when building trees
}

# Set up GridSearchCV to find the best hyperparameters
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',  # Use negative MSE for minimization
    cv=2,                              # 5-fold cross-validation
    verbose=1,                         # Display progress
    n_jobs=-1                          # Use all available cores
)

# Fit the model using GridSearchCV
grid_search.fit(X_train, y_train)

# Get the best model, parameters, and score
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_
best_score = -grid_search.best_score_

# Evaluate the best model on the test set
y_pred = best_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Print the results
print(f"Best Hyperparameters: {best_params}")
print(f"Best Cross-Validated MSE: {best_score}")
print(f"Test Set MSE: {mse}")
print(f"R^2 Score on Test Set: {r2}")

Fitting 2 folds for each of 7056 candidates, totalling 14112 fits


7056 fits failed out of a total of 14112.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
2240 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/base.py", line 1466, in wrapper
    estimator._validate_params()
  File "/usr/local/lib/python3.10/dist-packages/sklearn/base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "/usr/local/lib/python3.10/dist-packages/sklearn/utils/_param_validation.py", line 95, in validate_parameter_constraints
    raise InvalidParameterError

Best Hyperparameters: {'bootstrap': False, 'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 10}
Best Cross-Validated MSE: 77887.61756086859
Test Set MSE: 53303.33983821395
R^2 Score on Test Set: 0.7681136274275799
