In [225]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.model_selection import GridSearchCV, KFold
    
from sklearn.svm import SVR
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

In [226]:
#chargees le dataset
dataset = pd.read_csv('terrorist_attacks_data.csv')

#verifiez le taille et structure 
print(dataset.shape)
dataset.head()

(10517, 4)


Unnamed: 0,entity,code,year,terrorist_attacks
0,Afghanistan,AFG,1970,0
1,Afghanistan,AFG,1971,0
2,Afghanistan,AFG,1972,0
3,Afghanistan,AFG,1973,1
4,Afghanistan,AFG,1974,0


In [227]:
dataset.isnull().sum()

entity                  0
code                 1254
year                    0
terrorist_attacks       0
dtype: int64

In [228]:
#drop le colonne sex et drop les row avec valeurs null
dataset = dataset.drop('code', axis=1)
print(dataset.shape)
dataset.head()

(10517, 3)


Unnamed: 0,entity,year,terrorist_attacks
0,Afghanistan,1970,0
1,Afghanistan,1971,0
2,Afghanistan,1972,0
3,Afghanistan,1973,1
4,Afghanistan,1974,0


In [229]:
dataset.columns

Index(['entity', 'year', 'terrorist_attacks'], dtype='object')

In [230]:
dataset.isnull().sum()

entity               0
year                 0
terrorist_attacks    0
dtype: int64

In [231]:
X = dataset.drop('terrorist_attacks', axis=1)
y = dataset['terrorist_attacks']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [232]:
param_grid = {
    'C': [0.1, 1, 10, 100], 
    'gamma': ['scale','auto'], 
    'kernel': ['rbf', 'linear']
    }

In [233]:
grid_search = GridSearchCV(SVR(), param_grid, cv=5, verbose=1)

In [234]:
def separate_column_types(df):

    numerical_cols = X_train.select_dtypes(include=['int64', 'float64']).columns.tolist()
    categorical_cols = X_train.select_dtypes(include=['object']).columns.tolist()

    return numerical_cols, categorical_cols

numerical_cols, categorical_cols = separate_column_types(dataset)

In [235]:
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical pipeline
cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Full preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_pipeline, numerical_cols),
        ('cat', cat_pipeline, categorical_cols),
    ])

In [236]:
full_pipeline = make_pipeline(preprocessor, grid_search)

In [237]:
full_pipeline.fit(X_train, y_train)

Fitting 5 folds for each of 16 candidates, totalling 80 fits


In [238]:
y = y.reset_index(drop=True)

# Now proceed with the K-Fold cross-validation as before
kf = KFold(n_splits=10, shuffle=True, random_state=42)

mse_scores = []
rmse_scores = []
r2_scores = []

for train_index, test_index in kf.split(X):
    X_train_kf, X_test_kf = X.iloc[train_index], X.iloc[test_index]
    y_train_kf, y_test_kf = y.iloc[train_index], y.iloc[test_index]
    
    # Fit the pipeline to the training data
    full_pipeline.fit(X_train_kf, y_train_kf)
    
    # Predict on the testing set
    y_pred_kf = full_pipeline.predict(X_test_kf)
    
    # Calculate metrics
    mse = mean_squared_error(y_test_kf, y_pred_kf)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test_kf, y_pred_kf)
    
    # Append to lists
    mse_scores.append(mse)
    rmse_scores.append(rmse)
    r2_scores.append(r2)

Fitting 5 folds for each of 16 candidates, totalling 80 fits
Fitting 5 folds for each of 16 candidates, totalling 80 fits
Fitting 5 folds for each of 16 candidates, totalling 80 fits
