In [None]:
# Import libraries needed to execute the code
import os
import warnings
import numpy as np
import pandas as pd
from sklearn.svm import SVC
from scipy.stats import zscore
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [None]:
# Read the cleaned dataset from disk into a DataFrame.
# low_memory=False makes pandas parse the file in one pass rather than in
# chunks, so column dtypes are inferred from the whole column.
data = pd.read_csv(
    filepath_or_buffer='source/data.csv',
    low_memory=False,
)

In [None]:
# Enforce the expected dtypes: the listed identifier-style columns are kept
# as generic objects, while birth year and age become 64-bit integers.
object_columns = ['AccID', 'vehicleID', 'num_veh']
integer_columns = ['birth_year', 'age']
data = data.astype({
    **dict.fromkeys(object_columns, 'object'),
    **dict.fromkeys(integer_columns, 'int64'),
})

In [None]:
# Print a summary of the frame: column names, dtypes, non-null counts and memory usage
data.info()

In [None]:
# Remove the cap on displayed columns so the preview shows every field,
# then render the first rows as the cell output.
pd.options.display.max_columns = None
data.head()

In [None]:
# Columns stored as 64-bit floats or integers are the candidates for the outlier check
numerical_columns = data.select_dtypes(include=['float64', 'int64']).columns

# Draw one box per numeric column on a single shared axis for a visual outlier scan;
# tick labels are rotated so long column names stay readable.
fig, ax = plt.subplots(figsize=(15, 10))
data[numerical_columns].boxplot(rot=90, ax=ax)
ax.set_title('Boxplot for Outlier Detection in Numerical Columns')
plt.show()

In [None]:
def _fit_and_score_svm(features_train, labels_train, features_test, labels_test):
    """Fit an SVC with default hyper-parameters and score it on a held-out split.

    Parameters
    ----------
    features_train, labels_train
        Training features and labels passed to ``SVC.fit``.
    features_test, labels_test
        Held-out features and labels used for prediction and scoring.

    Returns
    -------
    tuple
        ``(fitted_model, test_predictions, test_accuracy)``.
    """
    model = SVC(random_state=42)
    model.fit(features_train, labels_train)
    predictions = model.predict(features_test)
    return model, predictions, accuracy_score(labels_test, predictions)


# Convert non-numeric columns to numeric using LabelEncoder.
# One fitted encoder per column is kept so integer codes can be mapped back later.
# NOTE(review): every object column is encoded and used as a feature, including
# identifier-like ones such as 'AccID' / 'vehicleID' — confirm this is intended.
label_encoders = {}
for column in data.select_dtypes(include=['object']).columns:
    label_encoders[column] = LabelEncoder()
    data[column] = label_encoders[column].fit_transform(data[column])

# Target variable and the columns that receive Z-score normalization
target_column = 'gravity'
columns_to_normalize = ['long', 'lat']

# Identical size and seed for both splits, so they select exactly the same rows
split_options = {'test_size': 0.3, 'random_state': 42}

# Split the original data
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns=[target_column]),
    data[target_column],
    **split_options,
)

# Apply Z-score normalization using statistics from the TRAINING rows only.
# Computing mean/std over the full frame before splitting would leak test-set
# information into the preprocessing step. ddof=0 matches scipy.stats.zscore;
# a zero standard deviation is replaced by 1 to avoid division by zero.
train_mean = X_train[columns_to_normalize].mean()
train_std = X_train[columns_to_normalize].std(ddof=0).replace(0, 1)
data_normalized = data.copy()
data_normalized[columns_to_normalize] = (
    data[columns_to_normalize] - train_mean
) / train_std

# Split the normalized data (same options -> same row partition as above)
X_train_norm, X_test_norm, y_train_norm, y_test_norm = train_test_split(
    data_normalized.drop(columns=[target_column]),
    data_normalized[target_column],
    **split_options,
)

# Train an SVM on the original data
model_svm_original, y_pred_svm_original, accuracy_svm_original = _fit_and_score_svm(
    X_train, y_train, X_test, y_test
)

# Train an SVM on the normalized data
model_svm_normalized, y_pred_svm_normalized, accuracy_svm_normalized = _fit_and_score_svm(
    X_train_norm, y_train_norm, X_test_norm, y_test_norm
)

print("SVM Accuracy before normalization:", accuracy_svm_original)
print("SVM Accuracy after normalization:", accuracy_svm_normalized)
