In [2]:
# Import libraries needed to execute the code
import os
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import plotly.express as px
from datetime import datetime
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
import scipy.cluster.hierarchy as shc
from sklearn.decomposition import PCA
from IPython.display import display, HTML
from scipy.stats import spearmanr, chi2_contingency
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from pandas.plotting import andrews_curves, parallel_coordinates, lag_plot, autocorrelation_plot, radviz

In [3]:
# Read the cleaned dataset from disk. low_memory=False makes pandas parse the
# file in one pass instead of in chunks, avoiding mixed-type column inference.
data = pd.read_csv(
    'source/data.csv',
    low_memory=False,
)

In [None]:
# Show the distinct values of a single column.
# NOTE(review): 'column_name' looks like a placeholder — replace it with a real
# column of `data` before running, otherwise the lookup raises KeyError.
print(data['column_name'].unique())


In [4]:
# Cast AccID, vehicleID and num_veh to object dtype, and birth_year / age to
# 64-bit integers; all other columns keep their inferred dtype.
data = data.astype(dict(
    AccID='object',
    vehicleID='object',
    num_veh='object',
    birth_year='int64',
    age='int64',
))

In [5]:
# Keep only the float64 / int64 columns, then standardize them: the scaler is
# fitted on the full numeric frame and applied to that same frame.
numeric_data = data.select_dtypes(include=['float64', 'int64'])
scaler = StandardScaler().fit(numeric_data)
scaled_data = scaler.transform(numeric_data)

In [6]:
# A fractional n_components asks PCA to keep as many leading components as are
# needed to explain that share of the variance (here 95%).
pca = PCA(n_components=0.95)
principal_components = pca.fit_transform(scaled_data)

# Wrap the projected array in a DataFrame; columns are the component indices.
pca_data = pd.DataFrame(principal_components)

In [7]:
# First the share of variance captured by each retained component, then the
# first rows of the reduced dataset (one printed after the other).
print(
    pca.explained_variance_ratio_,
    pca_data.head(),
    sep='\n',
)


[0.07597163 0.06332288 0.05380105 0.05131074 0.04534719 0.03683819
 0.03397149 0.03304992 0.03157556 0.02884636 0.02824769 0.02763357
 0.02725895 0.02703871 0.02649284 0.02637093 0.02604286 0.02594794
 0.02576185 0.02482147 0.0242146  0.02368815 0.02323048 0.02255537
 0.02105639 0.02098975 0.02034115 0.01979569 0.01923447 0.01783853
 0.01614197 0.01531973]
         0         1         2         3         4         5         6   \
0  3.014244  1.789851 -1.207339 -1.167509 -2.533176  2.213368 -2.073746   
1  3.512119  0.875707 -0.812406 -1.105911 -1.586161  2.261583 -2.013048   
2  3.830175 -0.097962  1.127285 -0.398087 -0.388565  2.841961 -2.202595   
3  1.666309  2.657365 -0.079354  0.186165  1.669973 -2.270644  1.536085   
4  4.327757  0.966177 -0.703562 -0.934391 -0.720471  1.158960 -0.838598   

         7         8         9   ...        22        23        24        25  \
0  1.571086  0.730348  1.788258  ...  0.671986 -0.791710  0.828749 -0.387187   
1  1.462647  0.942513  1.76736

In [8]:
# Number of rows in the PCA-reduced frame
len(pca_data)

469719

In [9]:
# Pairwise Pearson correlation (the pandas default) between all numeric columns
correlation_matrix = numeric_data.corr(method='pearson')
correlation_matrix

Unnamed: 0,day,month,year,lum,dep_code,location,int,atm_condition,collision_type,lat,...,manv,motor,seat,user_category,gravity,gender,birth_year,reason_travel,safety_equipment1,age
day,1.0,-0.01792,-0.001389,-0.00584,-0.005361,-0.009665,0.001221,-0.01434,-0.004989,0.005745,...,-0.001359,0.002927,-0.003186,-0.001682,-0.000343,-0.00049,0.001232,0.008354,0.0006,-0.001319
month,-0.01792,1.0,-0.005889,0.072325,0.000655,-0.016758,-0.002591,0.028078,-0.011171,-0.000133,...,-0.010411,0.00256,-0.00242,-0.000179,0.000132,0.004051,0.009984,-0.013786,-0.012052,-0.010364
year,-0.001389,-0.005889,1.0,-0.009764,0.012692,-0.006328,0.007226,-0.007476,-0.026878,-0.006615,...,-0.012673,0.043142,-0.016286,-0.016459,-0.007661,-0.0026,0.068253,-0.007837,-0.043073,-0.007922
lum,-0.00584,0.072325,-0.009764,1.0,0.016985,0.10786,0.023783,0.009205,0.04828,0.004239,...,-0.000298,-0.037455,0.010386,0.034694,0.030824,-0.051682,0.135715,0.016231,0.033498,-0.136619
dep_code,-0.005361,0.000655,0.012692,0.016985,1.0,-0.009075,0.015508,-0.041294,-0.008585,-0.903389,...,0.0246,-0.011582,0.005947,0.008495,-0.003827,-0.008146,0.025225,-0.028301,0.002939,-0.024515
location,-0.009665,-0.016758,-0.006328,0.10786,-0.009075,1.0,0.179311,-0.02169,0.043649,0.032413,...,0.104537,0.09253,0.111675,0.0837,-0.016339,0.004589,-0.005603,-0.035594,0.094907,0.005234
int,0.001221,-0.002591,0.007226,0.023783,0.015508,0.179311,1.0,0.014001,-0.037075,-0.006626,...,0.084976,0.055136,-0.003412,-0.007353,-0.013759,0.003922,-0.025612,0.005188,0.029689,0.026108
atm_condition,-0.01434,0.028078,-0.007476,0.009205,-0.041294,-0.02169,0.014001,1.0,0.027021,0.04779,...,-0.003107,-0.002749,0.024942,0.021236,0.001371,0.013609,-0.015572,-0.001195,-0.000702,0.015156
collision_type,-0.004989,-0.011171,-0.026878,0.04828,-0.008585,0.043649,-0.037075,0.027021,1.0,0.016632,...,-0.001292,-0.014481,0.300536,0.267286,0.062948,0.019904,-0.017767,0.037686,0.033849,0.016182
lat,0.005745,-0.000133,-0.006615,0.004239,-0.903389,0.032413,-0.006626,0.04779,0.016632,1.0,...,-0.016045,0.019841,-0.003543,-0.008098,0.00395,0.000968,-0.014847,0.016422,0.002884,0.014481


In [10]:
# Absolute-correlation cutoff above which a feature is flagged as redundant
threshold = 0.9

# Walk the strict lower triangle of the matrix row by row: a feature is flagged
# when its absolute correlation with any feature that precedes it in column
# order exceeds the cutoff, so the later member of each pair is the one flagged.
corr_columns = correlation_matrix.columns
correlated_features = {
    corr_columns[row]
    for row in range(len(corr_columns))
    if correlation_matrix.iloc[row, :row].abs().gt(threshold).any()
}

print(f"Highly correlated features: {correlated_features}")


Highly correlated features: {'lat', 'age'}


In [11]:
# Build the reduced frame by removing every flagged column from the full
# dataset (not only from the numeric subset), then preview its first rows.
data_reduced = data.drop(labels=list(correlated_features), axis='columns')
print(data_reduced.head())


          AccID  day  month  year   time  lum  dep_code com_code  location  \
0  201900000001   30     11  2019  01:30    4      93.0    93053         1   
1  201900000001   30     11  2019  01:30    4      93.0    93053         1   
2  201900000001   30     11  2019  01:30    4      93.0    93053         1   
3  201900000002   30     11  2019  02:50    3      93.0    93066         1   
4  201900000003   28     11  2019  15:15    1      92.0    92036         1   

   int  ...  initial_impact_point  manv motor  seat  user_category gravity  \
0    1  ...                     5    23     1     2              2       4   
1    1  ...                     5    23     1     1              1       4   
2    1  ...                     3    11     1     1              1       1   
3    1  ...                     1     0     1     1              1       4   
4    1  ...                     1     2     1     1              1       1   

   gender  birth_year  reason_travel  safety_equipment1  
0   

In [12]:
# Number of rows in the reduced frame
len(data_reduced)

469719

In [13]:
# Add an 'age' column to the reduced frame, derived as year minus birth_year
data_reduced['age'] = data_reduced['year'].sub(data_reduced['birth_year'])

In [14]:
# Persist the reduced frame as CSV in the working directory, without the
# row-index column.
data_reduced.to_csv(
    path_or_buf='reduced_dataset.csv',
    index=False,
)

In [15]:
# Print the column names, non-null counts and dtypes of the reduced frame
data_reduced.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 469719 entries, 0 to 469718
Data columns (total 45 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   AccID                       469719 non-null  object 
 1   day                         469719 non-null  int64  
 2   month                       469719 non-null  int64  
 3   year                        469719 non-null  int64  
 4   time                        469719 non-null  object 
 5   lum                         469719 non-null  int64  
 6   dep_code                    469719 non-null  float64
 7   com_code                    469719 non-null  object 
 8   location                    469719 non-null  int64  
 9   int                         469719 non-null  int64  
 10  atm_condition               469719 non-null  int64  
 11  collision_type              469719 non-null  int64  
 12  address                     469719 non-null  object 
 13  long          

In [None]:
# Columns stored as object dtype so that later cells select them through
# select_dtypes(include=['object']).
object_columns = [
    'AccID', 'lum', 'dep_code', 'com_code', 'location', 'int',
    'atm_condition', 'collision_type', 'address', 'route_category',
    'route_number', 'route_number_index1', 'traffic_regime',
    'reserved_lane_code', 'longitudinal_profile', 'plan',
    'surface_condition', 'infra', 'accident_situation', 'vehicleID',
    'num_veh', 'traffic_direction', 'vehicle_category', 'fixed_obstacle',
    'mobile_obstacle', 'initial_impact_point', 'manv', 'motor', 'seat',
    'user_category', 'gravity', 'gender', 'reason_travel',
    'safety_equipment1',
]

# Apply all conversions in one astype call: the columns above become object,
# while birth_year and age are forced to 64-bit integers.
data = data.astype({
    **dict.fromkeys(object_columns, 'object'),
    'birth_year': 'int64',
    'age': 'int64',
})

In [None]:
# Names of every int64 / float64 column; these are the ones normalized below
numeric_columns = data.select_dtypes(include=['int64', 'float64']).columns.tolist()

# Rescale each numeric column with MinMaxScaler and keep the result as a
# labelled DataFrame.
min_max_scaler = MinMaxScaler()
normalized_data_all = min_max_scaler.fit_transform(data[numeric_columns])
normalized_df_all = pd.DataFrame(normalized_data_all, columns=numeric_columns)

# One histogram per numeric column, laid out on a four-column grid
num_columns = len(numeric_columns)
grid_rows = (num_columns // 4) + 1
fig = plt.figure(figsize=(18, 15))

for position, column in enumerate(numeric_columns, start=1):
    ax = fig.add_subplot(grid_rows, 4, position)
    ax.hist(normalized_df_all[column], bins=50, color='purple', alpha=0.7)
    ax.set_title(f'Distribution of {column} (Normalized)')
    ax.set_xlabel(column)
    ax.set_ylabel('Frequency')

fig.tight_layout()
plt.show()


In [None]:
# Every object-dtype column is treated as categorical
categorical_columns = data.select_dtypes(include=['object']).columns

# p-value of a chi-square independence test between 'gravity' and each other
# categorical column, keyed by column name (element 1 of the test result).
chi_square_results_gravity = {
    column: chi2_contingency(pd.crosstab(data['gravity'], data[column]))[1]
    for column in categorical_columns
    if column != 'gravity'
}

# Keep only associations significant at the 5% level, as (column, p-value)
# pairs ordered from smallest to largest p-value.
significant_chi_square_results_gravity = sorted(
    (
        (column, p_value)
        for column, p_value in chi_square_results_gravity.items()
        if p_value < 0.05
    ),
    key=lambda pair: pair[1],
)

significant_chi_square_results_gravity



In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
import pandas as pd

# Features are every column except the target; the target is `gravity` cast to
# integers. X is left complete because later cells split it by dtype.
X = data.drop('gravity', axis=1)
y = data['gravity'].astype(int)  # Ensure gravity is in numeric form if necessary

# This baseline uses the numeric columns only. X also holds object columns
# (e.g. time, com_code, address), and passing those to StandardScaler raises
# ValueError because they cannot be converted to float.
X_baseline = X.select_dtypes(include=['int64', 'float64'])

# Hold out 20% of the rows for evaluation; random_state fixes the split
X_train, X_test, y_train, y_test = train_test_split(
    X_baseline, y, test_size=0.2, random_state=42
)

# Fit the scaler on the training rows only, then apply it to both splits
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Train Logistic Regression model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Feature importance: model.coef_ has one row per class for a multiclass
# target, so coef_[0] alone would describe a single class. Summarize each
# feature by its mean absolute coefficient across all rows instead.
importance = np.abs(model.coef_).mean(axis=0)

# Print the feature importance
for feature_name, v in zip(X_baseline.columns, importance):
    print(f'Feature: {feature_name}, Score: {v}')

# Evaluate model on the held-out rows
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))


In [None]:
# Partition the feature frame by dtype: int64 / float64 columns are treated as
# numerical, object columns as categorical.
X_numerical = X.select_dtypes(include=['int64', 'float64'])
X_categorical = X.select_dtypes(include=['object'])

# Column labels of each partition, reused when rebuilding DataFrames later
numerical_features = X_numerical.columns
categorical_features = X_categorical.columns


In [None]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the categorical features into a dense array.
# drop='first' removes one level per feature to avoid multicollinearity.
# The `sparse` keyword was renamed to `sparse_output` in scikit-learn 1.2 and
# removed in 1.4, so try the current name first and fall back for older
# releases, where the unknown keyword raises TypeError.
# NOTE(review): X_categorical appears to include high-cardinality columns
# (e.g. AccID, address, time); a dense encoding of those may be very large —
# confirm they should be encoded at all.
try:
    encoder = OneHotEncoder(sparse_output=False, drop='first')
except TypeError:
    encoder = OneHotEncoder(sparse=False, drop='first')
X_categorical_encoded = encoder.fit_transform(X_categorical)

# Convert encoded features back to a DataFrame with generated column names
X_categorical_encoded = pd.DataFrame(
    X_categorical_encoded,
    columns=encoder.get_feature_names_out(categorical_features),
)


In [None]:
from sklearn.preprocessing import StandardScaler

# Fit a fresh scaler on the numerical features, then rebuild a DataFrame from
# the standardized values under the original column labels.
scaler = StandardScaler().fit(X_numerical)
X_numerical_scaled = pd.DataFrame(
    scaler.transform(X_numerical),
    columns=numerical_features,
)


In [None]:
# Place the scaled numerical block and the one-hot encoded block side by side
# (column-wise concatenation, rows aligned on the index).
X_processed = pd.concat(
    objs=[X_numerical_scaled, X_categorical_encoded],
    axis='columns',
)


In [None]:
# Hold out 20% of the processed rows for evaluation; random_state fixes the split
X_train, X_test, y_train, y_test = train_test_split(
    X_processed, y, test_size=0.2, random_state=42
)

# Train Logistic Regression model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Feature importance: model.coef_ has one row per class for a multiclass
# target, so coef_[0] alone would describe a single class. Summarize each
# feature by its mean absolute coefficient across all rows instead.
importance = np.abs(model.coef_).mean(axis=0)

# Print the feature importance
for feature_name, v in zip(X_processed.columns, importance):
    print(f'Feature: {feature_name}, Score: {v}')

# Evaluate model on the held-out rows
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))
