# OverUnderHybridSampling-CARSDMG


In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Load the dataset that this cell bins and plots.
data = pd.read_csv('datawithTime.csv')

# Columns treated as prediction targets; none of them is used as a feature here.
target_columns = ['CARSDMG', 'CASKLD', 'CARSHZD', 'EVACUATE', 'EQPDMG', 'CASINJ', 'ACCDMG']

target_column = 'CARSDMG'

# Every remaining column counts as a feature.
feature_columns = list(filter(lambda col: col not in target_columns, data.columns))
print(feature_columns)

# Bin CARSDMG into three classes: (-inf, 0], (0, 3] and (3, inf).
damage_bins = [-float('inf'), 0, 3, float('inf')]
damage_labels = ['0', '1 - 3', '>3']
data['CARSDMG_categories'] = pd.cut(data['CARSDMG'], bins=damage_bins, labels=damage_labels)

# Show how many records fall into each class before any resampling.
category_counts = data['CARSDMG_categories'].value_counts()
category_counts.plot.bar(rot=0)
plt.title('Frequency of CARSDMG Categories')
plt.xlabel('CARSDMG Categories')
plt.ylabel('Frequency')
plt.show()

print("Number of Records in Each Category (initial state):")
print(category_counts)



In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
import matplotlib.pyplot as plt
import numpy as np

# Baseline "SoloTarget" model: predict the binned CARSDMG class from every
# other column in `data`, with no resampling.
target_column = 'CARSDMG_categories'

# The raw CARSDMG value is what the label was binned from, so it must not be
# a feature. errors='ignore' keeps this cell re-runnable: on a second
# execution the column is already gone and a plain drop would raise KeyError.
data = data.drop(columns=['CARSDMG'], errors='ignore')

feature_columns = [col for col in data.columns if col != target_column]

X = data[feature_columns]
y = data[target_column]

# Hold out 20% of the rows for evaluation; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

clf = RandomForestClassifier(random_state=42)

# Train the model on the training data
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

# Evaluate the model's performance
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

print("Accuracy:", accuracy)
print("Classification Report:")
print(report)

# Draw the confusion matrix as a heat map with the count written in each cell.
plt.figure(figsize=(8, 6))
plt.imshow(conf_matrix, interpolation='nearest', cmap=plt.cm.Blues)
plt.title('Confusion Matrix Normal SoloTarget')
plt.colorbar()
# Display names for the labels '0', '1 - 3', '>3', listed in their sorted
# order, which is the row/column order confusion_matrix uses by default.
classes = ['CARSDMG = 0', 'CARSDMG = 1-3', 'CARSDMG > 3']
tick_marks = np.arange(len(classes))
plt.xticks(tick_marks, classes, rotation=45)
plt.yticks(tick_marks, classes)

# Use white text on dark cells and black text on light cells.
thresh = conf_matrix.max() / 2.0
for i in range(conf_matrix.shape[0]):
    for j in range(conf_matrix.shape[1]):
        plt.text(j, i, format(conf_matrix[i, j], 'd'),
                 horizontalalignment="center",
                 color="white" if conf_matrix[i, j] > thresh else "black")

plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.tight_layout()
plt.show()


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
import matplotlib.pyplot as plt
import numpy as np

# Baseline "GroupTarget" model: all listed target columns are withheld from
# the features, and the binned CARSDMG class is the label. No resampling.
target_columns = ['CASKLD', 'EVACUATE', 'CARSHZD', 'CARSDMG_categories', 'EQPDMG', 'CASINJ', 'ACCDMG']

target_column = 'CARSDMG_categories'

# Keep the columns of `data`, in their original order, that are not targets.
feature_columns = data.columns[~data.columns.isin(target_columns)].tolist()

X = data[feature_columns]
y = data[target_column]

# 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the forest on the training part and predict the held-out part.
clf = RandomForestClassifier(random_state=42).fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Report the headline metrics.
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Classification Report:")
print(report)

conf_matrix = confusion_matrix(y_test, y_pred)

# Heat map of the confusion matrix with per-cell counts.
plt.figure(figsize=(8, 6))
plt.imshow(conf_matrix, interpolation='nearest', cmap=plt.cm.Blues)
plt.title('Confusion Matrix Normal GroupTarget')
plt.colorbar()
classes = ['CARSDMG = 0', 'CARSDMG = 1-3', 'CARSDMG > 3']
tick_marks = np.arange(len(classes))
plt.xticks(tick_marks, classes, rotation=45)
plt.yticks(tick_marks, classes)

# Cells darker than half the maximum count get white text, the rest black.
thresh = conf_matrix.max() / 2.0
for row, col in np.ndindex(*conf_matrix.shape):
    count = conf_matrix[row, col]
    text_color = "white" if count > thresh else "black"
    plt.text(col, row, f"{count:d}", horizontalalignment="center", color=text_color)

plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.tight_layout()
plt.show()


In [None]:
from imblearn.over_sampling import RandomOverSampler

# Random oversampling of the binned CARSDMG classes; the result is stored in
# `resampled_data` and its class frequencies are plotted and printed.
oversampler = RandomOverSampler(sampling_strategy='auto', random_state=42)

# Keep every non-label column so that code consuming `resampled_data` can
# choose its own feature subset. Selecting through a `feature_columns` list
# left over from another cell would silently drop columns (for example the
# other target columns that a SoloTarget model is meant to use).
# The raw CARSDMG value is excluded as well if it is still present, because
# the label is derived from it; errors='ignore' covers the case where it was
# already dropped.
X = data.drop(columns=['CARSDMG_categories', 'CARSDMG'], errors='ignore')
y = data['CARSDMG_categories']
X_resampled, y_resampled = oversampler.fit_resample(X, y)

# Re-attach the label so the resampled set is a single frame again.
resampled_data = pd.concat([X_resampled, y_resampled], axis=1)

resampled_category_counts = resampled_data['CARSDMG_categories'].value_counts()
resampled_category_counts.plot(kind='bar', rot=0)
plt.title('Frequency of CARSDMG Categories (After Oversampling)')
plt.xlabel('CARSDMG Categories')
plt.ylabel('Frequency')
plt.show()
print("Number of Records in Each Category (After Oversampling):")
print(resampled_category_counts)


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from imblearn.over_sampling import RandomOverSampler
import matplotlib.pyplot as plt
import numpy as np

# Oversampling "SoloTarget" model. Expects `data` to already contain the
# 'CARSDMG_categories' label column.
target_column = 'CARSDMG_categories'

# SoloTarget: every column except the label is a feature. The raw CARSDMG
# value is excluded too in case it is still present, since the label is
# derived from it.
feature_columns = [col for col in data.columns if col not in (target_column, 'CARSDMG')]

X = data[feature_columns]
y = data[target_column]

# Split BEFORE oversampling. Random oversampling duplicates rows, so
# resampling the whole set first puts copies of the same record in both the
# training and the test part and inflates every reported metric.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Balance only the training part; the test part keeps its original distribution.
X_train, y_train = RandomOverSampler(sampling_strategy='auto', random_state=42).fit_resample(X_train, y_train)

clf = RandomForestClassifier(random_state=42)

# Train the model on the training data
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

# Evaluate the model's performance
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
# Pin the label order to the classifier's classes so the matrix always has
# one row/column per class, even if a class is absent from the test part.
conf_matrix = confusion_matrix(y_test, y_pred, labels=clf.classes_)

print("Accuracy:", accuracy)
print("Classification Report:")
print(report)

plt.figure(figsize=(8, 6))
plt.imshow(conf_matrix, interpolation='nearest', cmap=plt.cm.Blues)
plt.title('Confusion Matrix Oversampling SoloTarget')
plt.colorbar()
# Display names for the labels '0', '1 - 3', '>3' in their sorted order.
classes = ['CARSDMG = 0', 'CARSDMG = 1-3', 'CARSDMG > 3']
tick_marks = np.arange(len(classes))
plt.xticks(tick_marks, classes, rotation=45)
plt.yticks(tick_marks, classes)

# White text on dark cells, black text on light cells.
thresh = conf_matrix.max() / 2.0
for i in range(conf_matrix.shape[0]):
    for j in range(conf_matrix.shape[1]):
        plt.text(j, i, format(conf_matrix[i, j], 'd'),
                 horizontalalignment="center",
                 color="white" if conf_matrix[i, j] > thresh else "black")

plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.tight_layout()
plt.show()


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from imblearn.over_sampling import RandomOverSampler
import matplotlib.pyplot as plt
import numpy as np

# Oversampling "GroupTarget" model. Expects `data` to already contain the
# 'CARSDMG_categories' label column.
target_columns = ['CASKLD', 'EVACUATE', 'CARSHZD', 'CARSDMG_categories', 'EQPDMG', 'CASINJ', 'ACCDMG']

target_column = 'CARSDMG_categories'

# GroupTarget: none of the target columns may be a feature. The raw CARSDMG
# value is excluded too in case it is still present, since the label is
# derived from it.
feature_columns = [col for col in data.columns if col not in target_columns and col != 'CARSDMG']

X = data[feature_columns]
y = data[target_column]

# Split BEFORE oversampling. Random oversampling duplicates rows, so
# resampling the whole set first puts copies of the same record in both the
# training and the test part and inflates every reported metric.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Balance only the training part; the test part keeps its original distribution.
X_train, y_train = RandomOverSampler(sampling_strategy='auto', random_state=42).fit_resample(X_train, y_train)

clf = RandomForestClassifier(random_state=42)

# Train the model on the training data
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

# Evaluate the model's performance
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Classification Report:")
print(report)

# Pin the label order to the classifier's classes so the matrix always has
# one row/column per class, even if a class is absent from the test part.
conf_matrix = confusion_matrix(y_test, y_pred, labels=clf.classes_)

plt.figure(figsize=(8, 6))
plt.imshow(conf_matrix, interpolation='nearest', cmap=plt.cm.Blues)
plt.title('Confusion Matrix Oversampling GroupTarget')
plt.colorbar()
# Display names for the labels '0', '1 - 3', '>3' in their sorted order.
classes = ['CARSDMG = 0', 'CARSDMG = 1-3', 'CARSDMG > 3']
tick_marks = np.arange(len(classes))
plt.xticks(tick_marks, classes, rotation=45)
plt.yticks(tick_marks, classes)

# White text on dark cells, black text on light cells.
thresh = conf_matrix.max() / 2.0
for i in range(conf_matrix.shape[0]):
    for j in range(conf_matrix.shape[1]):
        plt.text(j, i, format(conf_matrix[i, j], 'd'),
                 horizontalalignment="center",
                 color="white" if conf_matrix[i, j] > thresh else "black")

plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.tight_layout()
plt.show()


In [None]:

import pandas as pd
import matplotlib.pyplot as plt
from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# Undersampling "GroupTarget" experiment: reload the data, draw a fixed-size
# sample of each class without replacement, then train and evaluate.
data = pd.read_csv('datawithTime.csv')

target_columns = ['CASKLD', 'CARSDMG', 'CARSHZD', 'EVACUATE', 'EQPDMG', 'CASINJ', 'ACCDMG']

# Computed before the label column is added, so the label is never a feature.
feature_columns = [col for col in data.columns if col not in target_columns]

target_column = 'CARSDMG'
# Bin CARSDMG into three classes: (-inf, 0], (0, 3] and (3, inf).
data['CARSDMG_categories'] = pd.cut(data['CARSDMG'], bins=[-float('inf'), 0, 3, float('inf')], labels=['CARSDMG = 0', 'CARSDMG = 1-3', 'CARSDMG > 3'])

majority_class = 'CARSDMG = 0'

desired_samples_in_majority = 10000  # Number of samples for 'CARSDMG = 0'
desired_samples_in_minority_1or2 = 3000  # Number of samples for 'CARSDMG = 1-3'
desired_samples_in_minority_gt2 = 1479  # Number of samples for 'CARSDMG > 3'

majority_data = data[data['CARSDMG_categories'] == majority_class]
minority_data_1or2 = data[data['CARSDMG_categories'] == 'CARSDMG = 1-3']
minority_data_gt2 = data[data['CARSDMG_categories'] == 'CARSDMG > 3']

# Sampling without replacement cannot return more rows than the class has;
# resample() raises ValueError in that case. Cap each request at the class
# size so the cell still runs if the dataset has fewer rows than expected.
undersampled_majority = resample(
    majority_data,
    replace=False,
    n_samples=min(desired_samples_in_majority, len(majority_data)),
    random_state=42,
)

undersampled_minority_1or2 = resample(
    minority_data_1or2,
    replace=False,
    n_samples=min(desired_samples_in_minority_1or2, len(minority_data_1or2)),
    random_state=42,
)
undersampled_minority_gt2 = resample(
    minority_data_gt2,
    replace=False,
    n_samples=min(desired_samples_in_minority_gt2, len(minority_data_gt2)),
    random_state=42,
)

balanced_data = pd.concat([undersampled_majority, undersampled_minority_1or2, undersampled_minority_gt2])

category_counts_US = balanced_data['CARSDMG_categories'].value_counts()
category_counts_US.plot(kind='bar', rot=0)
plt.title('Frequency of CARSDMG Categories (After Undersampling)')
plt.xlabel('CARSDMG Categories')
plt.ylabel('Frequency')
plt.show()

X = balanced_data[feature_columns]
y = balanced_data['CARSDMG_categories']

# No row is duplicated by undersampling, so splitting afterwards does not
# put copies of a record on both sides of the split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

clf = RandomForestClassifier(random_state=42)

# Train the model on the training data
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

report = classification_report(y_test, y_pred)
print("\nClassification Report:")
print(report)

print("Number of Records in Each Category (After undersampling):")
print(category_counts_US)


In [None]:

# Confusion-matrix heat map for the classifier fitted in the preceding cell
# (reads `clf`, `y_test` and `y_pred` from it).
# Imported here because the preceding cell brings in neither name, so after a
# kernel restart this cell would otherwise fail with NameError.
import numpy as np
from sklearn.metrics import confusion_matrix

# Pin the label order to clf.classes_ so rows/columns line up with the tick
# labels below even if a class is missing from y_test and y_pred.
conf_matrix = confusion_matrix(y_test, y_pred, labels=clf.classes_)

plt.figure(figsize=(8, 6))
plt.imshow(conf_matrix, interpolation='nearest', cmap=plt.cm.Blues)
plt.title('Confusion Matrix Undersampling GroupTarget')
plt.colorbar()
classes = clf.classes_
tick_marks = np.arange(len(classes))
plt.xticks(tick_marks, classes, rotation=45)
plt.yticks(tick_marks, classes)

# White text on dark cells, black text on light cells.
thresh = conf_matrix.max() / 2.0
for i in range(conf_matrix.shape[0]):
    for j in range(conf_matrix.shape[1]):
        plt.text(j, i, format(conf_matrix[i, j], 'd'),
                 horizontalalignment="center",
                 color="white" if conf_matrix[i, j] > thresh else "black")

plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.tight_layout()
plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# Undersampling "SoloTarget" experiment: reload the data, draw a fixed-size
# sample of each class without replacement, then train and evaluate.
data = pd.read_csv('datawithTime.csv')

# Computed before the label column is added, so only the raw CARSDMG value
# and the label are withheld; every other column is a feature.
feature_columns = [col for col in data.columns if col != 'CARSDMG']

# Bin CARSDMG into three classes: (-inf, 0], (0, 3] and (3, inf).
data['CARSDMG_categories'] = pd.cut(data['CARSDMG'], bins=[-float('inf'), 0, 3, float('inf')], labels=['CARSDMG = 0', 'CARSDMG = 1-3', 'CARSDMG > 3'])

majority_class = 'CARSDMG = 0'

desired_samples_in_majority = 10000  # Number of samples for 'CARSDMG = 0'
desired_samples_in_minority_1or2 = 3000  # Number of samples for 'CARSDMG = 1-3'
desired_samples_in_minority_gt2 = 1479  # Number of samples for 'CARSDMG > 3'

majority_data = data[data['CARSDMG_categories'] == majority_class]
minority_data_1or2 = data[data['CARSDMG_categories'] == 'CARSDMG = 1-3']
minority_data_gt2 = data[data['CARSDMG_categories'] == 'CARSDMG > 3']

# Sampling without replacement cannot return more rows than the class has;
# resample() raises ValueError in that case. Cap each request at the class
# size so the cell still runs if the dataset has fewer rows than expected.
undersampled_majority = resample(
    majority_data,
    replace=False,
    n_samples=min(desired_samples_in_majority, len(majority_data)),
    random_state=42,
)

undersampled_minority_1or2 = resample(
    minority_data_1or2,
    replace=False,
    n_samples=min(desired_samples_in_minority_1or2, len(minority_data_1or2)),
    random_state=42,
)
undersampled_minority_gt2 = resample(
    minority_data_gt2,
    replace=False,
    n_samples=min(desired_samples_in_minority_gt2, len(minority_data_gt2)),
    random_state=42,
)

balanced_data = pd.concat([undersampled_majority, undersampled_minority_1or2, undersampled_minority_gt2])

category_counts_USsolo = balanced_data['CARSDMG_categories'].value_counts()
category_counts_USsolo.plot(kind='bar', rot=0)
plt.title('Frequency of CARSDMG Categories (After Undersampling)')
plt.xlabel('CARSDMG Categories')
plt.ylabel('Frequency')
plt.show()

X = balanced_data[feature_columns]
y = balanced_data['CARSDMG_categories']

# No row is duplicated by undersampling, so splitting afterwards does not
# put copies of a record on both sides of the split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

clf = RandomForestClassifier(random_state=42)

# Train the model on the training data
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

report = classification_report(y_test, y_pred)
print("\nClassification Report:")
print(report)

print("Number of Records in Each Category (After undersampling solo target):")
print(category_counts_USsolo)


In [None]:

# Confusion-matrix heat map for the classifier fitted in the preceding cell
# (reads `clf`, `y_test` and `y_pred` from it).
# Imported here because the preceding cell brings in neither name, so after a
# kernel restart this cell would otherwise fail with NameError.
import numpy as np
from sklearn.metrics import confusion_matrix

# Pin the label order to clf.classes_ so rows/columns line up with the tick
# labels below even if a class is missing from y_test and y_pred.
conf_matrix = confusion_matrix(y_test, y_pred, labels=clf.classes_)

plt.figure(figsize=(8, 6))
plt.imshow(conf_matrix, interpolation='nearest', cmap=plt.cm.Blues)
plt.title('Confusion Matrix Undersampling SoloTarget')
plt.colorbar()
classes = clf.classes_
tick_marks = np.arange(len(classes))
plt.xticks(tick_marks, classes, rotation=45)
plt.yticks(tick_marks, classes)

# White text on dark cells, black text on light cells.
thresh = conf_matrix.max() / 2.0
for i in range(conf_matrix.shape[0]):
    for j in range(conf_matrix.shape[1]):
        plt.text(j, i, format(conf_matrix[i, j], 'd'),
                 horizontalalignment="center",
                 color="white" if conf_matrix[i, j] > thresh else "black")

plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.tight_layout()
plt.show()


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# Hybrid-sampling "SoloTarget" experiment: undersample the majority class and
# oversample the two minority classes, then train and evaluate.
data = pd.read_csv('datawithTime.csv')

# Computed before the label column is added, so only the raw CARSDMG value
# and the label are withheld; every other column is a feature.
feature_columns = [col for col in data.columns if col != 'CARSDMG']

# Bin CARSDMG into three classes: (-inf, 0], (0, 3] and (3, inf).
data['CARSDMG_categories'] = pd.cut(data['CARSDMG'], bins=[-float('inf'), 0, 3, float('inf')], labels=['CARSDMG = 0', 'CARSDMG = 1-3', 'CARSDMG > 3'])

majority_class = 'CARSDMG = 0'

desired_samples_majority = 10000
desired_samples_minority_1or2 = 10000
desired_samples_minority_gt2 = 5000

# Split BEFORE resampling. The minority classes are drawn with replacement,
# so resampling first would put copies of the same record in both the
# training and the test part and inflate every reported metric.
# Rows whose label is missing (CARSDMG was NaN) are left out; the per-class
# selection below never picked them up either.
labelled_data = data[data['CARSDMG_categories'].notna()]
train_data, test_data = train_test_split(labelled_data, test_size=0.2, random_state=42)

majority_data = train_data[train_data['CARSDMG_categories'] == majority_class]
minority_data_1or2 = train_data[train_data['CARSDMG_categories'] == 'CARSDMG = 1-3']
minority_data_gt2 = train_data[train_data['CARSDMG_categories'] == 'CARSDMG > 3']

# Without replacement the request cannot exceed the class size (resample()
# raises ValueError), so cap it at the number of available training rows.
undersampled_majority = resample(
    majority_data,
    replace=False,
    n_samples=min(desired_samples_majority, len(majority_data)),
    random_state=42,
)

oversampled_minority_1or2 = resample(minority_data_1or2, replace=True, n_samples=desired_samples_minority_1or2, random_state=42)
oversampled_minority_gt2 = resample(minority_data_gt2, replace=True, n_samples=desired_samples_minority_gt2, random_state=42)

# The resampled frame is the training set; the plot and counts describe it.
balanced_data = pd.concat([undersampled_majority, oversampled_minority_1or2, oversampled_minority_gt2])

category_counts_combined = balanced_data['CARSDMG_categories'].value_counts()
category_counts_combined.plot(kind='bar', rot=0)
plt.title('Frequency of CARSDMG Categories (Hybrid Sampling)')
plt.xlabel('CARSDMG Categories')
plt.ylabel('Frequency')
plt.show()

X = balanced_data[feature_columns]
y = balanced_data['CARSDMG_categories']

# Train on the resampled rows, evaluate on the untouched hold-out rows.
X_train, y_train = X, y
X_test = test_data[feature_columns]
y_test = test_data['CARSDMG_categories']

clf = RandomForestClassifier(random_state=42)

# Train the model on the training data
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

report = classification_report(y_test, y_pred)
print("\nClassification Report:")
print(report)

print("Number of Records in Each Category (After Hybrid Sampling):")
print(category_counts_combined)

# Pin the label order to clf.classes_ so rows/columns line up with the tick
# labels below even if a class is missing from the hold-out rows.
conf_matrix = confusion_matrix(y_test, y_pred, labels=clf.classes_)

plt.figure(figsize=(8, 6))
plt.imshow(conf_matrix, interpolation='nearest', cmap=plt.cm.Blues)
plt.title('Confusion Matrix Hybridsampling SoloTarget')
plt.colorbar()
classes = clf.classes_
tick_marks = np.arange(len(classes))
plt.xticks(tick_marks, classes, rotation=45)
plt.yticks(tick_marks, classes)

# White text on dark cells, black text on light cells.
thresh = conf_matrix.max() / 2.0
for i in range(conf_matrix.shape[0]):
    for j in range(conf_matrix.shape[1]):
        plt.text(j, i, format(conf_matrix[i, j], 'd'),
                 horizontalalignment="center",
                 color="white" if conf_matrix[i, j] > thresh else "black")

plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.tight_layout()
plt.show()


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# Hybrid-sampling "GroupTarget" experiment: undersample the majority class
# and oversample the two minority classes, then train and evaluate.
data = pd.read_csv('datawithTime.csv')

target_columns = ['CASKLD', 'CARSDMG', 'CARSHZD', 'EVACUATE', 'EQPDMG', 'CASINJ', 'ACCDMG']

# Computed before the label column is added, so the label is never a feature.
feature_columns = [col for col in data.columns if col not in target_columns]

target_column = 'CARSDMG'

# Bin CARSDMG into three classes: (-inf, 0], (0, 3] and (3, inf).
data['CARSDMG_categories'] = pd.cut(data['CARSDMG'], bins=[-float('inf'), 0, 3, float('inf')], labels=['CARSDMG = 0', 'CARSDMG = 1-3', 'CARSDMG > 3'])

majority_class = 'CARSDMG = 0'

desired_samples_majority = 10000
desired_samples_minority_1or2 = 10000
desired_samples_minority_gt2 = 5000

# Split BEFORE resampling. The minority classes are drawn with replacement,
# so resampling first would put copies of the same record in both the
# training and the test part and inflate every reported metric.
# Rows whose label is missing (CARSDMG was NaN) are left out; the per-class
# selection below never picked them up either.
labelled_data = data[data['CARSDMG_categories'].notna()]
train_data, test_data = train_test_split(labelled_data, test_size=0.2, random_state=42)

majority_data = train_data[train_data['CARSDMG_categories'] == majority_class]
minority_data_1or2 = train_data[train_data['CARSDMG_categories'] == 'CARSDMG = 1-3']
minority_data_gt2 = train_data[train_data['CARSDMG_categories'] == 'CARSDMG > 3']

# Without replacement the request cannot exceed the class size (resample()
# raises ValueError), so cap it at the number of available training rows.
undersampled_majority = resample(
    majority_data,
    replace=False,
    n_samples=min(desired_samples_majority, len(majority_data)),
    random_state=42,
)

oversampled_minority_1or2 = resample(minority_data_1or2, replace=True, n_samples=desired_samples_minority_1or2, random_state=42)
oversampled_minority_gt2 = resample(minority_data_gt2, replace=True, n_samples=desired_samples_minority_gt2, random_state=42)

# The resampled frame is the training set; the plot and counts describe it.
balanced_data = pd.concat([undersampled_majority, oversampled_minority_1or2, oversampled_minority_gt2])

category_counts_combined = balanced_data['CARSDMG_categories'].value_counts()
category_counts_combined.plot(kind='bar', rot=0)
plt.title('Frequency of CARSDMG Categories (Combined Sampling)')
plt.xlabel('CARSDMG Categories')
plt.ylabel('Frequency')
plt.show()

X = balanced_data[feature_columns]
y = balanced_data['CARSDMG_categories']

# Train on the resampled rows, evaluate on the untouched hold-out rows.
X_train, y_train = X, y
X_test = test_data[feature_columns]
y_test = test_data['CARSDMG_categories']

clf = RandomForestClassifier(random_state=42)

# Train the model on the training data
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

report = classification_report(y_test, y_pred)
print("\nClassification Report:")
print(report)

print("Number of Records in Each Category (After combined sampling):")
print(category_counts_combined)

# Pin the label order to clf.classes_ so rows/columns line up with the tick
# labels below even if a class is missing from the hold-out rows.
conf_matrix = confusion_matrix(y_test, y_pred, labels=clf.classes_)

plt.figure(figsize=(8, 6))
plt.imshow(conf_matrix, interpolation='nearest', cmap=plt.cm.Blues)
plt.title('Confusion Matrix Hybridsampling GroupTarget')
plt.colorbar()
classes = clf.classes_
tick_marks = np.arange(len(classes))
plt.xticks(tick_marks, classes, rotation=45)
plt.yticks(tick_marks, classes)

# White text on dark cells, black text on light cells.
thresh = conf_matrix.max() / 2.0
for i in range(conf_matrix.shape[0]):
    for j in range(conf_matrix.shape[1]):
        plt.text(j, i, format(conf_matrix[i, j], 'd'),
                 horizontalalignment="center",
                 color="white" if conf_matrix[i, j] > thresh else "black")

plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.tight_layout()
plt.show()
