# OverUnderSampling+Prediction_EVACUATE


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import numpy as np

# Load the dataset; the CSV path is relative to the working directory.
data = pd.read_csv('datawithTime.csv')

# Predictor columns and the regression target.
# NOTE(review): 'STATE ' carries a trailing space — presumably it matches the
# CSV header exactly; confirm before "fixing" it.
feature_columns = [
    'CARS', 'TEMP', 'TRNSPD', 'TONS', 'POSITON1', 'HEADEND1', 'LOADF1',
    'EMPTYF1', 'HIGHSPD', 'hour', 'minute', 'RAILROAD', 'YEAR', 'MONTH',
    'DAY', 'STATE ', 'VISIBLTY', 'WEATHER', 'TYPEQ', 'TRKCLAS', 'TYPTRK',
    'CAUSE', 'ACCTRK',
]
target_column = 'EVACUATE'

X = data.loc[:, feature_columns]
y = data.loc[:, target_column]

# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# One instance of every regressor under comparison, each constructed with its
# default arguments, unpacked into the individual names used below.
(
    linear_reg,
    decision_tree_reg,
    random_forest_reg,
    gradient_boosting_reg,
    k_neighbors_reg,
) = (
    LinearRegression(),
    DecisionTreeRegressor(),
    RandomForestRegressor(),
    GradientBoostingRegressor(),
    KNeighborsRegressor(),
)

# `models` and `model_names` are parallel lists: position i of one describes
# position i of the other.
models = [
    linear_reg,
    decision_tree_reg,
    random_forest_reg,
    gradient_boosting_reg,
    k_neighbors_reg,
]
model_names = [
    'Linear Regression',
    'Decision Tree Regression',
    'Random Forest Regression',
    'Gradient Boosting Regression',
    'K-Nearest Neighbors Regression',
]

def _rmse(y_true, y_pred):
    """Return the root mean squared error.

    `mean_squared_error` yields the MSE; the square root is taken here so the
    value reported under the 'RMSE' key really is an RMSE.
    """
    return np.sqrt(mean_squared_error(y_true, y_pred))


def _mape(y_true, y_pred):
    """Return the mean absolute percentage error over rows with a non-zero target.

    The percentage error is undefined where the true value is 0, and the
    target can be 0 (the file bins an 'EVACUATE=0' category), so those rows
    are left out instead of producing inf/nan. Returns nan when every true
    value is 0.
    """
    actual = np.asarray(y_true, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)
    nonzero = actual != 0
    if not nonzero.any():
        return float('nan')
    relative_error = (actual[nonzero] - predicted[nonzero]) / actual[nonzero]
    return np.mean(np.abs(relative_error)) * 100


# Display name -> callable(y_true, y_pred) returning a single score.
metrics = {
    'R^2': r2_score,
    'RMSE': _rmse,
    'MAE': mean_absolute_error,
    'MAPE': _mape,
}

# Model display name -> {metric name -> score on the held-out split}.
accuracy_report = {}

for model, model_name in zip(models, model_names):
    print(f"Model: {model_name}")

    # Fit on the training split, then predict the held-out split.
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Every metric is called the same way for every model; the former
    # LinearRegression/'R^2' special case had two identical arms and was
    # removed.
    model_accuracy = {}
    for metric_name, metric_func in metrics.items():
        score = metric_func(y_test, y_pred)
        model_accuracy[metric_name] = score
        print(f"{metric_name}: {score:.2f}")

    accuracy_report[model_name] = model_accuracy


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor

# Reload the dataset so this cell can run on its own.
data = pd.read_csv('datawithTime.csv')

# Predictor columns and the regression target.
feature_columns = [
    'CARS', 'TEMP', 'TRNSPD', 'TONS', 'POSITON1', 'HEADEND1', 'LOADF1',
    'EMPTYF1', 'HIGHSPD', 'hour', 'minute', 'RAILROAD', 'YEAR', 'MONTH',
    'DAY', 'STATE ', 'VISIBLTY', 'WEATHER', 'TYPEQ', 'TRKCLAS', 'TYPTRK',
    'CAUSE', 'ACCTRK',
]
target_column = 'EVACUATE'

X = data.loc[:, feature_columns]
y = data.loc[:, target_column]

# 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Regressors constructed with their default arguments.
linear_reg, decision_tree_reg, random_forest_reg = (
    LinearRegression(),
    DecisionTreeRegressor(),
    RandomForestRegressor(),
)
gradient_boosting_reg, k_neighbors_reg = (
    GradientBoostingRegressor(),
    KNeighborsRegressor(),
)

# Parallel lists: estimator objects and the label printed for each.
models = [
    linear_reg,
    decision_tree_reg,
    random_forest_reg,
    gradient_boosting_reg,
    k_neighbors_reg,
]
model_names = [
    'Linear Regression',
    'Decision Tree Regression',
    'Random Forest Regression',
    'Gradient Boosting Regression',
    'K-Nearest Neighbors Regression',
]

# Print the constructor parameters of every model. Only the linear model is
# fitted here, because its coefficients and intercept exist only after fit().
for model_name, model in zip(model_names, models):
    print(f"Model: {model_name}")

    if isinstance(model, LinearRegression):
        model.fit(X_train, y_train)
        slope, intercept = model.coef_, model.intercept_
        print(f"Slope: {slope}")
        print(f"Intercept: {intercept}")

    params = model.get_params()
    for param_name in params:
        param_value = params[param_name]
        print(f"{param_name}: {param_value}")

    print("\n")


In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Reload the dataset for the class-balance inspection.
data = pd.read_csv('datawithTime.csv')

feature_columns = [
    'CARS', 'TEMP', 'TRNSPD', 'TONS', 'POSITON1', 'HEADEND1', 'LOADF1',
    'EMPTYF1', 'HIGHSPD', 'hour', 'minute', 'RAILROAD', 'YEAR', 'MONTH',
    'DAY', 'STATE ', 'VISIBLTY', 'WEATHER', 'TYPEQ', 'TRKCLAS', 'TYPTRK',
    'CAUSE', 'ACCTRK',
]

# Label each row by the range its EVACUATE value falls in. pd.cut closes
# intervals on the right by default, so the edges give (-inf, 0], (0, 200]
# and (200, inf).
data['EVACUATE_categories'] = pd.cut(
    data['EVACUATE'],
    bins=[-float('inf'), 0, 200, float('inf')],
    labels=['EVACUATE=0', '0<EVACUATE<=200', 'EVACUATE>200'],
)

# Rows per category, drawn as a bar chart and then printed.
category_counts = data['EVACUATE_categories'].value_counts()
category_counts.plot.bar(rot=0)
plt.title('Frequency of EVACUATE Categories')
plt.xlabel('EVACUATE Categories')
plt.ylabel('Frequency')
plt.show()

print("Number of Records in Each Category (initial state):")
print(category_counts)



In [None]:
from imblearn.over_sampling import RandomOverSampler

# Randomly oversample using the EVACUATE category as the class label.
# NOTE(review): with sampling_strategy='auto' imbalanced-learn is documented
# to resample every class except the majority — confirm against the installed
# version.
oversampler = RandomOverSampler(sampling_strategy='auto', random_state=42)

# The numeric EVACUATE column travels with the features so it is resampled
# together with them and remains available as a regression target afterwards.
X = data[[*feature_columns, 'EVACUATE']]
y = data['EVACUATE_categories']
X_resampled, y_resampled = oversampler.fit_resample(X, y)

# Put the resampled features and their category labels back into one frame.
resampled_data = pd.concat((X_resampled, y_resampled), axis='columns')
print(resampled_data.columns)

# Rows per category after resampling, charted and printed.
resampled_category_counts = resampled_data['EVACUATE_categories'].value_counts()
resampled_category_counts.plot.bar(rot=0)
plt.title('Frequency of EVACUATE Categories (After Oversampling)')
plt.xlabel('EVACUATE Categories')
plt.ylabel('Frequency')
plt.show()

print("Number of Records in Each Category (After Oversampling):")
print(resampled_category_counts)


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import numpy as np

feature_columns = [
    'CARS', 'TEMP', 'TRNSPD', 'TONS', 'POSITON1', 'HEADEND1', 'LOADF1',
    'EMPTYF1', 'HIGHSPD', 'hour', 'minute', 'RAILROAD', 'YEAR', 'MONTH',
    'DAY', 'STATE ', 'VISIBLTY', 'WEATHER', 'TYPEQ', 'TRKCLAS', 'TYPTRK',
    'CAUSE', 'ACCTRK',
]
target_column = 'EVACUATE'

# Split the ORIGINAL rows first and oversample only the training portion.
# Random oversampling duplicates rows; splitting after oversampling would put
# copies of the same row on both sides of the split, so the test scores would
# reward memorising duplicates (data leakage) instead of measuring
# generalisation.
from imblearn.over_sampling import RandomOverSampler  # local import keeps this cell self-contained

train_rows, test_rows = train_test_split(data, test_size=0.2, random_state=42)

# Category labels for the training rows, using the same three ranges the file
# uses elsewhere to bin EVACUATE. Computed here so this cell only needs the
# raw EVACUATE column to be present in `data`.
train_categories = pd.cut(
    train_rows[target_column],
    bins=[-float('inf'), 0, 200, float('inf')],
    labels=['EVACUATE=0', '0<EVACUATE<=200', 'EVACUATE>200'],
)

# The numeric target is resampled together with the features so each
# duplicated row keeps its own EVACUATE value.
train_oversampler = RandomOverSampler(sampling_strategy='auto', random_state=42)
train_resampled, train_categories_resampled = train_oversampler.fit_resample(
    train_rows[feature_columns + [target_column]],
    train_categories,
)

# X / y describe the full, un-resampled dataset; the training split is the
# oversampled one and the test split is left untouched.
X = data[feature_columns]
y = data[target_column]
X_train = train_resampled[feature_columns]
y_train = train_resampled[target_column]
X_test = test_rows[feature_columns]
y_test = test_rows[target_column]

# Fresh, unfitted regressors (default arguments) for this experiment, keyed by
# the label printed for each.
_regressors_by_name = {
    'Linear Regression': LinearRegression(),
    'Decision Tree Regression': DecisionTreeRegressor(),
    'Random Forest Regression': RandomForestRegressor(),
    'Gradient Boosting Regression': GradientBoostingRegressor(),
    'K-Nearest Neighbors Regression': KNeighborsRegressor(),
}

# Dicts preserve insertion order, so the unpacking and both lists below follow
# the order written above.
(
    linear_reg,
    decision_tree_reg,
    random_forest_reg,
    gradient_boosting_reg,
    k_neighbors_reg,
) = _regressors_by_name.values()

models = list(_regressors_by_name.values())
model_names = list(_regressors_by_name)

def _rmse(y_true, y_pred):
    """Return the root mean squared error.

    `mean_squared_error` yields the MSE; the square root is taken here so the
    value reported under the 'RMSE' key really is an RMSE.
    """
    return np.sqrt(mean_squared_error(y_true, y_pred))


def _mape(y_true, y_pred):
    """Return the mean absolute percentage error over rows with a non-zero target.

    The percentage error is undefined where the true value is 0, and the
    target can be 0 (the file bins an 'EVACUATE=0' category), so those rows
    are left out instead of producing inf/nan. Returns nan when every true
    value is 0.
    """
    actual = np.asarray(y_true, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)
    nonzero = actual != 0
    if not nonzero.any():
        return float('nan')
    relative_error = (actual[nonzero] - predicted[nonzero]) / actual[nonzero]
    return np.mean(np.abs(relative_error)) * 100


# Display name -> callable(y_true, y_pred) returning a single score.
metrics = {
    'R^2': r2_score,
    'RMSE': _rmse,
    'MAE': mean_absolute_error,
    'MAPE': _mape,
}

# Model display name -> {metric name -> score on the held-out split}.
accuracy_report = {}

for model_name, model in zip(model_names, models):
    print(f"Model: {model_name}")

    # Train on the training split.
    model.fit(X_train, y_train)

    # Predict the held-out split and score it with every metric in turn.
    y_pred = model.predict(X_test)
    model_accuracy = {}
    for metric_name in metrics:
        metric_func = metrics[metric_name]
        score = metric_func(y_test, y_pred)
        model_accuracy[metric_name] = score
        print(f"{metric_name}: {score:.2f}")

    accuracy_report[model_name] = model_accuracy



In [None]:

import pandas as pd
import matplotlib.pyplot as plt
from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import numpy as np
# Start again from the raw CSV for the undersampling experiment.
data = pd.read_csv('datawithTime.csv')

# Predictor columns and the regression target.
feature_columns = [
    'CARS', 'TEMP', 'TRNSPD', 'TONS', 'POSITON1', 'HEADEND1', 'LOADF1',
    'EMPTYF1', 'HIGHSPD', 'hour', 'minute', 'RAILROAD', 'YEAR', 'MONTH',
    'DAY', 'STATE ', 'VISIBLTY', 'WEATHER', 'TYPEQ', 'TRKCLAS', 'TYPTRK',
    'CAUSE', 'ACCTRK',
]
target_column = 'EVACUATE'

# Label each row by the range its EVACUATE value falls in. pd.cut closes
# intervals on the right by default, so the edges give (-inf, 0], (0, 200]
# and (200, inf).
data['EVACUATE_categories'] = pd.cut(
    data['EVACUATE'],
    bins=[-float('inf'), 0, 200, float('inf')],
    labels=['EVACUATE=0', '0<EVACUATE<=200', 'EVACUATE>200'],
)

majority_class = 'EVACUATE=0'

# Requested number of rows to keep per category after undersampling.
desired_samples_in_majority = 1000  # rows kept for 'EVACUATE=0'
desired_samples_in_minority_1or2 = 313  # rows kept for '0<EVACUATE<=200'
desired_samples_in_minority_gt2 = 87  # rows kept for 'EVACUATE>200'

# Rows of each category.
majority_data = data[data['EVACUATE_categories'] == majority_class]
minority_data_1or2 = data[data['EVACUATE_categories'] == '0<EVACUATE<=200']
minority_data_gt2 = data[data['EVACUATE_categories'] == 'EVACUATE>200']


def _undersample(frame, n_samples, random_state=42):
    """Draw up to `n_samples` rows from `frame` without replacement.

    Sampling without replacement cannot return more rows than exist, and
    `resample` raises in that case; the request is therefore capped at
    `len(frame)` so a category that is smaller than requested is kept whole
    instead of aborting the run.
    """
    return resample(
        frame,
        replace=False,
        n_samples=min(n_samples, len(frame)),
        random_state=random_state,
    )


undersampled_majority = _undersample(majority_data, desired_samples_in_majority)

undersampled_minority_1or2 = _undersample(minority_data_1or2, desired_samples_in_minority_1or2)
undersampled_minority_gt2 = _undersample(minority_data_gt2, desired_samples_in_minority_gt2)

# Stack the three samples into the dataset used for modelling.
balanced_data = pd.concat([undersampled_majority, undersampled_minority_1or2, undersampled_minority_gt2])

# Rows per category after undersampling, shown as a bar chart.
category_counts_US = balanced_data['EVACUATE_categories'].value_counts()
category_counts_US.plot.bar(rot=0)
plt.title('Frequency of EVACUATE Categories (After Undersampling)')
plt.xlabel('EVACUATE Categories')
plt.ylabel('Frequency')
plt.show()

# Predictor columns and the regression target for the undersampled data.
feature_columns = [
    'CARS', 'TEMP', 'TRNSPD', 'TONS', 'POSITON1', 'HEADEND1', 'LOADF1',
    'EMPTYF1', 'HIGHSPD', 'hour', 'minute', 'RAILROAD', 'YEAR', 'MONTH',
    'DAY', 'STATE ', 'VISIBLTY', 'WEATHER', 'TYPEQ', 'TRKCLAS', 'TYPTRK',
    'CAUSE', 'ACCTRK',
]
target_column = 'EVACUATE'

X = balanced_data.loc[:, feature_columns]
y = balanced_data.loc[:, target_column]

# 80/20 split of the undersampled rows with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fresh, unfitted regressors (default arguments) for the undersampling
# experiment.
(
    linear_reg,
    decision_tree_reg,
    random_forest_reg,
    gradient_boosting_reg,
    k_neighbors_reg,
) = (
    LinearRegression(),
    DecisionTreeRegressor(),
    RandomForestRegressor(),
    GradientBoostingRegressor(),
    KNeighborsRegressor(),
)

# Parallel lists: estimator objects and the label printed for each.
models = [
    linear_reg,
    decision_tree_reg,
    random_forest_reg,
    gradient_boosting_reg,
    k_neighbors_reg,
]
model_names = [
    'Linear Regression',
    'Decision Tree Regression',
    'Random Forest Regression',
    'Gradient Boosting Regression',
    'K-Nearest Neighbors Regression',
]

def _rmse(y_true, y_pred):
    """Return the root mean squared error.

    `mean_squared_error` yields the MSE; the square root is taken here so the
    value reported under the 'RMSE' key really is an RMSE.
    """
    return np.sqrt(mean_squared_error(y_true, y_pred))


def _mape(y_true, y_pred):
    """Return the mean absolute percentage error over rows with a non-zero target.

    The percentage error is undefined where the true value is 0, and the
    target can be 0 (the file bins an 'EVACUATE=0' category), so those rows
    are left out instead of producing inf/nan. Returns nan when every true
    value is 0.
    """
    actual = np.asarray(y_true, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)
    nonzero = actual != 0
    if not nonzero.any():
        return float('nan')
    relative_error = (actual[nonzero] - predicted[nonzero]) / actual[nonzero]
    return np.mean(np.abs(relative_error)) * 100


# Display name -> callable(y_true, y_pred) returning a single score.
metrics = {
    'R^2': r2_score,
    'RMSE': _rmse,
    'MAE': mean_absolute_error,
    'MAPE': _mape,
}

# Model display name -> {metric name -> score on the held-out split}.
accuracy_report = {}

for model_name, model in zip(model_names, models):
    print(f"Model: {model_name}")

    # Train on the training split.
    model.fit(X_train, y_train)

    # Predict the held-out split and score it with every metric in turn.
    y_pred = model.predict(X_test)
    model_accuracy = {}
    for metric_name in metrics:
        metric_func = metrics[metric_name]
        score = metric_func(y_test, y_pred)
        model_accuracy[metric_name] = score
        print(f"{metric_name}: {score:.2f}")

    accuracy_report[model_name] = model_accuracy

