# InitialPredictionTable


In [None]:

import pandas as pd
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import numpy as np
from sklearn.tree import DecisionTreeRegressor, export_graphviz
import graphviz

# Benchmark several regressors on the ACCDMG target and save their metrics.
#
# Steps: load the data, split into train/test, pick the k best features using
# the training rows only, fit each regressor, and write one row of metrics per
# (target, model) pair to 'regression_metrics_results.csv'.

# f_regression is the univariate F-test for continuous targets; f_classif
# (used previously) treats every distinct target value as a class label.
from sklearn.feature_selection import f_regression

# Read the file once; the previous nrows=int(1 * len(read_csv(...))) parsed the
# whole CSV twice only to request every row again.
data = pd.read_csv('datawithTime.csv')

target_cols = ['ACCDMG']

# NOTE(review): 'STATE ' carries a trailing space - presumably this matches the
# CSV header exactly; confirm before "fixing" it.
all_feature_cols = ['RAILROAD', 'YEAR', 'MONTH', 'DAY', 'CARS', 'STATE ', 'TEMP', 'VISIBLTY', 'WEATHER',
                    'TRNSPD', 'TONS', 'TYPEQ', 'TRKCLAS', 'TYPTRK', 'POSITON1',
                    'HEADEND1', 'LOADF1', 'EMPTYF1', 'CAUSE', 'ACCTRK',
                    'HIGHSPD', 'hour', 'minute']

results = []

for target in target_cols:
    y_single = data[target]

    # Split BEFORE selecting features so the held-out rows never influence
    # which columns are kept (avoids leaking test information into training).
    X_train_all, X_test_all, y_train, y_test = train_test_split(
        data[all_feature_cols], y_single, test_size=0.3, random_state=42
    )

    k = 10
    selector = SelectKBest(f_regression, k=k)
    X_train = selector.fit_transform(X_train_all, y_train)
    X_test = selector.transform(X_test_all)

    support_mask = selector.get_support()
    selected_features = [col for col, keep in zip(all_feature_cols, support_mask) if keep]
    relevancy_scores = selector.scores_[support_mask]

    # Rank the kept features by F-score, strongest first, for reporting.
    sorted_features_and_scores = sorted(
        zip(selected_features, relevancy_scores), key=lambda pair: pair[1], reverse=True
    )
    sorted_selected_features, sorted_relevancy_scores = zip(*sorted_features_and_scores)

    regression_models = {
        'Linear Regression': LinearRegression(),
        'Decision Tree Regressor': DecisionTreeRegressor(random_state=2000),
        'Random Forest Regressor': RandomForestRegressor(random_state=2000),
        'Gradient Boosting Regressor': GradientBoostingRegressor(random_state=2000),
        'K-Nearest Neighbors Regressor': KNeighborsRegressor()
    }

    # Plain float array of the true values, shared by every model's MAPE.
    y_true = y_test.to_numpy(dtype=float)
    nonzero_target = y_true != 0

    for model_name, model in regression_models.items():
        model.fit(X_train, y_train)

        y_pred = model.predict(X_test)
        r2 = r2_score(y_test, y_pred)
        mse = mean_squared_error(y_test, y_pred)
        rmse = np.sqrt(mse)
        mad = mean_absolute_error(y_test, y_pred)

        # Percentage error is undefined where the true value is 0; dividing by
        # it would turn the whole mean into inf/nan, so those rows are skipped.
        if nonzero_target.any():
            relative_error = (y_true[nonzero_target] - y_pred[nonzero_target]) / y_true[nonzero_target]
            mape = np.mean(np.abs(relative_error)) * 100
        else:
            mape = np.nan

        results.append({
            'Target': target,
            'Model': model_name,
            'R-squared': r2,
            'MSE': mse,
            'RMSE': rmse,
            'MAD': mad,
            'MAPE': mape,
            'Selected Features': ', '.join(sorted_selected_features),
            'Relevancy Scores': ', '.join(map(str, sorted_relevancy_scores))
        })

results_df = pd.DataFrame(results)

results_df.to_csv('regression_metrics_results.csv', index=False)
print("Results saved to 'regression_metrics_results.csv'")


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.feature_selection import SelectKBest, f_classif

# Fit a linear regression for CASINJ on the k best univariate features and
# print the evaluation metrics together with the coefficient magnitudes.

# Local imports so this cell stays self-contained:
# - f_regression scores features against a continuous target (f_classif is
#   the classification variant).
# - Pipeline lets GridSearchCV tune the selector together with a model that
#   can actually predict.
from sklearn.feature_selection import f_regression
from sklearn.pipeline import Pipeline

# Read the file once; the previous nrows=int(1 * len(read_csv(...))) parsed the
# whole CSV twice only to request every row again.
data = pd.read_csv('datawithTime.csv')

target_cols = ['CASINJ']

# NOTE(review): 'STATE ' carries a trailing space - presumably this matches the
# CSV header exactly; confirm before "fixing" it.
all_feature_cols = ['RAILROAD', 'YEAR', 'MONTH', 'DAY', 'CARS', 'STATE ', 'TEMP', 'VISIBLTY', 'WEATHER',
                    'TRNSPD', 'TONS', 'TYPEQ', 'TRKCLAS', 'TYPTRK', 'POSITON1',
                    'HEADEND1', 'LOADF1', 'EMPTYF1', 'CAUSE', 'ACCTRK',
                    'HIGHSPD', 'hour', 'minute']

results = {}

for target in target_cols:
    y_single = data[target]
    X = data[all_feature_cols]

    X_train, X_test, y_train, y_test = train_test_split(X, y_single, test_size=0.3, random_state=42)

    # A bare SelectKBest has no predict(), so GridSearchCV could not compute
    # 'neg_mean_absolute_error' for it. Wrapping selector + regressor in one
    # pipeline makes each candidate k scoreable by cross-validated MAE.
    pipeline = Pipeline([
        ('select', SelectKBest(f_regression)),
        ('regress', LinearRegression()),
    ])
    param_grid = {'select__k': [10]}

    grid_search = GridSearchCV(pipeline, param_grid, scoring='neg_mean_absolute_error', cv=5)
    grid_search.fit(X_train, y_train)

    # With the default refit=True the best pipeline is refitted on all of
    # X_train, so its steps are ready for prediction and inspection.
    best_pipeline = grid_search.best_estimator_
    best_selector = best_pipeline.named_steps['select']
    linear_reg = best_pipeline.named_steps['regress']

    selected_feature_indices = best_selector.get_support(indices=True)
    selected_feature_names = [all_feature_cols[i] for i in selected_feature_indices]

    y_pred = best_pipeline.predict(X_test)

    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test, y_pred)
    # NOTE(review): sklearn's MAPE divides by max(|y_true|, eps), so rows whose
    # true value is 0 can make this figure extremely large.
    mape = mean_absolute_percentage_error(y_test, y_pred) * 100

    # Share of predictions whose absolute error is at most 5% of the true
    # value. (The tolerance used to be 5% of the number of test rows, which is
    # unrelated to the target's scale.) Rows with a true value of 0 only count
    # when the prediction is exact.
    y_true = y_test.to_numpy(dtype=float)
    within_tolerance = np.abs(y_true - y_pred) <= 0.05 * np.abs(y_true)
    percent_within_5 = np.mean(within_tolerance) * 100

    r2 = r2_score(y_test, y_pred)

    # Coefficient magnitude as a rough importance measure; features are not
    # standardised here, so magnitudes are scale-dependent.
    importance_scores = np.abs(linear_reg.coef_)
    feature_importance = dict(zip(selected_feature_names, importance_scores))
    sorted_feature_importance = dict(
        sorted(feature_importance.items(), key=lambda item: item[1], reverse=True)
    )

    # Drop the pipeline step prefix ('select__k' -> 'k') so the report keeps
    # the plain parameter name.
    best_params = {name.split('__', 1)[-1]: value for name, value in grid_search.best_params_.items()}

    results[target] = {
        'Best Parameters': best_params,
        'Selected Features': selected_feature_names,
        'Feature Importance': sorted_feature_importance,
        'Mean Squared Error': mse,
        'Root Mean Squared Error': rmse,
        'Mean Absolute Error': mae,
        'Mean Absolute Percentage Error': mape,
        'Percent Within 5%': percent_within_5,
        'R-squared': r2
    }

for target, metrics in results.items():
    print(f"Target: {target}")
    for metric, value in metrics.items():
        if metric == 'Feature Importance':
            # Nested dict: print one indented line per feature.
            print(f"{metric}:")
            for feature, importance in value.items():
                print(f"   {feature}: {importance}")
        else:
            print(f"{metric}: {value}")
    print("----------------------")
