# TRKDMG-Linear


In [None]:
import warnings
from itertools import combinations

import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectKBest, f_classif, f_regression
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, GridSearchCV

# Load the dataset once. The original parsed the CSV twice just to compute
# `nrows = int(1 * len(...))`; the fraction is kept as an explicit knob so a
# leading subset of rows can still be used for quick experiments.
DATA_PATH = 'datawithTime.csv'
SAMPLE_FRACTION = 1.0  # 1.0 = use every row; e.g. 0.1 = first 10% of rows

data = pd.read_csv(DATA_PATH)
if SAMPLE_FRACTION < 1.0:
    # Same rows that `nrows=int(fraction * total_rows)` would have read.
    data = data.head(int(SAMPLE_FRACTION * len(data)))

# Column the regressions below are fitted to predict.
target_col = 'TRKDMG'

# Candidate predictor columns.
# NOTE(review): 'STATE ' carries a trailing space — presumably this matches the
# CSV header exactly; confirm before "fixing" it.
all_feature_cols = ['RAILROAD', 'YEAR', 'MONTH', 'DAY', 'CARS', 'STATE ', 'TEMP', 'VISIBLTY', 'WEATHER',
                    'TRNSPD', 'TONS', 'TYPEQ', 'TRKCLAS', 'TYPTRK', 'POSITON1',
                    'HEADEND1', 'LOADF1', 'EMPTYF1', 'CAUSE', 'ACCTRK',
                    'HIGHSPD', 'hour', 'minute']

# 70/30 train/test split; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(data[all_feature_cols], data[target_col],
                                                    test_size=0.3, random_state=42)

# Sweep over k: for each k keep the k best-scoring features (univariate
# F-test), fit ordinary least squares on them, and record held-out metrics.
param_grid = {'k': list(range(1, len(all_feature_cols) + 1))}  # 1 .. number of candidate features

results = []

for k_value in param_grid['k']:
    linear_reg = LinearRegression()

    # The target is fitted with a regressor and scored with regression
    # metrics, so rank features with the regression F-test. `f_classif`
    # would treat every distinct target value as a separate class.
    selector = SelectKBest(f_regression, k=k_value)

    # Fit the selector on the training split only, then apply the same
    # column mask to the test split to avoid leaking test information.
    X_train_selected = selector.fit_transform(X_train, y_train)
    X_test_selected = selector.transform(X_test)

    try:
        linear_reg.fit(X_train_selected, y_train)

        y_pred = linear_reg.predict(X_test_selected)

        mse = mean_squared_error(y_test, y_pred)
        rmse = np.sqrt(mse)
        mae = mean_absolute_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)
        mape = mean_absolute_percentage_error(y_test, y_pred) * 100
        # Share of predictions whose absolute error is at most 5% of the true
        # value. (The tolerance was previously 0.05 * number of samples, which
        # has nothing to do with a 5% relative error.)
        within_5 = np.abs(y_test - y_pred) <= 0.05 * np.abs(y_test)
        percent_within_5 = np.mean(within_5) * 100

        # Map the selector's column indices back to feature names.
        selected_feature_indices = selector.get_support(indices=True)
        selected_feature_names = [all_feature_cols[i] for i in selected_feature_indices]

        results.append({
            'k': k_value,
            'Selected Features': selected_feature_names,
            'Mean Squared Error': mse,
            'Root Mean Squared Error': rmse,
            'Mean Absolute Error': mae,
            'R-squared': r2,
            'Mean Absolute Percentage Error': mape,
            'Percent Within 5%': percent_within_5
        })
    except AttributeError as exc:
        # Still best-effort (this k is skipped), but no longer silent.
        warnings.warn(f"Skipping k={k_value}: {exc}")

# Exhaustive search over every 20-feature subset of the candidate columns:
# fit ordinary least squares on each subset and keep the one with the lowest
# held-out mean absolute error.
all_metrics_for_k_20 = []

feature_combinations = list(combinations(all_feature_cols, 20))

for features in feature_combinations:
    linear_reg = LinearRegression()

    # Train on exactly the columns of this combination. The previous version
    # re-ran SelectKBest(k=20) on the full matrix in every iteration, so each
    # combination produced the same model and the recorded feature names did
    # not describe what was actually fitted.
    feature_list = list(features)
    X_train_selected = X_train[feature_list]
    X_test_selected = X_test[feature_list]

    linear_reg.fit(X_train_selected, y_train)

    y_pred = linear_reg.predict(X_test_selected)

    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    mape = mean_absolute_percentage_error(y_test, y_pred) * 100
    # Share of predictions whose absolute error is at most 5% of the true
    # value (the tolerance used to be 0.05 * number of samples).
    within_5 = np.abs(y_test - y_pred) <= 0.05 * np.abs(y_test)
    percent_within_5 = np.mean(within_5) * 100

    all_metrics_for_k_20.append({
        'Selected Features': features,
        'Mean Squared Error': mse,
        'Root Mean Squared Error': rmse,
        'Mean Absolute Error': mae,
        'R-squared': r2,
        'Mean Absolute Percentage Error': mape,
        'Percent Within 5%': percent_within_5
    })

# Best subset = lowest mean absolute error on the test split.
optimal_result_for_k_20 = min(all_metrics_for_k_20, key=lambda x: x['Mean Absolute Error'])

print("Optimal Result for k=20:")
print("Selected Features:", optimal_result_for_k_20['Selected Features'])
print("Mean Squared Error:", optimal_result_for_k_20['Mean Squared Error'])
print("Root Mean Squared Error:", optimal_result_for_k_20['Root Mean Squared Error'])
print("Mean Absolute Error:", optimal_result_for_k_20['Mean Absolute Error'])
print("R-squared:", optimal_result_for_k_20['R-squared'])
print("Mean Absolute Percentage Error:", optimal_result_for_k_20['Mean Absolute Percentage Error'])
print("Percent Within 5%:", optimal_result_for_k_20['Percent Within 5%'])

