# opt-CARSHZD-linearReg


In [None]:
import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectKBest, f_classif, f_regression
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline

data = pd.read_csv('datawithTime.csv', nrows=int(1 * len(pd.read_csv('datawithTime.csv'))))

target_col = 'CARSHZD'

all_feature_cols = ['RAILROAD', 'YEAR', 'MONTH', 'DAY', 'CARS', 'STATE ', 'TEMP', 'VISIBLTY', 'WEATHER',
                    'TRNSPD', 'TONS', 'TYPEQ', 'TRKCLAS', 'TYPTRK', 'POSITON1',
                    'HEADEND1', 'LOADF1', 'EMPTYF1', 'CAUSE', 'ACCTRK',
                    'HIGHSPD', 'hour', 'minute']

X_train, X_test, y_train, y_test = train_test_split(data[all_feature_cols], data[target_col],
                                                    test_size=0.3, random_state=42)

param_grid = {'k': [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20]}

linear_reg = LinearRegression()

selector = SelectKBest(f_classif)

grid_search = GridSearchCV(selector, param_grid, scoring='neg_mean_absolute_error', cv=5)

grid_search.fit(X_train, y_train)

best_selector = grid_search.best_estimator_

X_train_selected = best_selector.transform(X_train)
X_test_selected = best_selector.transform(X_test)

linear_reg.fit(X_train_selected, y_train)

y_pred = linear_reg.predict(X_test_selected)

mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
mape = mean_absolute_percentage_error(y_test, y_pred) * 100
percent_within_5 = np.sum(np.abs(y_test - y_pred) <= 0.05 * y_test.size) / y_test.size * 100

print("Best Parameters:", grid_search.best_params_)
print("Mean Squared Error:", mse)
print("Root Mean Squared Error:", rmse)
print("Mean Absolute Error:", mae)
print("R-squared:", r2)
print("Mean Absolute Percentage Error:", mape)
print("Percent Within 5%:", percent_within_5)


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from itertools import combinations

data = pd.read_csv('datawithTime.csv', nrows=int(1 * len(pd.read_csv('datawithTime.csv'))))

target_col = 'CARSHZD'

all_feature_cols = ['RAILROAD', 'YEAR', 'MONTH', 'DAY', 'CARS', 'STATE ', 'TEMP', 'VISIBLTY', 'WEATHER',
                    'TRNSPD', 'TONS', 'TYPEQ', 'TRKCLAS', 'TYPTRK', 'POSITON1',
                    'HEADEND1', 'LOADF1', 'EMPTYF1', 'CAUSE', 'ACCTRK',
                    'HIGHSPD', 'hour', 'minute']

X_train, X_test, y_train, y_test = train_test_split(data[all_feature_cols], data[target_col],
                                                    test_size=0.3, random_state=42)

linear_reg = LinearRegression()

optimal_solutions = {}

for num_features in range(1, len(all_feature_cols) + 1):
    optimal_mae = float('inf')
    optimal_features = []

    feature_combinations = combinations(all_feature_cols, num_features)

    for feature_subset in feature_combinations:
        X_train_subset = X_train[list(feature_subset)]
        X_test_subset = X_test[list(feature_subset)]

        linear_reg.fit(X_train_subset, y_train)

        y_pred = linear_reg.predict(X_test_subset)

        mae = mean_absolute_error(y_test, y_pred)

        if mae < optimal_mae:
            optimal_mae = mae
            optimal_features = feature_subset

    optimal_solutions[num_features] = {"MAE": optimal_mae, "Selected Features": optimal_features}

for num_features, solution in optimal_solutions.items():
    print(f"Number of Features: {num_features}")
    print(f"Optimal MAE: {solution['MAE']}")
    print(f"Optimal Features: {solution['Selected Features']}")
    print()
