# FeatureSelection&Prediction_CARSDMG


In [None]:
import pandas as pd

# Summarise the records of a single state: print how many there are, then
# build one "typical" record from per-column means and medians.
STATE_COLUMN = 'STATE '  # the column is addressed with a trailing space
# NOTE(review): the messages below label this code as 'Illinois'; confirm
# that 12 is the Illinois code in this dataset's state encoding.
ILLINOIS_STATE_CODE = 12

data = pd.read_csv('datawithTime.csv')
print(data[STATE_COLUMN].unique())

# Build the row mask once and reuse it for both the subset and the count.
is_illinois = data[STATE_COLUMN] == ILLINOIS_STATE_CODE
illinois_data = data[is_illinois]
illinois_count = is_illinois.sum()

total_records = len(data)

print(f"Number of records from 'Illinois': {illinois_count}")
print(f"Total number of records: {total_records}")

# Columns summarised by their mean.
average_columns = ['CARS', 'TEMP', 'TRNSPD', 'TONS', 'POSITON1', 'HEADEND1', 'LOADF1', 'EMPTYF1', 'HIGHSPD', 'hour', 'minute']

# Columns summarised by their median.
median_columns = ['RAILROAD', 'YEAR', 'MONTH', 'DAY', 'VISIBLTY', 'WEATHER', 'TYPEQ', 'TRKCLAS', 'TYPTRK', 'CAUSE', 'ACCTRK']

averages = illinois_data[average_columns].mean()
medians = illinois_data[median_columns].median()

# mean() and median() return Series already labelled by column name, in the
# order requested, so the concatenation needs no relabelling (and transposing
# a Series would be a no-op).
new_reported_values = pd.concat([averages, medians])
new_reported_values[STATE_COLUMN] = ILLINOIS_STATE_CODE

print(new_reported_values)


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFECV
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import numpy as np

# Linear regression on CARSDMG with features chosen by cross-validated
# recursive feature elimination (RFECV).
data = pd.read_csv('datawithTime.csv')

feature_columns = ['CARS', 'TEMP', 'TRNSPD', 'TONS', 'POSITON1', 'HEADEND1', 'LOADF1', 'EMPTYF1', 'HIGHSPD', 'hour', 'minute', 'RAILROAD', 'YEAR', 'MONTH', 'DAY', 'STATE ', 'VISIBLTY', 'WEATHER', 'TYPEQ', 'TRKCLAS', 'TYPTRK', 'CAUSE', 'ACCTRK']
target_column = 'CARSDMG'

X = data[feature_columns]
y = data[target_column]

# Seeded 80/20 split so the held-out rows are the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Feature selection sees only the training rows; 5-fold CV scored by R^2.
selector = RFECV(LinearRegression(), cv=5, scoring='r2')
selector.fit(X_train, y_train)

# support_ is a boolean mask aligned with feature_columns.
selected_features = [name for name, keep in zip(feature_columns, selector.support_) if keep]

# Refit a fresh model on the selected columns only.
model = LinearRegression()
model.fit(X_train[selected_features], y_train)

y_pred = model.predict(X_test[selected_features])

r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mae = mean_absolute_error(y_test, y_pred)

# Adjusted R^2 on the test set: n = number of test rows, p = number of
# selected features.
num_predictors = len(selected_features)
adj_r2 = 1 - (1 - r2) * (len(y_test) - 1) / (len(y_test) - num_predictors - 1)

print(f"Selected features: {', '.join(selected_features)}")
print(f"R-squared (R2) score: {r2:.4f}")
print(f"Adjusted R-squared: {adj_r2:.4f}")
print(f"RMSE: {rmse:.4f}")
print(f"MAE: {mae:.4f}")
print(f"Total selected features: {len(selected_features)}")
print("Coefficients for each selected feature:")
for feature, coef in zip(selected_features, model.coef_):
    print(f"{feature}: {coef:.4f}")


In [None]:
# Predict CARSDMG for one hand-entered record.
# This cell defines neither `model` nor `selected_features`; it uses whatever
# an earlier executed cell left in the session.
# NOTE(review): the values look like the per-column means/medians printed by
# the state-summary cell - confirm they are still in sync with the data.
single_record_data = {
    'CARS': 1.951162,
    'TEMP': 52.498578,
    'TRNSPD': 8.257903,
    'TONS': 2514.395384,
    'POSITON1': 16.865864,
    'HEADEND1': 1.489045,
    'LOADF1': 24.867202,
    'EMPTYF1': 13.788761,
    'HIGHSPD': 10.625021,
    'hour': 11.650109,
    'minute': 25.866700,
    'RAILROAD': 260,
    'YEAR': 2009,
    'MONTH': 6,
    'DAY': 16,
    'VISIBLTY': 2,
    'WEATHER': 1,
    'TYPEQ': 5,
    'TRKCLAS': 1,
    'TYPTRK': 2,
    'CAUSE': 226,
    'ACCTRK': 2,
    'STATE ': 12,
}

# Keep the record as a one-row DataFrame restricted to the selected columns.
# The model was fitted on a DataFrame, so predicting from a bare ndarray
# would trigger scikit-learn's "X does not have valid feature names" warning
# and lose the column-name consistency check.
single_record = pd.DataFrame([single_record_data])[selected_features]

prediction = model.predict(single_record)

print(f"Predicted CARSDMG for the single 'Illinois' record: {prediction[0]:.4f}")



In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.feature_selection import RFECV
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import numpy as np

# Decision-tree regression on CARSDMG with RFECV feature selection.
data = pd.read_csv('datawithTime.csv')

feature_columns = ['CARS', 'TEMP', 'TRNSPD', 'TONS', 'POSITON1', 'HEADEND1', 'LOADF1', 'EMPTYF1', 'HIGHSPD', 'hour', 'minute', 'RAILROAD', 'YEAR', 'MONTH', 'DAY', 'STATE ', 'VISIBLTY', 'WEATHER', 'TYPEQ', 'TRKCLAS', 'TYPTRK', 'CAUSE', 'ACCTRK']
target_column = 'CARSDMG'

X = data[feature_columns]
y = data[target_column]

# Seeded 80/20 split so the held-out rows are the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Seed the estimator as well: without it the selected feature set and the
# reported metrics can differ between runs even though the split is fixed.
model = DecisionTreeRegressor(random_state=42)

# Feature selection sees only the training rows; 5-fold CV scored by R^2.
selector = RFECV(model, cv=5, scoring='r2')
selector.fit(X_train, y_train)

# support_ is a boolean mask aligned with feature_columns.
selected_features = [name for name, keep in zip(feature_columns, selector.support_) if keep]

# Refit the estimator on the selected columns only.
model.fit(X_train[selected_features], y_train)

y_pred = model.predict(X_test[selected_features])

r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mae = mean_absolute_error(y_test, y_pred)

# Adjusted R^2 on the test set: n = number of test rows, p = number of
# selected features.
num_predictors = len(selected_features)
adj_r2 = 1 - (1 - r2) * (len(y_test) - 1) / (len(y_test) - num_predictors - 1)

print(f"Selected features: {', '.join(selected_features)}")
print(f"R-squared (R2) score: {r2:.4f}")
print(f"Adjusted R-squared: {adj_r2:.4f}")
print(f"RMSE: {rmse:.4f}")
print(f"MAE: {mae:.4f}")
print(f"Total selected features: {len(selected_features)}")


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import RFECV
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import numpy as np

# Random-forest regression on CARSDMG with RFECV feature selection.
data = pd.read_csv('datawithTime.csv')

feature_columns = ['CARS', 'TEMP', 'TRNSPD', 'TONS', 'POSITON1', 'HEADEND1', 'LOADF1', 'EMPTYF1', 'HIGHSPD', 'hour', 'minute', 'RAILROAD', 'YEAR', 'MONTH', 'DAY', 'STATE ', 'VISIBLTY', 'WEATHER', 'TYPEQ', 'TRKCLAS', 'TYPTRK', 'CAUSE', 'ACCTRK']
target_column = 'CARSDMG'

X = data[feature_columns]
y = data[target_column]

# Seeded 80/20 split so the held-out rows are the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Seed the forest as well: without it the selected feature set and the
# reported metrics can differ between runs even though the split is fixed.
model = RandomForestRegressor(random_state=42)

# Feature selection sees only the training rows; 5-fold CV scored by R^2.
selector = RFECV(model, cv=5, scoring='r2')
selector.fit(X_train, y_train)

# support_ is a boolean mask aligned with feature_columns.
selected_features = [name for name, keep in zip(feature_columns, selector.support_) if keep]

# Refit the estimator on the selected columns only.
model.fit(X_train[selected_features], y_train)

y_pred = model.predict(X_test[selected_features])

# Each metric is computed and printed once, in the same order as the other
# model cells; the earlier version recomputed and re-printed R2/RMSE/MAE.
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mae = mean_absolute_error(y_test, y_pred)

# Adjusted R^2 on the test set: n = number of test rows, p = number of
# selected features.
num_predictors = len(selected_features)
adj_r2 = 1 - (1 - r2) * (len(y_test) - 1) / (len(y_test) - num_predictors - 1)

print(f"Selected features: {', '.join(selected_features)}")
print(f"R-squared (R2) score: {r2:.4f}")
print(f"Adjusted R-squared: {adj_r2:.4f}")
print(f"RMSE: {rmse:.4f}")
print(f"MAE: {mae:.4f}")
print(f"Total selected features: {len(selected_features)}")


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.feature_selection import RFECV
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import numpy as np

# Gradient-boosting regression on CARSDMG with RFECV feature selection.
data = pd.read_csv('datawithTime.csv')

feature_columns = ['CARS', 'TEMP', 'TRNSPD', 'TONS', 'POSITON1', 'HEADEND1', 'LOADF1', 'EMPTYF1', 'HIGHSPD', 'hour', 'minute', 'RAILROAD', 'YEAR', 'MONTH', 'DAY', 'STATE ', 'VISIBLTY', 'WEATHER', 'TYPEQ', 'TRKCLAS', 'TYPTRK', 'CAUSE', 'ACCTRK']
target_column = 'CARSDMG'

X = data[feature_columns]
y = data[target_column]

# Seeded 80/20 split so the held-out rows are the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Seed the estimator as well so the selected feature set and the reported
# metrics are reproducible between runs.
model = GradientBoostingRegressor(random_state=42)

# Feature selection sees only the training rows; 5-fold CV scored by R^2.
selector = RFECV(model, cv=5, scoring='r2')
selector.fit(X_train, y_train)

# support_ is a boolean mask aligned with feature_columns.
selected_features = [name for name, keep in zip(feature_columns, selector.support_) if keep]

# Refit the estimator on the selected columns only.
model.fit(X_train[selected_features], y_train)

y_pred = model.predict(X_test[selected_features])

r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mae = mean_absolute_error(y_test, y_pred)

# Adjusted R^2 on the test set: n = number of test rows, p = number of
# selected features.
num_predictors = len(selected_features)
adj_r2 = 1 - (1 - r2) * (len(y_test) - 1) / (len(y_test) - num_predictors - 1)

print(f"Selected features: {', '.join(selected_features)}")
print(f"R-squared (R2) score: {r2:.4f}")
print(f"Adjusted R-squared: {adj_r2:.4f}")
print(f"RMSE: {rmse:.4f}")
print(f"MAE: {mae:.4f}")
print(f"Total selected features: {len(selected_features)}")


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import RFECV
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import numpy as np
from sklearn.neighbors import KNeighborsRegressor

# K-nearest-neighbours regression on CARSDMG. KNeighborsRegressor is not used
# for the elimination step; a seeded random forest ranks the features inside
# RFECV, and KNN is then trained on the surviving columns.
data = pd.read_csv('datawithTime.csv')

feature_columns = ['CARS', 'TEMP', 'TRNSPD', 'TONS', 'POSITON1', 'HEADEND1', 'LOADF1', 'EMPTYF1', 'HIGHSPD', 'hour', 'minute', 'RAILROAD', 'YEAR', 'MONTH', 'DAY', 'STATE ', 'VISIBLTY', 'WEATHER', 'TYPEQ', 'TRKCLAS', 'TYPTRK', 'CAUSE', 'ACCTRK']
target_column = 'CARSDMG'

X, y = data[feature_columns], data[target_column]

# Seeded 80/20 split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Feature selection on the training rows; 5-fold CV scored by R^2.
ranking_forest = RandomForestRegressor(n_estimators=100, random_state=42)
selector = RFECV(ranking_forest, cv=5, scoring='r2')
selector.fit(X_train, y_train)

# Keep the column names whose entry in the boolean support_ mask is set.
selected_features = [name for name, keep in zip(feature_columns, selector.support_) if keep]

# NOTE(review): the selected columns are passed to KNN unscaled; confirm that
# distance computations on raw column magnitudes are intended.
model = KNeighborsRegressor(n_neighbors=10)
model.fit(X_train[selected_features], y_train)

y_pred = model.predict(X_test[selected_features])

# Held-out metrics.
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)

print(f"Selected features: {', '.join(selected_features)}")
print(f"R-squared (R2) score: {r2:.4f}")
print(f"RMSE: {rmse:.4f}")
print(f"MAE: {mae:.4f}")
print(f"Total selected features: {len(selected_features)}")


In [None]:
# Adjusted R^2 for the most recently evaluated model. `r2`, `y_test` and
# `selected_features` are not defined here; they come from an earlier cell.
num_predictors = len(selected_features)

n_test = len(y_test)
adj_r2 = 1 - (1 - r2) * (n_test - 1) / (n_test - num_predictors - 1)

print(f"Adjusted R-squared: {adj_r2:.4f}")
