In [1]:
# linear regression model
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Load data
data = pd.read_csv('train_prices_combined.csv')

# Select features and target variable
features = ['train_number', 'basefare', 'reservationCharge', 'superfastCharge', 'dynamicFare', 'cateringCharge', 
            'fuelAmount', 'totalConcession', 'tatkalFare', 'serviceTax', 'otherCharge', 'distance', 'total_minutes',
            '1A', '2A', '2S', '3A', '3E', 'CC', 'SL']
target = 'totalFare'

# Split data into features and target variable
X = data[features]
y = data[target]

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create and train the model
model = LinearRegression()
model.fit(X_train, y_train)

# Make predictions on the testing set
y_pred = model.predict(X_test)

# Model evaluation
print('Coefficients: \n', model.coef_)
print('Mean squared error: %.2f' % mean_squared_error(y_test, y_pred))
print('Coefficient of determination (R^2): %.2f%%' % (r2_score(y_test, y_pred) * 100))


Coefficients: 
 [ 2.77555756e-16  1.00000000e+00  9.99328183e-01  1.00000000e+00
 -2.22044605e-16 -2.32452946e-16 -2.77555756e-17 -1.38777878e-17
  0.00000000e+00  1.00000000e+00  0.00000000e+00  0.00000000e+00
  1.56243392e-14  1.51158885e-02  8.39771582e-03 -1.51158885e-02
  1.67954316e-03  1.67954316e-03  0.00000000e+00 -1.17568021e-02]
Mean squared error: 0.00
Coefficient of determination (R^2): 100.00%


In [2]:
# decision tree model
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Load data
data = pd.read_csv('train_prices_combined.csv')

# Select features and target variable
features = ['train_number', 'basefare', 'reservationCharge', 'superfastCharge', 'dynamicFare', 'cateringCharge', 
            'fuelAmount', 'totalConcession', 'tatkalFare', 'serviceTax', 'otherCharge', 'distance', 'total_minutes',
            '1A', '2A', '2S', '3A', '3E', 'CC', 'SL']
target = 'totalFare'

# Split data into features and target variable
X = data[features]
y = data[target]

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create and train the model
model = DecisionTreeRegressor(random_state=0)
model.fit(X_train, y_train)

# Make predictions on the testing set
y_pred = model.predict(X_test)

# Model evaluation
print('Mean squared error: %.2f' % mean_squared_error(y_test, y_pred))
print('Coefficient of determination (R^2): %.2f%%' % (r2_score(y_test, y_pred) * 100))


Mean squared error: 25855.00
Coefficient of determination (R^2): 90.64%


In [3]:
# random forest model
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Load data
data = pd.read_csv('train_prices_combined.csv')

# Select features and target variable
features = ['train_number', 'basefare', 'reservationCharge', 'superfastCharge', 'dynamicFare', 'cateringCharge', 
            'fuelAmount', 'totalConcession', 'tatkalFare', 'serviceTax', 'otherCharge', 'distance', 'total_minutes',
            '1A', '2A', '2S', '3A', '3E', 'CC', 'SL']
target = 'totalFare'

# Split data into features and target variable
X = data[features]
y = data[target]

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create and train the model
model = RandomForestRegressor(n_estimators=100, random_state=0)
model.fit(X_train, y_train)

# Make predictions on the testing set
y_pred = model.predict(X_test)

# Model evaluation
print('Mean squared error: %.2f' % mean_squared_error(y_test, y_pred))
print('Coefficient of determination (R^2): %.2f%%' % (r2_score(y_test, y_pred) * 100))


Mean squared error: 16414.69
Coefficient of determination (R^2): 94.06%


In [4]:
# gradient boosting model
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Load data
data = pd.read_csv('train_prices_combined.csv')

# Select features and target variable
features = ['train_number', 'basefare', 'reservationCharge', 'superfastCharge', 'dynamicFare', 'cateringCharge', 
            'fuelAmount', 'totalConcession', 'tatkalFare', 'serviceTax', 'otherCharge', 'distance', 'total_minutes',
            '1A', '2A', '2S', '3A', '3E', 'CC', 'SL']
target = 'totalFare'

# Split data into features and target variable
X = data[features]
y = data[target]

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create and train the model
model = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=1, random_state=0)
model.fit(X_train, y_train)

# Make predictions on the testing set
y_pred = model.predict(X_test)

# Model evaluation
print('Mean squared error: %.2f' % mean_squared_error(y_test, y_pred))
print('Coefficient of determination (R^2): %.2f%%' % (r2_score(y_test, y_pred) * 100))


Mean squared error: 1304.33
Coefficient of determination (R^2): 99.53%


In [2]:
# neural network model
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score

# Load data
data = pd.read_csv('train_prices_combined.csv')

# Select features and target variable
features = ['train_number', 'basefare', 'reservationCharge', 'superfastCharge', 'dynamicFare', 'cateringCharge', 
            'fuelAmount', 'totalConcession', 'tatkalFare', 'serviceTax', 'otherCharge', 'distance', 'total_minutes',
            '1A', '2A', '2S', '3A', '3E', 'CC', 'SL']
target = 'totalFare'

# Split data into features and target variable
X = data[features]
y = data[target]

# Standardize features
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create and train the model
model = tf.keras.Sequential([
    tf.keras.layers.Dense(64, activation='relu', input_shape=(X.shape[1],)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(1)
])

model.compile(optimizer='adam', loss='mean_squared_error')

model.fit(X_train, y_train, epochs=50, batch_size=32, verbose=0)

# Make predictions on the testing set
y_pred = model.predict(X_test).flatten()

# Model evaluation
print('Mean squared error: %.2f' % mean_squared_error(y_test, y_pred))
print('Coefficient of determination (R^2): %.2f%%' % (r2_score(y_test, y_pred) * 100))


Mean squared error: 587955.49
Coefficient of determination (R^2): -112.90%
