# FOML_ASSIGNMENT_4

    Name: ANTALA AVIRAJ (CS24MTECH14011)

###    Question 6)  Kaggle - Taxi Fare Price Prediction:

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from IPython.display import FileLink

In [None]:
# Seed NumPy's global RNG once so the random row sampling is repeatable
SEED = 123
np.random.seed(SEED)

# Sampling rate for the training CSV and the dtype applied to each listed column
CONFIG = dict(
    sample_rate=0.1,
    column_types=dict.fromkeys(
        (
            "fare_amount",
            "pickup_longitude",
            "pickup_latitude",
            "dropoff_longitude",
            "dropoff_latitude",
            "passenger_count",
        ),
        "float32",
    ),
)

# Locations of the competition's training and test CSV files
TRAIN_PATH = "/kaggle/input/new-york-city-taxi-fare-prediction/train.csv"
TEST_PATH = "/kaggle/input/new-york-city-taxi-fare-prediction/test.csv"

# Sampling logic for loading data
def load_sample_data(filepath, dtypes, fraction=1.0):
    """Read a CSV while randomly skipping rows, then drop rows with missing values.

    Parameters
    ----------
    filepath : path of the CSV file, passed straight to ``pd.read_csv``.
    dtypes : column-name -> dtype mapping, passed as ``dtype=`` to ``pd.read_csv``.
    fraction : a row with index > 0 is skipped when a fresh ``np.random.rand()``
        draw exceeds this value, so roughly ``fraction`` of those rows are kept.

    Returns
    -------
    pandas.DataFrame
        The rows that were read, with any row containing a missing value removed.

    Notes
    -----
    Draws come from NumPy's global random state, so the selection depends on
    how that state was seeded before the call.
    """

    def _skip_row(row_index):
        # Row 0 is never skipped and consumes no random draw.
        if not row_index > 0:
            return False
        return np.random.rand() > fraction

    sampled = pd.read_csv(filepath, skiprows=_skip_row, dtype=dtypes)
    return sampled.dropna()

# Training data: row-sampled at the configured rate, with the configured column dtypes
train_df = load_sample_data(
    filepath=TRAIN_PATH,
    dtypes=CONFIG["column_types"],
    fraction=CONFIG["sample_rate"],
)
# Test data: read in full, with dtypes inferred by pandas
test_df = pd.read_csv(TEST_PATH)

In [None]:
# Remove outliers from the dataset
def clean_data(data):
    """Return a copy of ``data`` restricted to rows whose values fall in range.

    A row is kept only when every check below passes (bounds are inclusive):

    * ``fare_amount`` in [1, 500]
    * ``pickup_longitude`` and ``dropoff_longitude`` in [-75, -72]
    * ``pickup_latitude`` and ``dropoff_latitude`` in [40, 42]
    * ``passenger_count`` in [1, 6]

    Parameters
    ----------
    data : pandas.DataFrame containing all six columns listed above.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the surviving rows (original index preserved).
        Returning a copy rather than a bare boolean-indexed slice means callers
        can add or drop columns on the result in place without triggering
        pandas' ``SettingWithCopyWarning``.
    """
    bounds = {
        "fare_amount": (1, 500),
        "pickup_longitude": (-75, -72),
        "pickup_latitude": (40, 42),
        "dropoff_longitude": (-75, -72),
        "dropoff_latitude": (40, 42),
        "passenger_count": (1, 6),
    }
    # AND all per-column range checks into a single boolean row mask.
    keep = np.logical_and.reduce(
        [data[column].between(low, high) for column, (low, high) in bounds.items()]
    )
    return data[keep].copy()

# Replace the sampled training frame with only the rows that pass clean_data's range checks.
train_df = clean_data(train_df)

# Add datetime features
def extract_datetime_features(df, remove_original=True):
    """Expand ``pickup_datetime`` into ``year``/``month``/``day``/``hour`` columns, in place.

    Parameters
    ----------
    df : pandas.DataFrame. It is modified in place; if it has no
        ``pickup_datetime`` column the call does nothing.
    remove_original : when true (default), the ``pickup_datetime`` column is
        dropped after the parts have been extracted.

    Returns
    -------
    None

    Notes
    -----
    Values that cannot be parsed become ``NaT`` (``errors="coerce"``), so the
    derived columns hold missing values for those rows.
    """
    if "pickup_datetime" not in df:
        return

    # Parse once and store the result back before reading the parts off it.
    df["pickup_datetime"] = pd.to_datetime(df["pickup_datetime"], errors="coerce")
    stamps = df["pickup_datetime"]
    for part in ("year", "month", "day", "hour"):
        df[part] = getattr(stamps.dt, part)

    if remove_original:
        df.drop(columns=["pickup_datetime"], inplace=True)

# Add the year/month/day/hour columns to both frames (each is modified in place)
for frame in (train_df, test_df):
    extract_datetime_features(frame)

# Keep only columns whose dtype is one of the listed float/int widths
numeric_dtypes = ["float32", "float64", "int32", "int64"]
train_df, test_df = (
    frame.select_dtypes(include=numeric_dtypes) for frame in (train_df, test_df)
)

# Split the training frame into the prediction target and the remaining feature columns
target = train_df["fare_amount"]
features = train_df.drop(columns=["fare_amount"])

In [None]:
# Train/validation split: hold out 20% of the rows (test_size=0.2) for validation;
# random_state=SEED makes the split repeatable.
X_train, X_valid, y_train, y_valid = train_test_split(
    features, target, test_size=0.2, random_state=SEED
)

In [None]:
# Function to evaluate a model and compute RMSE
def evaluate_model(model, X_tr, y_tr, X_val, y_val):
    """Fit ``model`` on the training split and report RMSE on both splits.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``; it is fitted
        in place by this call.
    X_tr, y_tr : training features and targets.
    X_val, y_val : validation features and targets.

    Returns
    -------
    tuple
        ``(train_rmse, valid_rmse)`` - root mean squared error of the fitted
        model's predictions on the training and validation data.
    """
    model.fit(X_tr, y_tr)
    train_preds = model.predict(X_tr)
    valid_preds = model.predict(X_val)
    # Take sqrt(MSE) explicitly instead of passing ``squared=False``: that keyword
    # was deprecated in scikit-learn 1.4 and removed in 1.6, where it raises
    # TypeError. This form works on every scikit-learn version.
    train_rmse = np.sqrt(mean_squared_error(y_tr, train_preds))
    valid_rmse = np.sqrt(mean_squared_error(y_val, valid_preds))
    return train_rmse, valid_rmse

In [None]:
# Generate and save predictions for submission
def save_submission(model, test_data, template_path, output_path="submission.csv"):
    """Write ``model``'s predictions into a copy of the submission template.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``.
    test_data : features passed to ``model.predict``.
    template_path : CSV file to read; its ``fare_amount`` column is set to the
        predictions (the number of predictions must match its row count).
    output_path : destination CSV, written without the index column.

    Returns
    -------
    IPython.display.FileLink
        A link object pointing at ``output_path``.
    """
    # Predict before touching the template, matching the order callers rely on.
    fares = model.predict(test_data)
    (
        pd.read_csv(template_path)
        .assign(fare_amount=fares)
        .to_csv(output_path, index=False)
    )
    return FileLink(output_path)

In [None]:
# Candidate regressors keyed by the name used in printed results;
# every estimator that accepts a seed is given SEED.
models = dict(
    LinearRegression=LinearRegression(),
    RandomForest=RandomForestRegressor(random_state=SEED, n_jobs=-1),
    DecisionTree=DecisionTreeRegressor(random_state=SEED),
    GradientBoosting=GradientBoostingRegressor(random_state=SEED),
    XGBoost=XGBRegressor(
        objective="reg:squarederror",
        tree_method="hist",
        random_state=SEED,
        n_jobs=-1,
    ),
)

In [None]:
# Template file path
SUBMISSION_TEMPLATE = "/kaggle/input/new-york-city-taxi-fare-prediction/sample_submission.csv"

# Fit each model, report its RMSE on both splits, and write one submission file per model
for name, model in models.items():
    train_rmse, val_rmse = evaluate_model(model, X_train, y_train, X_valid, y_valid)
    print(f"{name} - Training RMSE: {train_rmse:.2f}, Validation RMSE: {val_rmse:.2f}")

    # Give every model its own output file. Writing them all to "submission.csv"
    # overwrote the file on each iteration, so only the last model's predictions
    # remained on disk and every displayed link pointed at that same file.
    link = save_submission(
        model, test_df, SUBMISSION_TEMPLATE, output_path=f"submission_{name}.csv"
    )
    display(link)



## Performance Scores Table:

| Model              | Training RMSE | Validation RMSE | Private Test RMSE (Score) |
|---------------------|---------------|------------------|---------------------------|
| LinearRegression    | 8.33          | 8.39             | 7.91451                   |
| XGBoost             | 3.92          | 4.06             | **3.34271**               |
| DecisionTree        | 0.05          | 5.48             | 7.11034                   |
| GradientBoosting    | 4.73          | 4.79             | 4.18123                   |
| RandomForest        | 1.41          | 3.83             | **3.30934**               |

## Top-2 Scoring Models
1. **RandomForest Regressor**: Private Test RMSE: **3.30934**  
   - Ensemble of decision trees that reduces overfitting by averaging predictions.

2. **XGBoost Regressor**: Private Test RMSE: **3.34271**  
   - Gradient boosting with regularization and optimized tree pruning for balanced performance.

## Analysis
- **RandomForest** excelled due to its ensemble approach, reducing variance and generalizing well.
- **XGBoost** performed well by iteratively optimizing weak learners and controlling complexity.
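- Among the lower-performing models, **DecisionTree** overfit badly (training RMSE 0.05 vs. validation RMSE 5.48), while **LinearRegression** underfit (RMSE of about 8.3–8.4 on both training and validation), suggesting that a linear fit on these features is too simple for this problem. **GradientBoosting** with default settings generalized consistently but trailed XGBoost on both training (4.73 vs. 3.92) and validation (4.79 vs. 4.06) RMSE.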
