In [11]:
import pandas as pd
print(f'Pandas version: {pd.__version__}')
import sklearn
print(f'sklearn version: {sklearn.__version__}')

Pandas version: 2.0.3
sklearn version: 1.2.2


## Q1. Downloading the data
Download the data for January and February 2023.


In [2]:
url_january_data = 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-01.parquet'
df_january = pd.read_parquet(url_january_data)

In [3]:
df_january

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
0,2,2023-01-01 00:32:10,2023-01-01 00:40:36,1.0,0.97,1.0,N,161,141,2,9.30,1.00,0.5,0.00,0.0,1.0,14.30,2.5,0.00
1,2,2023-01-01 00:55:08,2023-01-01 01:01:27,1.0,1.10,1.0,N,43,237,1,7.90,1.00,0.5,4.00,0.0,1.0,16.90,2.5,0.00
2,2,2023-01-01 00:25:04,2023-01-01 00:37:49,1.0,2.51,1.0,N,48,238,1,14.90,1.00,0.5,15.00,0.0,1.0,34.90,2.5,0.00
3,1,2023-01-01 00:03:48,2023-01-01 00:13:25,0.0,1.90,1.0,N,138,7,1,12.10,7.25,0.5,0.00,0.0,1.0,20.85,0.0,1.25
4,2,2023-01-01 00:10:29,2023-01-01 00:21:19,1.0,1.43,1.0,N,107,79,1,11.40,1.00,0.5,3.28,0.0,1.0,19.68,2.5,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3066761,2,2023-01-31 23:58:34,2023-02-01 00:12:33,,3.05,,,107,48,0,15.80,0.00,0.5,3.96,0.0,1.0,23.76,,
3066762,2,2023-01-31 23:31:09,2023-01-31 23:50:36,,5.80,,,112,75,0,22.43,0.00,0.5,2.64,0.0,1.0,29.07,,
3066763,2,2023-01-31 23:01:05,2023-01-31 23:25:36,,4.67,,,114,239,0,17.61,0.00,0.5,5.32,0.0,1.0,26.93,,
3066764,2,2023-01-31 23:40:00,2023-01-31 23:53:00,,3.15,,,230,79,0,18.15,0.00,0.5,4.43,0.0,1.0,26.58,,


Q1 answer is 19 columns

## Q2. Computing duration
Now let's compute the __duration__ variable. It should contain the duration of a ride in minutes.
What's the standard deviation of the trips duration in January?

In [4]:
df_january.dtypes

VendorID                          int64
tpep_pickup_datetime     datetime64[us]
tpep_dropoff_datetime    datetime64[us]
passenger_count                 float64
trip_distance                   float64
RatecodeID                      float64
store_and_fwd_flag               object
PULocationID                      int64
DOLocationID                      int64
payment_type                      int64
fare_amount                     float64
extra                           float64
mta_tax                         float64
tip_amount                      float64
tolls_amount                    float64
improvement_surcharge           float64
total_amount                    float64
congestion_surcharge            float64
airport_fee                     float64
dtype: object

In [6]:
import numpy as np

if 'tpep_pickup_datetime' in df_january.columns and 'tpep_dropoff_datetime' in df_january.columns:
    # Compute the duration in minutes
    df_january['duration'] = (df_january['tpep_dropoff_datetime'] - df_january['tpep_pickup_datetime']).dt.total_seconds() / 60

    # Compute the standard deviation of the duration
    std_dev_duration = round(np.std(df_january['duration']), 2)

    print("Standard Deviation in January 2023:", std_dev_duration)
else:
    print("The necessary columns are not present in the dataset.")


Standard Deviation in January 2023: 42.59


## Q3. Dropping outliers

In [7]:
# Calculate trip duration in minutes and add it as a new column
df_january['duration'] = (
    (df_january['tpep_dropoff_datetime'] - df_january['tpep_pickup_datetime'])
    .dt.total_seconds() / 60
)

# Define duration range in minutes
min_duration = 1
max_duration = 60

# Filter records with duration within the specified range
df_filtered = df_january[
    (df_january['duration'] >= min_duration) &
    (df_january['duration'] <= max_duration)
]

# Calculate the fraction of records that meet the duration criteria
fraction_left = round(len(df_filtered) / len(df_january), 2)
fraction_left

0.98

In [8]:
print(f"Fraction left in January 2023 data is {fraction_left}")

Fraction left in January 2023 data is 0.98


## Q4. One-hot Encoding

In [9]:
import pandas as pd
from sklearn.feature_extraction import DictVectorizer

def ensure_integer_columns(df, columns):
    """Ensure specified columns are of integer type."""
    for column in columns:
        df[column] = df[column].astype(int)
    return df

def convert_columns_to_string(df, columns):
    """Convert specified columns to string type in a copy of the DataFrame."""
    df_copy = df.copy()
    for column in columns:
        df_copy[column] = df_copy[column].astype(str)
    return df_copy

def dataframe_to_dicts(df, columns):
    """Convert specified columns of a DataFrame to a list of dictionaries."""
    return df[columns].to_dict(orient='records')

def vectorize_dicts(dicts):
    """Fit a DictVectorizer on a list of dictionaries and return the feature matrix."""
    dv = DictVectorizer()
    X = dv.fit_transform(dicts)
    return X, dv

# Assume df_filtered is already defined and filtered

# Ensure the columns 'PULocationID' and 'DOLocationID' are of integer type
columns_to_convert = ['PULocationID', 'DOLocationID']
df_filtered = ensure_integer_columns(df_filtered, columns_to_convert)

# Create a copy of the DataFrame and convert IDs to strings
df_filtered_str = convert_columns_to_string(df_filtered, columns_to_convert)

# Convert the DataFrame to a list of dictionaries
dicts = dataframe_to_dicts(df_filtered_str, columns_to_convert)

# Fit a DictVectorizer and get the feature matrix
X, dv = vectorize_dicts(dicts)

# Print the dimensionality of the feature matrix
print("Dimensionality of the feature matrix (number of columns):", X.shape[1])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[column] = df[column].astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[column] = df[column].astype(int)


Dimensionality of the feature matrix (number of columns): 515


## Q5. Training model

In [12]:
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.feature_extraction import DictVectorizer

def prepare_feature_matrix(dicts):
    """Fit a DictVectorizer on a list of dictionaries and return the feature matrix and vectorizer."""
    dv = DictVectorizer()
    X = dv.fit_transform(dicts)
    return X, dv

def train_linear_regression_model(X, y):
    """Train a linear regression model on the provided features and target variable."""
    lr = LinearRegression()
    lr.fit(X, y)
    return lr

def calculate_rmse(y_true, y_pred):
    """Calculate the Root Mean Squared Error (RMSE) between the true and predicted values."""
    return round(np.sqrt(mean_squared_error(y_true, y_pred)), 2)

# Assume df_filtered and dicts are already defined and filtered

# Prepare the feature matrix using the dictionary vectorizer
X_train, dv = prepare_feature_matrix(dicts)

# Prepare the target variable
y_train = df_filtered['duration'].values

# Train the linear regression model
lr = train_linear_regression_model(X_train, y_train)

# Make predictions on the training data
y_pred = lr.predict(X_train)

# Calculate the RMSE on the training data
rmse = calculate_rmse(y_train, y_pred)
print("RMSE on training data:", rmse)

RMSE on training data: 7.65


In [13]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

def load_data(url):
    """Load data from the specified URL."""
    return pd.read_parquet(url)

def compute_duration(df):
    """Compute the trip duration in minutes and filter valid records."""
    df['duration'] = (df['tpep_dropoff_datetime'] - df['tpep_pickup_datetime']).dt.total_seconds() / 60
    return df[(df['duration'] >= 1) & (df['duration'] <= 60)].copy()

def prepare_features(df):
    """Prepare features for the model by converting IDs to strings and creating dictionaries."""
    df['PULocationID'] = df['PULocationID'].astype('object').astype(str)
    df['DOLocationID'] = df['DOLocationID'].astype('object').astype(str)
    return df[['PULocationID', 'DOLocationID']].to_dict(orient='records')

def fit_vectorizer(dicts):
    """Fit a dictionary vectorizer and transform the data."""
    dv = DictVectorizer()
    X = dv.fit_transform(dicts)
    return X, dv

def transform_features(dv, dicts):
    """Transform features using the fitted dictionary vectorizer."""
    return dv.transform(dicts)

def train_model(X, y):
    """Train a linear regression model."""
    lr = LinearRegression()
    lr.fit(X, y)
    return lr

def calculate_rmse(y_true, y_pred):
    """Calculate the Root Mean Squared Error (RMSE)."""
    return round(np.sqrt(mean_squared_error(y_true, y_pred)), 2)

In [15]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

def load_data(url):
    """Load the parquet file from the specified URL."""
    return pd.read_parquet(url)

def compute_duration(df):
    """Compute the trip duration in minutes."""
    df['duration'] = (df['tpep_dropoff_datetime'] - df['tpep_pickup_datetime']).dt.total_seconds() / 60
    return df

def filter_duration(df, min_duration=1, max_duration=60):
    """Filter records to keep only those with duration between min_duration and max_duration (inclusive)."""
    return df[(df['duration'] >= min_duration) & (df['duration'] <= max_duration)].copy()

def convert_columns_to_string(df, columns):
    """Convert specified columns to string type."""
    for column in columns:
        df[column] = df[column].astype('object').astype(str)
    return df

def dataframe_to_dicts(df, columns):
    """Convert specified columns of a DataFrame to a list of dictionaries."""
    return df[columns].to_dict(orient='records')

def prepare_feature_matrix(dicts):
    """Fit a DictVectorizer on a list of dictionaries and return the feature matrix and vectorizer."""
    dv = DictVectorizer()
    X = dv.fit_transform(dicts)
    return X, dv

def transform_feature_matrix(dv, dicts):
    """Transform a list of dictionaries using an existing DictVectorizer."""
    return dv.transform(dicts)

def train_linear_regression_model(X, y):
    """Train a linear regression model on the provided features and target variable."""
    lr = LinearRegression()
    lr.fit(X, y)
    return lr

def calculate_rmse(y_true, y_pred):
    """Calculate the Root Mean Squared Error (RMSE) between the true and predicted values."""
    return round(np.sqrt(mean_squared_error(y_true, y_pred)), 2)

# URLs for the data
url_january = 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-01.parquet'
url_february = 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-02.parquet'

# Process January data
df_january = load_data(url_january)
df_january = compute_duration(df_january)
df_filtered = filter_duration(df_january)
df_filtered = convert_columns_to_string(df_filtered, ['PULocationID', 'DOLocationID'])
dicts_january = dataframe_to_dicts(df_filtered, ['PULocationID', 'DOLocationID'])
X_train, dv = prepare_feature_matrix(dicts_january)
y_train = df_filtered['duration'].values

# Train the linear regression model
lr = train_linear_regression_model(X_train, y_train)

# Process February data
df_february = load_data(url_february)
df_february = compute_duration(df_february)
df_feb_filtered = filter_duration(df_february)
df_feb_filtered = convert_columns_to_string(df_feb_filtered, ['PULocationID', 'DOLocationID'])
dicts_february = dataframe_to_dicts(df_feb_filtered, ['PULocationID', 'DOLocationID'])
X_val = transform_feature_matrix(dv, dicts_february)
y_val = df_feb_filtered['duration'].values

# Make predictions on the validation data
y_pred_val = lr.predict(X_val)

# Calculate the RMSE on the validation data
rmse_val = calculate_rmse(y_val, y_pred_val)
print("RMSE on validation data:", rmse_val)


RMSE on validation data: 7.81
