In [1]:
import pandas as pd
import pickle
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge

from sklearn.metrics import mean_squared_error

In [2]:
!pip install pyarrow



In [3]:
def read_dataframe(filename):
    df = pd.read_parquet(filename)
    
    df['tpep_pickup_datetime'] = pd.to_datetime(df['tpep_pickup_datetime'])
    df['tpep_dropoff_datetime'] = pd.to_datetime(df['tpep_dropoff_datetime'])
    df['duration'] = df['tpep_dropoff_datetime'] - df['tpep_pickup_datetime']
    
    # Convert to minutes
    df['duration'] = df['duration'].apply(lambda x: x.total_seconds() / 60)
    
#     df = df.query("duration >= 1 & duration <= 60")
    
    return df

In [4]:
jan_2023_data = pd.read_parquet('https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-01.parquet')

In [5]:
feb_2023_data = pd.read_parquet('https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-02.parquet')

In [6]:
def data_processing(df):
    
    df['tpep_pickup_datetime'] = pd.to_datetime(df['tpep_pickup_datetime'])
    df['tpep_dropoff_datetime'] = pd.to_datetime(df['tpep_dropoff_datetime'])
    df['duration'] = df['tpep_dropoff_datetime'] - df['tpep_pickup_datetime']
    
    # Convert to minutes
    df['duration'] = df['duration'].apply(lambda x: x.total_seconds() / 60)
    
    df_new = df.query("duration >= 1 & duration <= 60")
    
    return df_new

In [7]:
df_train = data_processing(jan_2023_data)
df_val = data_processing(feb_2023_data)

### 1. Read the data for January. How many columns are there?

In [8]:
jan_2023_data.drop("duration", axis=1).shape[1]

19

### 2. What's the standard deviation of the trips duration in January?

In [9]:
jan_2023_data['duration'].std()

42.594351241920904

### 3. What fraction of the records left after you dropped the outliers?

In [10]:
round((df_train.shape[0] / jan_2023_data.shape[0]) * 100, 2)

98.12

### 4. What's the dimensionality of this matrix (number of columns)?

In [11]:
categorical = ['PULocationID', 'DOLocationID']

In [12]:
df_train[categorical] = df_train[categorical].astype(str)
df_val[categorical] = df_val[categorical].astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train[categorical] = df_train[categorical].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_val[categorical] = df_val[categorical].astype(str)


In [13]:
dv = DictVectorizer()

In [14]:
train_dicts = df_train[categorical].to_dict(orient='records')

X_train = dv.fit_transform(train_dicts)

val_dicts = df_val[categorical].to_dict(orient='records')
X_val = dv.transform(val_dicts)

In [15]:
X_train.shape

(3009173, 515)

In [16]:
X_val.shape

(2855951, 515)

### 5. What's the RMSE on train?

In [17]:
target = 'duration'
y_train = df_train[target].values
y_val = df_val[target].values

In [18]:
lr = LinearRegression()
lr.fit(X_train, y_train)

y_pred_train = lr.predict(X_train)

mean_squared_error(y_train, y_pred_train, squared=False)

7.6492610279057605

### 6. What's the RMSE on validation?

In [19]:
y_pred = lr.predict(X_val)

mean_squared_error(y_val, y_pred, squared=False)

7.81183265470218