In [1]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
from pymongo.mongo_client import MongoClient
import json

In [2]:
uri = "mongodb+srv://akshar1895:Aksharsdata@cluster0.bdyjsdd.mongodb.net/?retryWrites=true&w=majority"
# Create a new client and connect to the server
client = MongoClient(uri)
# Specify the database and collection
database_name = 'automatidata'
collection_name = 'nyctaxi'
collection = client[database_name][collection_name]

# Retrieve the data from the MongoDB collection
cursor = collection.find()

# Convert the MongoDB cursor to a list of dictionaries
data_list = list(cursor)

# Convert the list of dictionaries to a DataFrame
df = pd.DataFrame(data_list)

# drop "_id"
if '_id' in df.columns:
    df =  df.drop('_id', axis=1)
# Close the MongoDB connection
client.close()


In [3]:
df.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,...,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,mean_duration,mean_distance,predicted_fare,tip_percent,generous
0,2,08/17/2017 4:06:26 AM,08/17/2017 4:06:29 AM,4,0.0,5,N,100,100,1,...,0.5,0.0,0.0,0.3,99.3,3.130556,0.253333,4.374211,0.0,0
1,2,01/29/2017 1:32:15 AM,01/29/2017 1:58:00 AM,1,5.42,1,N,249,262,1,...,0.5,4.66,0.0,0.3,27.96,22.016667,5.245,19.522507,0.2,1
2,2,01/28/2017 1:40:19 PM,01/28/2017 1:47:11 PM,1,1.99,1,N,239,166,1,...,0.5,1.5,0.0,0.3,10.3,12.984848,1.995909,10.52373,0.17,0
3,2,03/17/2017 8:59:59 AM,03/17/2017 9:19:42 AM,1,0.98,1,N,48,246,1,...,0.5,2.66,0.0,0.3,15.96,8.928455,1.305122,8.257137,0.2,1
4,2,03/22/2017 9:15:51 AM,03/22/2017 9:50:17 AM,6,5.02,1,N,239,211,1,...,0.5,3.0,0.0,0.3,26.8,34.433333,5.02,22.891725,0.126,0


In [4]:
df.isna().sum()

VendorID                 0
tpep_pickup_datetime     0
tpep_dropoff_datetime    0
passenger_count          0
trip_distance            0
RatecodeID               0
store_and_fwd_flag       0
PULocationID             0
DOLocationID             0
payment_type             0
fare_amount              0
extra                    0
mta_tax                  0
tip_amount               0
tolls_amount             0
improvement_surcharge    0
total_amount             0
mean_duration            0
mean_distance            0
predicted_fare           0
tip_percent              3
generous                 0
dtype: int64

In [5]:
df = df.dropna()

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, FunctionTransformer, OneHotEncoder

In [7]:
def generate_features( input_data: pd.DataFrame) -> pd.DataFrame:

    # Convert pickup and dropoff cols to datetime
    input_data['tpep_pickup_datetime'] = pd.to_datetime(input_data['tpep_pickup_datetime'], format='%m/%d/%Y %I:%M:%S %p')
    input_data['tpep_dropoff_datetime'] = pd.to_datetime(input_data['tpep_dropoff_datetime'], format='%m/%d/%Y %I:%M:%S %p')
    #create month
    input_data['month'] = input_data['tpep_pickup_datetime'].dt.strftime('%b').str.lower()
    # create day col
    input_data['day'] = input_data['tpep_pickup_datetime'].dt.day_name().str.lower()
    # create time of the day
    input_data['am_rush'] = input_data['tpep_pickup_datetime'].dt.hour
    input_data['day_time'] = input_data['tpep_pickup_datetime'].dt.hour
    input_data['pm_rush'] = input_data['tpep_pickup_datetime'].dt.hour
    input_data['night time'] = input_data['tpep_pickup_datetime'].dt.hour

    input_data['am_rush'] = input_data['am_rush'].apply(lambda x: 1 if 6 <= x < 10 else 0)
    input_data['day_time'] = input_data['am_rush'].apply(lambda x: 1 if 10 <= x < 16 else 0)
    input_data['pm_rush'] = input_data['am_rush'].apply(lambda x: 1 if 16<= x < 20 else 0)
    input_data['night_time'] = input_data['am_rush'].apply(lambda x : 1 if (20 <= x < 24) or (0 <= x < 6) else 0)

    # drop redundant columns
    drop_cols = ['tpep_pickup_datetime', 'tpep_dropoff_datetime',
            'payment_type', 'trip_distance', 'store_and_fwd_flag',
            'fare_amount', 'extra', 'mta_tax', 'tip_amount', 'tolls_amount',
            'improvement_surcharge', 'total_amount', 'tip_percent']
    
    # convert catergorical features to string
    cols_to_str = ['RatecodeID', 'VendorID', 'DOLocationID', 'PULocationID']

    # Convert each column to string
    for col in cols_to_str:
        input_data[col] = input_data[col].astype('str')

    input_data = input_data.drop(columns=drop_cols, axis=1)

    return input_data

In [8]:
df_n = generate_features(df)

In [9]:
df_n.head()

Unnamed: 0,VendorID,passenger_count,RatecodeID,PULocationID,DOLocationID,mean_duration,mean_distance,predicted_fare,generous,month,day,am_rush,day_time,pm_rush,night time,night_time
0,2,4,5,100,100,3.130556,0.253333,4.374211,0,aug,thursday,0,0,0,4,1
1,2,1,1,249,262,22.016667,5.245,19.522507,1,jan,sunday,0,0,0,1,1
2,2,1,1,239,166,12.984848,1.995909,10.52373,0,jan,saturday,0,0,0,13,1
3,2,1,1,48,246,8.928455,1.305122,8.257137,1,mar,friday,1,0,0,8,1
4,2,6,1,239,211,34.433333,5.02,22.891725,0,mar,wednesday,1,0,0,9,1


In [10]:
def get_data_transformer_object():

        try:
            
            numerical_columns = ['passenger_count', 'mean_duration', 'mean_distance', 'predicted_fare',
                                'am_rush', 'day_time', 'pm_rush', 'night_time']
            categorical_columns = ['VendorID','RatecodeID','PULocationID', 
                                   'DOLocationID','day', 'month']

            num_pipeline = Pipeline(
                steps = [
                    ('imputer', SimpleImputer(strategy='median')),
                    ('scaler', StandardScaler())
                ]
            )

            cat_pipeline = Pipeline(
                steps=[
                    ('imputer', SimpleImputer(strategy='most_frequent')),
                    ('one_hot_encoder', OneHotEncoder(handle_unknown='ignore', categories='auto')),
                    ('scaler', StandardScaler(with_mean=False) )
                ]
            )

            preprocessor = ColumnTransformer(
                [

                ('num_pipeline', num_pipeline, numerical_columns),
                ('cat_pipeline', cat_pipeline, categorical_columns)

                ]
            )
            
            return preprocessor
        except Exception as e:
            raise e

In [11]:
preproc_obj = get_data_transformer_object()

In [86]:
X = df_n.drop('generous', axis=1)
y = df_n[['generous']]

In [87]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

In [88]:
X_train.shape, y_train.shape

((10873, 15), (10873, 1))

In [67]:
X_train_pr = preproc_obj.fit_transform(X_train)
X_test_pr = preproc_obj.transform(X_test)

In [119]:
type(X_train_pr)

scipy.sparse._csr.csr_matrix

In [110]:
from scipy.sparse import hstack

# Concatenate sparse matrix with dense array using hstack
train_arr = hstack((X_train_pr, y_train.to_numpy()))

In [111]:
test_arr = hstack((X_test_pr, y_test.to_numpy()))

In [118]:
# Extract X_train_sparse and y_train_sparse
X_train_sparse = train_arr[:, :-1]
y_train_sparse = train_arr[:, -1]

# If needed, convert the sparse matrices to dense arrays
X_train_dense = X_train_sparse.toarray()
y_train_dense = y_train_sparse.toarray().flatten()

TypeError: 'coo_matrix' object is not subscriptable

In [112]:
x_train = train_arr[:,:-1]

TypeError: 'coo_matrix' object is not subscriptable

In [15]:
from sklearn.ensemble import RandomForestClassifier

In [16]:
from sklearn.metrics import precision_score, f1_score, accuracy_score, confusion_matrix, ConfusionMatrixDisplay

In [57]:
rf = RandomForestClassifier(n_estimators=300, max_depth=5,
                            random_state=0, max_samples=0.4,
                            criterion='entropy',
                            max_features='sqrt', n_jobs=-1)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
score = f1_score(y_test, y_pred)
score

  return fit_method(estimator, *args, **kwargs)


0.7489051094890512

In [59]:
cm = confusion_matrix(y_test, y_pred)
cm

array([[1054,  647],
       [ 385, 1539]], dtype=int64)