In [49]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import RFE
from sklearn.compose import ColumnTransformer
from sklearn.metrics import precision_score, f1_score
from sklearn.metrics import accuracy_score, recall_score
%matplotlib inline

In [50]:
# Load the prepared trip data and preview the first two rows.
DATA_PATH = 'output.csv'
df = pd.read_csv(DATA_PATH)
df.head(2)

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,...,member_casual,file,month,day,year,day_of_week,start_hour,ride_length,ride_km,rider_speed
0,EC2DE40644C6B0F4,classic_bike,2022-05-23 23:06:58,2022-05-23 23:40:19,Wabash Ave & Grand Ave,TA1307000117,Halsted St & Roscoe St,TA1309000025,41.891466,-87.626761,...,member,202205-divvy-tripdata.csv,5,23,2022,Monday,23,33.35,6.3,0.0
1,1C31AD03897EE385,classic_bike,2022-05-11 08:53:28,2022-05-11 09:31:22,DuSable Lake Shore Dr & Monroe St,13300,Field Blvd & South Water St,15534,41.880958,-87.616743,...,member,202205-divvy-tripdata.csv,5,11,2022,Wednesday,8,37.9,0.61,0.0


Step 1 

Initial preprocessing: handling missing data and encoding categorical features.

In [51]:
# Drop rows that are missing either the ride distance or the rider speed;
# both columns are used as numeric model inputs further down.
required_columns = ['ride_km', 'rider_speed']
df.dropna(subset=required_columns, inplace=True)

In [52]:
# Binary target: 1 where 'member_casual' equals 'member', 0 for every other value
is_member_mask = df['member_casual'].eq('member')
df['is_member'] = is_member_mask.astype(int)

In [53]:
# Raw input columns, grouped by how they are treated before modelling
numeric_features = [
    'start_hour',
    'ride_length',
    'ride_km',
    'rider_speed',
]
categorical_features = [
    'rideable_type',
    'day_of_week',
]

In [54]:
# One-hot encode each categorical column in turn: 'rideable_type' first, then 'day_of_week'.
# drop_first=True leaves out one level per column to avoid multicollinearity.
dummy_prefixes = (('rideable_type', 'rideable'), ('day_of_week', 'day'))
for source_column, dummy_prefix in dummy_prefixes:
    df = pd.get_dummies(df, columns=[source_column], prefix=dummy_prefix, drop_first=True)

In [55]:
# Model inputs: the numeric columns followed by the one-hot columns,
# which are picked up by their 'rideable_' / 'day_' name prefixes.
rideable_dummies = df.columns[df.columns.str.startswith('rideable_')].tolist()
day_dummies = df.columns[df.columns.str.startswith('day_')].tolist()
features = numeric_features + rideable_dummies + day_dummies

Use Recursive Feature Elimination (RFE) to select only the most important features for training and prediction with the logistic regression model. The result should be a model that is just as effective, but potentially simpler and more efficient, than a model that uses all available features.

In [56]:
# Design matrix and binary target
X = df[features]
y = df['is_member']

# Hold out 20% of the rows for testing (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Standardise only the numeric columns; the one-hot columns pass through unchanged
scaler = StandardScaler()
preprocessor = ColumnTransformer(
    transformers=[('num', scaler, numeric_features)],
    remainder='passthrough',
)

# Learn the scaling on the training split only, then reuse it for the test split
X_train_scaled = preprocessor.fit_transform(X_train)
X_test_scaled = preprocessor.transform(X_test)

# Logistic regression with a raised iteration cap, wrapped in RFE to keep 5 features
log_reg = LogisticRegression(max_iter=1000)
rfe = RFE(estimator=log_reg, n_features_to_select=5)
rfe.fit(X_train_scaled, y_train)

# Pair each RFE rank with its column name and order by rank.
# NOTE(review): this relies on the transformed column order matching `features`
# (numeric columns first, passthrough columns after) — confirm if `features` is ever reordered.
features_sorted_by_rank = sorted(zip(rfe.ranking_, features))

# Rank 1 marks the features RFE kept
top_features = [name for position, name in features_sorted_by_rank if position == 1]
top_features

['day_Saturday',
 'day_Sunday',
 'ride_length',
 'rideable_docked_bike',
 'rideable_electric_bike']

In [57]:
# Keep only the columns RFE selected
X_train_selected = X_train_scaled[:, rfe.support_]
X_test_selected = X_test_scaled[:, rfe.support_]

# Refit on the selected columns and predict the test split
log_reg.fit(X_train_selected, y_train)
y_pred = log_reg.predict(X_test_selected)

# Per-class precision / recall / F1 on the test split
print(classification_report(y_test, y_pred))

# Confusion matrix with readable labels (rows = actual, columns = predicted)
conf_matrix = confusion_matrix(y_test, y_pred)
confusion_df = pd.DataFrame(
    conf_matrix,
    index=['Actual Casual', 'Actual Member'],
    columns=['Predicted Casual', 'Predicted Member'],
)
print(confusion_df)


              precision    recall  f1-score   support

           0       0.71      0.36      0.48    510082
           1       0.64      0.88      0.75    666997

    accuracy                           0.66   1177079
   macro avg       0.68      0.62      0.61   1177079
weighted avg       0.67      0.66      0.63   1177079

               Predicted Casual  Predicted Member
Actual Casual            184730            325352
Actual Member             76777            590220


In [58]:
# Overfitting check: score the RFE model on the training split and the test split
# and compare the two sets of numbers.

# Training-split predictions, using the same RFE-selected columns as the fit
y_train_pred = log_reg.predict(X_train_scaled[:, rfe.support_])

# Accuracy, precision, recall and F1 on the training split
train_accuracy, train_precision, train_recall, train_f1 = (
    metric(y_train, y_train_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

# The same four metrics on the test split
test_accuracy, test_precision, test_recall, test_f1 = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

# Report both sets, one metric per line
print(
    f"Training Accuracy: {train_accuracy}",
    f"Training Precision: {train_precision}",
    f"Training Recall: {train_recall}",
    f"Training F1: {train_f1}",
    sep="\n",
)
print(
    f"\nTest Accuracy: {test_accuracy}",
    f"Test Precision: {test_precision}",
    f"Test Recall: {test_recall}",
    f"Test F1: {test_f1}",
    sep="\n",
)


Training Accuracy: 0.6578013521618788
Training Precision: 0.6441237937977615
Training Recall: 0.8850641666469876
Training F1: 0.7456125780761531

Test Accuracy: 0.6583670254927664
Test Precision: 0.6446461883937036
Test Recall: 0.8848915362437912
Test F1: 0.7459011265859498


Next, I train the same model on all of the features and compare its performance with the RFE model, to evaluate whether feature selection improved performance.

In [59]:
# Baseline for comparison: the same pipeline as the RFE run, trained on every feature.

# Select independent variables
X = df[features]

# Select dependent variable
y = df['is_member']

# Split the data into train and test datasets.
# Same test_size and random_state as the RFE run, so both models see the same rows.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize logistic regression model with the same iteration cap as the RFE model
# (max_iter=1000), so the two models differ only in the features they use.
log_reg = LogisticRegression(max_iter=1000)

# Initialize scaler
scaler = StandardScaler()

# Create preprocessor to only scale numeric features
preprocessor = ColumnTransformer(
    transformers=[
        ('num', scaler, numeric_features)],
    remainder='passthrough')

# Fit and transform X_train
X_train_scaled = preprocessor.fit_transform(X_train)

# Transform X_test (reuses the scaling learned on the training split)
X_test_scaled = preprocessor.transform(X_test)

# Fit the model using all features
log_reg.fit(X_train_scaled, y_train)

# Predict on the test split
y_pred = log_reg.predict(X_test_scaled)
# Make predictions using the training data (used for the train-vs-test comparison)
y_train_pred = log_reg.predict(X_train_scaled)

# Print classification report
print(classification_report(y_test, y_pred))

# Print confusion matrix (rows = actual, columns = predicted)
conf_matrix = confusion_matrix(y_test, y_pred)
confusion_df = pd.DataFrame(conf_matrix, index=['Actual Casual', 'Actual Member'], columns=['Predicted Casual', 'Predicted Member'])
print(confusion_df)

              precision    recall  f1-score   support

           0       0.71      0.38      0.50    510082
           1       0.65      0.88      0.75    666997

    accuracy                           0.66   1177079
   macro avg       0.68      0.63      0.62   1177079
weighted avg       0.68      0.66      0.64   1177079

               Predicted Casual  Predicted Member
Actual Casual            195719            314363
Actual Member             80139            586858


In [60]:
# Overfitting check: compare training-split and test-split scores for the current model,
# using the predictions already stored in y_train_pred and y_pred.

# Accuracy, precision, recall and F1 on the training split
train_accuracy, train_precision, train_recall, train_f1 = (
    metric(y_train, y_train_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

# The same four metrics on the test split
test_accuracy, test_precision, test_recall, test_f1 = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

# Report both sets, one metric per line
print(
    f"Training Accuracy: {train_accuracy}",
    f"Training Precision: {train_precision}",
    f"Training Recall: {train_recall}",
    f"Training F1: {train_f1}",
    sep="\n",
)
print(
    f"\nTest Accuracy: {test_accuracy}",
    f"Test Precision: {test_precision}",
    f"Test Recall: {test_recall}",
    f"Test F1: {test_f1}",
    sep="\n",
)


Training Accuracy: 0.664235292818318
Training Precision: 0.6506272404910608
Training Recall: 0.8799247625119995
Training F1: 0.7481000566457584

Test Accuracy: 0.6648466245681046
Test Precision: 0.6511810088757364
Test Recall: 0.8798510338127458
Test F1: 0.7484393113712506


These results suggest that the model has decent performance, but there's room for improvement. The model appears to be better at identifying "member" instances (higher recall for class 1), but struggles more with identifying "casual" instances (lower recall for class 0).

A possible interpretation is that the model tends to over-predict the "member" class. 

The classes in the dataset are imbalanced: member = 3334816 and casual = 2550575. The recall for 'casual' users was relatively low compared to 'member' users, which might be a sign that the model is being influenced by this class imbalance.

Another thing I could do is collect more informative features, or try different kinds of models, for example Random Forest.

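Feature selection didn't meaningfully change the performance: test accuracy is about 0.658 with the 5 RFE-selected features versus about 0.665 with all features.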

In [61]:
# Class balance of the binary target (1 = member, 0 = casual)
class_counts = df['is_member'].value_counts()
class_counts

is_member
1    3334816
0    2550575
Name: count, dtype: int64