In [1]:
import pandas as pd
import polars as pl
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore", category=UserWarning)
pd.set_option('display.width', 150)
pd.set_option('future.no_silent_downcasting', True)
pd.set_option('display.max_columns', 1000)
pd.set_option('display.max_rows', 1000)
sns.set()

In [2]:
dates = ['session_start', 'session_end', 'session_date', 'order_dt']
df = pd.read_csv('ecom_go_2.csv', parse_dates=dates)
df['week'] = pd.to_datetime(df['session_date']).dt.isocalendar().week
df_pl = pl.from_pandas(df)

print(df.shape)
print(df.columns.tolist())

(1009, 19)
['user_id', 'region', 'device', 'channel', 'session_start', 'session_end', 'sessiondurationsec', 'session_date', 'month', 'day', 'hour_of_day', 'order_dt', 'revenue', 'payment_type', 'promo_code', 'final_price', 'time_of_day', 'payer', 'week']


In [3]:
df_unchanged = df.copy()
print(df_unchanged.shape, df_unchanged['payer'].sum())

df_replaced = df.copy()
df_replaced['revenue'] = np.where(df_replaced['revenue'] == 100000, 9999, df_replaced['revenue'])
df_replaced['revenue'] = np.where(df_replaced['revenue'] == 1, 4999, df_replaced['revenue'])
print(df_replaced.shape, df_replaced['payer'].sum())

df_removed = df.copy()
to_remove = df[df['revenue'].isin([1, 100000])].index
df_removed.drop(to_remove, inplace=True)
print(df_removed.shape, df_removed['payer'].sum())


dfs = [(df_unchanged, "ORIGINAL DATAFRAME:"), 
       (df_replaced, "REPLACED DATAFRAME:"), 
       (df_removed, "REMOVED DATAFRAME:")]

(1009, 19) 282
(1009, 19) 282
(1003, 19) 276


#  Regression

In [8]:
print(df.head())

        user_id         region  device          channel       session_start         session_end  sessiondurationsec session_date  month  day  \
0  529697267522  United States  iPhone  социальные сети 2019-05-01 00:06:40 2019-05-01 00:07:06                  26   2019-05-01      5    3   
1  601292388085  United States      PC          organic 2019-05-01 06:56:16 2019-05-01 07:09:18                 782   2019-05-01      5    3   
2  852898876338  United States     Mac  социальные сети 2019-05-01 04:30:45 2019-05-01 04:34:56                 251   2019-05-01      5    3   
3  998513020664  United States  iPhone  социальные сети 2019-05-01 18:53:42 2019-05-01 18:57:35                 233   2019-05-01      5    3   
4  240702200943  United States     Mac  социальные сети 2019-05-02 14:04:32 2019-05-02 14:09:51                 319   2019-05-02      5    4   

   hour_of_day            order_dt  revenue     payment_type  promo_code  final_price time_of_day  payer  week  
0            0 2019-05

In [12]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

def preprocess_data(df):
    # Create a copy of the dataframe
    data = df.copy()
    
    # Select relevant features
    features = [
        'region', 'device', 'channel', 
        'sessiondurationsec', 'hour_of_day', 
        'week', 'time_of_day'
    ]
    
    # Prepare the feature matrix
    X = data[features].copy()
    y = data['payer']
    
    # Label Encoding for categorical variables
    categorical_cols = ['region', 'device', 'channel', 'time_of_day']
    label_encoders = {}
    
    for col in categorical_cols:
        le = LabelEncoder()
        X[col] = le.fit_transform(X[col].astype(str))
        label_encoders[col] = le
    
    # Handle missing values
    X = X.fillna(X.median())
    
    return X, y, label_encoders

def train_balanced_logistic_regression(X, y):
    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )
    
    # Scale the features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    
    # Calculate class weights
    class_weights = {0: 1, 1: sum(y_train == 0) / sum(y_train == 1)}
    
    # Train Balanced Logistic Regression
    model = LogisticRegression(
        class_weight=class_weights,  # Manual class weights
        max_iter=1000, 
        random_state=42
    )
    model.fit(X_train_scaled, y_train)
    
    # Predictions
    y_pred = model.predict(X_test_scaled)
    y_pred_proba = model.predict_proba(X_test_scaled)[:, 1]
    
    # Evaluation
    print("Classification Report:")
    print(classification_report(y_test, y_pred))
    
    print("\nConfusion Matrix:")
    print(confusion_matrix(y_test, y_pred))
    
    print("\nAccuracy Score:")
    print(accuracy_score(y_test, y_pred))
    
    # Feature Importance
    feature_importance = pd.DataFrame({
        'feature': X.columns,
        'importance': np.abs(model.coef_[0])
    }).sort_values('importance', ascending=False)
    
    print("\nFeature Importance:")
    print(feature_importance)
    
    return model, scaler, feature_importance

def main(df):
    # Preprocess the data
    X, y, label_encoders = preprocess_data(df)
    
    # Train the model
    model, scaler, feature_importance = train_balanced_logistic_regression(X, y)
    
    return {
        'model': model,
        'scaler': scaler,
        'feature_importance': feature_importance,
        'label_encoders': label_encoders
    }

# Usage: results = main(df)
print("Balanced Logistic Regression Model is ready!")

Balanced Logistic Regression Model is ready!


In [13]:
results = main(df)

Classification Report:
              precision    recall  f1-score   support

           0       0.71      0.55      0.62       146
           1       0.26      0.41      0.32        56

    accuracy                           0.51       202
   macro avg       0.48      0.48      0.47       202
weighted avg       0.58      0.51      0.53       202


Confusion Matrix:
[[80 66]
 [33 23]]

Accuracy Score:
0.5099009900990099

Feature Importance:
              feature  importance
4         hour_of_day    0.204821
6         time_of_day    0.108922
0              region    0.100870
1              device    0.097819
5                week    0.095031
3  sessiondurationsec    0.061651
2             channel    0.044703


In [18]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Load the data (assuming you'll replace this with your actual data loading)
# df = pd.read_csv('your_data.csv')

def preprocess_data(df):
    """
    Preprocess the dataframe for logistic regression
    
    Args:
        df (pd.DataFrame): Input dataframe
    
    Returns:
        pd.DataFrame: Preprocessed dataframe
    """
    # Create a copy to avoid modifying the original dataframe
    data = df.copy()
    
    # Feature selection based on potential relevance
    selected_features = [
        'region', 
        'device', 
        'channel', 
        'sessiondurationsec', 
        'month', 
        'day', 
        'hour_of_day', 
        'time_of_day'
    ]
    
    # Preprocessing
    # 1. Categorical Encoding
    categorical_features = ['region', 'device', 'channel', 'time_of_day']
    
    # Label Encoding for categorical features
    le = LabelEncoder()
    for feature in categorical_features:
        data[feature] = le.fit_transform(data[feature].astype(str))
    
    # 2. Handle missing values
    # Replace NaN with median for numeric features
    numeric_features = ['sessiondurationsec', 'month', 'day', 'hour_of_day']
    for feature in numeric_features:
        data[feature].fillna(data[feature].median(), inplace=True)
    
    # 3. Prepare features and target
    X = data[selected_features]
    y = data['payer']
    
    return X, y

def train_logistic_regression(X, y):
    """
    Train logistic regression model
    
    Args:
        X (pd.DataFrame): Features
        y (pd.Series): Target variable
    
    Returns:
        tuple: Trained model, scaler, classification report, feature importances
    """
    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Scale the features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    
    # Train logistic regression
    lr = LogisticRegression(random_state=42)
    lr.fit(X_train_scaled, y_train)
    
    # Predictions
    y_pred = lr.predict(X_test_scaled)
    
    # Classification report
    report = classification_report(y_test, y_pred)
    
    # Feature importances (absolute values of coefficients)
    feature_importance = pd.DataFrame({
        'feature': X.columns,
        'importance': np.abs(lr.coef_[0])
    }).sort_values('importance', ascending=False)
    
    return lr, scaler, report, feature_importance

def visualize_feature_importance(feature_importance):
    """
    Create a bar plot of feature importances
    
    Args:
        feature_importance (pd.DataFrame): DataFrame with feature importances
    """
    plt.figure(figsize=(10, 6))
    sns.barplot(x='importance', y='feature', data=feature_importance)
    plt.title('Feature Importances in Logistic Regression')
    plt.xlabel('Absolute Coefficient Value')
    plt.ylabel('Features')
    plt.tight_layout()
    plt.show()

# Main execution
def main(df):
    # Preprocess the data
    X, y = preprocess_data(df)
    
    # Train the model
    model, scaler, report, feature_importance = train_logistic_regression(X, y)
    
    # Print results
    print("Classification Report:")
    print(report)
    
    print("\nFeature Importances:")
    print(feature_importance)
    
    # Visualize feature importances
    # visualize_feature_importance(feature_importance)
    
    return model, scaler

# Example usage
model, scaler = main(df)

Classification Report:
              precision    recall  f1-score   support

           0       0.71      1.00      0.83       143
           1       0.00      0.00      0.00        59

    accuracy                           0.71       202
   macro avg       0.35      0.50      0.41       202
weighted avg       0.50      0.71      0.59       202


Feature Importances:
              feature  importance
7         time_of_day    0.174561
6         hour_of_day    0.143639
2             channel    0.137303
3  sessiondurationsec    0.113514
1              device    0.110980
0              region    0.061888
4               month    0.041654
5                 day    0.028312


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[feature].fillna(data[feature].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[feature].fillna(data[feature].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on whi

In [20]:
model, scaler = main(df_removed)

Classification Report:
              precision    recall  f1-score   support

           0       0.69      1.00      0.82       139
           1       0.00      0.00      0.00        62

    accuracy                           0.69       201
   macro avg       0.35      0.50      0.41       201
weighted avg       0.48      0.69      0.57       201


Feature Importances:
              feature  importance
1              device    0.150352
3  sessiondurationsec    0.147958
7         time_of_day    0.099313
6         hour_of_day    0.081743
0              region    0.076930
4               month    0.024461
2             channel    0.006443
5                 day    0.005996


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[feature].fillna(data[feature].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[feature].fillna(data[feature].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on whi

In [22]:
(df['order_dt'] - df['session_start']).dt.total_seconds().unique()

array([ 0.00000e+00,          nan,  5.40000e+01,  2.00000e+00,
        6.00000e+00,  1.58000e+02,  5.00000e+00,  1.20000e+01,
        1.70000e+01,  3.00000e+00,  3.50000e+01,  2.80000e+01,
        1.80000e+01,  4.00000e+00,  7.00000e+00,  6.90000e+01,
        1.60000e+01, -1.48503e+05,  1.29000e+02,  1.50000e+01,
        7.50000e+01,  1.10000e+01,  2.00000e+01,  9.00000e+00,
        8.00000e+00,  1.30000e+01,  2.10000e+01,  4.90000e+01,
        3.30000e+01,  4.50000e+01,  6.80000e+01,  2.20000e+01,
        7.30000e+01,  4.40000e+01,  5.60000e+01,  3.80000e+01,
        8.70000e+01,  4.00000e+01,  6.40000e+01,  3.70000e+01,
        7.20000e+01,  2.40000e+01,  2.90000e+01,  3.10000e+01,
        3.90000e+01,  2.30000e+01,  3.40000e+01,  4.10000e+01,
        1.90000e+01,  7.40000e+01])

In [25]:
df[df['order_dt'].notna()].head(30)[['session_start', 'order_dt']]

Unnamed: 0,session_start,order_dt
0,2019-05-01 00:06:40,2019-05-01 00:06:40
20,2019-05-06 15:49:16,2019-05-06 15:49:16
28,2019-05-07 12:00:49,2019-05-07 12:01:43
33,2019-05-09 16:40:44,2019-05-09 16:40:44
34,2019-05-09 09:23:22,2019-05-09 09:23:24
45,2019-05-12 07:50:12,2019-05-12 07:50:12
49,2019-05-13 22:00:41,2019-05-13 22:00:41
53,2019-05-14 06:32:03,2019-05-14 06:32:03
55,2019-05-15 23:16:53,2019-05-15 23:16:53
56,2019-05-15 15:52:06,2019-05-15 15:52:06
