In [1]:
import pandas as pd
import polars as pl
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Hide UserWarning messages for the rest of the session.
warnings.filterwarnings("ignore", category=UserWarning)

# Pandas options used by this notebook, applied in one pass.
for option_name, option_value in {
    'display.width': 150,
    'future.no_silent_downcasting': True,
    'display.max_columns': 1000,
    'display.max_rows': 1000,
}.items():
    pd.set_option(option_name, option_value)

# Switch matplotlib figures to seaborn's default theme.
sns.set()

In [2]:
# Columns that read_csv should parse as datetimes.
dates = ['session_start', 'session_end', 'session_date', 'order_dt']
df = pd.read_csv('ecom_go_2.csv', parse_dates=dates)

# ISO week number taken from each row's session_date.
session_iso_calendar = pd.to_datetime(df['session_date']).dt.isocalendar()
df['week'] = session_iso_calendar['week']

# Polars copy of the same frame.
df_pl = pl.from_pandas(df)

# Frame dimensions first, then the list of column names.
print(df.shape, df.columns.tolist(), sep='\n')

(1009, 19)
['user_id', 'region', 'device', 'channel', 'session_start', 'session_end', 'sessiondurationsec', 'session_date', 'month', 'day', 'hour_of_day', 'order_dt', 'revenue', 'payment_type', 'promo_code', 'final_price', 'time_of_day', 'payer', 'week']


In [3]:
# Variant 1: the data exactly as loaded.
df_unchanged = df.copy()
print(df_unchanged.shape, df_unchanged['payer'].sum())

# Variant 2: the two special revenue values are swapped for substitutes
# (100000 -> 9999, 1 -> 4999); every other revenue is left as it is.
df_replaced = df.copy()
df_replaced['revenue'] = np.where(
    df_replaced['revenue'] == 100000,
    9999,
    np.where(df_replaced['revenue'] == 1, 4999, df_replaced['revenue']),
)
print(df_replaced.shape, df_replaced['payer'].sum())

# Variant 3: rows carrying either special revenue value are dropped.
to_remove = df.index[df['revenue'].isin([1, 100000])]
df_removed = df.drop(index=to_remove)
print(df_removed.shape, df_removed['payer'].sum())


# (frame, heading) pairs for the three variants.
dfs = [
    (df_unchanged, "ORIGINAL DATAFRAME:"),
    (df_replaced, "REPLACED DATAFRAME:"),
    (df_removed, "REMOVED DATAFRAME:"),
]

(1009, 19) 282
(1009, 19) 282
(1003, 19) 276


In [None]:
# Preview the first rows of the loaded frame.
df.head()

In [None]:
# Column dtypes and non-null counts of the outlier-replaced variant.
df_replaced.info()

In [None]:
# Distinct values present in the hour_of_day column.
df.hour_of_day.unique()

In [None]:
# Distinct values present in the time_of_day column.
df.time_of_day.unique()

# Predict Daily Revenue

In [None]:
# Category order passed to OrdinalEncoder(categories=[channel_order]): a
# channel's position in this list becomes its encoded integer (0 for the
# first entry, 4 for the last).
# NOTE(review): the reasoning behind this particular ranking is not visible
# in the notebook — confirm it is intentional before reading the encoded
# value as an ordered quantity.
channel_order = [
    'organic', 
    'email-рассылки', 
    'реклама у блогеров', 
    'социальные сети', 
    'контекстная реклама'
]

In [None]:
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

# Column groups, one per encoding strategy.
nominal_columns = ['region', 'device', 'payment_type']
numeric_columns = [
    'is_weekend', 'day', 'month',
    'hour_of_day', 'sessiondurationsec',
    'time_to_purchase', 'promo_code', 'payer',
]

# Every column handed to the preprocessor.
columns_to_keep = [
    'region', 'device', 'channel', 'payment_type',
    'is_weekend', 'day', 'month', 
    'hour_of_day', 'sessiondurationsec', 'time_to_purchase',
    'promo_code', 'payer' 
]

# Derived columns and missing-value fills. These write onto `df` itself, so
# later cells see the added/filled columns.
seconds_to_order = (df['order_dt'] - df['session_start']).dt.total_seconds()
df['time_to_purchase'] = seconds_to_order.fillna(-1)  # -1 marks "no order timestamp"
# NOTE(review): assumes `day` is a 1-7 day-of-week code where 6 and 7 are the
# weekend — confirm against the data.
df['is_weekend'] = df['day'].isin([6, 7]).astype(int)
df['promo_code'] = df['promo_code'].fillna(0)
df['payment_type'] = df['payment_type'].fillna('No Purchase')

# channel -> single ordinal column, nominal columns -> one-hot,
# numeric columns -> standardised.
preprocessor = ColumnTransformer(
    transformers=[
        ('channel', OrdinalEncoder(categories=[channel_order]), ['channel']),
        ('nominal_cat', OneHotEncoder(handle_unknown='ignore'), nominal_columns),
        ('numeric', StandardScaler(), numeric_columns),
    ],
    remainder='passthrough',
)

X = df[columns_to_keep]
y = df['revenue']

# NOTE(review): the encoders and the scaler are fitted on every row of `df`
# here, i.e. before any train/test split is made. `payer`, `payment_type` and
# `time_to_purchase` also look like they are only known once an order exists —
# confirm they are legitimate predictors of revenue.
X_transformed = preprocessor.fit_transform(X)

print(X_transformed.shape)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Hold out 20% of the rows for testing; random_state fixes the split.
# The target `y` is the per-row `revenue` column of `df`.
X_train, X_test, y_train, y_test = train_test_split(X_transformed, y, test_size=0.2, random_state=42)

# Fit an ordinary least-squares model on the training rows.
model = LinearRegression()
model.fit(X_train, y_train)

# Predict revenue for the held-out rows.
y_pred = model.predict(X_test)

# Score the held-out predictions.
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error:", mse)
print("R-squared Score:", r2)

# Build one label per column of the preprocessor's output, in output order.
def get_feature_names(column_transformer):
    """Return the output column names of the fitted preprocessing transformer.

    The names follow the transformer order used when the preprocessor was
    built: the ordinal-encoded channel, then the one-hot columns, then the
    scaled numeric columns.

    Parameters
    ----------
    column_transformer : fitted ColumnTransformer-like object
        Must expose ``named_transformers_['nominal_cat']`` with the fitted
        ``categories_`` and ``feature_names_in_`` attributes.

    Returns
    -------
    list of str
        One name per output column.
    """
    # An ordinal encoder maps `channel` to a single integer column, so it
    # contributes exactly one name. Emitting one name per category would shift
    # every following label away from its coefficient.
    feature_names = ['channel']

    # One-hot encoder: one output column per (input column, category) pair.
    onehot = column_transformer.named_transformers_['nominal_cat']
    for source_column, categories in zip(onehot.feature_names_in_, onehot.categories_):
        feature_names.extend(f'{source_column}_{category}' for category in categories)

    # Scaled numeric columns keep their original names.
    feature_names.extend([
        'is_weekend', 'day', 'month', 'hour_of_day',
        'sessiondurationsec', 'time_to_purchase', 'promo_code', 'payer',
    ])

    return feature_names

# Labels for the transformed columns.
feature_names = get_feature_names(preprocessor)

# Create DataFrame of coefficients
import pandas as pd
import numpy as np

# Pair each coefficient with a label and rank by absolute size.
# NOTE(review): the slice trims the label list to the coefficient count; if
# the two lengths differ, labels and coefficients may not line up.
coefficients = pd.DataFrame(
    {
        'feature': feature_names[:len(model.coef_)],
        'coefficient': model.coef_,
    }
).sort_values('coefficient', key=abs, ascending=False)

print("\nTop 10 Most Important Features:")
print(coefficients.head(10))

# Additional model diagnostics
print("\nIntercept:", model.intercept_)

In [None]:
# Rebuild column labels for X_transformed from the fitted preprocessor.
# This expects `preprocessor` to be the ColumnTransformer that produced
# X_transformed (transformers 'channel', 'nominal_cat', 'numeric').

# The ordinal encoder yields a single integer column for `channel`.
channel_encoded_names = ['channel']

# The one-hot encoder was fitted on ['region', 'device', 'payment_type'], so
# its categories_ holds one array per column in that order.
nominal_encoder = preprocessor.named_transformers_['nominal_cat']
region_encoded_names, device_encoded_names, payment_type_encoded_names = (
    list(categories) for categories in nominal_encoder.categories_
)

# Combine encoded feature names
encoded_feature_names = (
    channel_encoded_names + 
    [f'region_{r}' for r in region_encoded_names] + 
    [f'device_{d}' for d in device_encoded_names] + 
    [f'payment_type_{p}' for p in payment_type_encoded_names]
)

# Scaled numeric columns keep their original names.
remaining_feature_names = [
    'is_weekend', 'day', 'month', 
    'hour_of_day', 'sessiondurationsec', 'time_to_purchase',
    'promo_code', 'payer'
]

# Combine all feature names
all_feature_names = encoded_feature_names + remaining_feature_names

# ColumnTransformer may return a sparse matrix when one-hot output dominates;
# densify before wrapping in a DataFrame.
X_transformed_dense = (
    X_transformed.toarray() if hasattr(X_transformed, 'toarray') else X_transformed
)

# Convert to DataFrame
X_transformed_df = pd.DataFrame(
    X_transformed_dense,
    columns=all_feature_names,
    index=X.index
)
X_transformed_df

In [None]:
# Distinct gaps (in seconds) between order_dt and session_start, with missing
# gaps shown as -1.
(df['order_dt'] - df['session_start']).dt.total_seconds().fillna(-1).unique()

In [None]:
# Rows whose time_to_purchase is still missing. The preprocessing cell fills
# missing values with -1, so this is expected to be empty once that cell has run.
df[df['time_to_purchase'].isna()]

### `Encoding Categories`

In [4]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer

def prepare_dataframe(df):
    """Return `df` restricted to the columns used for encoding and modelling.

    The columns come back in the fixed order listed below; a missing column
    raises ``KeyError``.
    """
    wanted_columns = (
        'user_id', 'region', 'device', 'channel',
        'session_date', 'sessiondurationsec', 'hour_of_day',
        'day', 'month', 'revenue',
        'payment_type', 'promo_code', 'payer',
    )
    return df[list(wanted_columns)]

def create_feature_encoder():
    """Build the (unfitted) ColumnTransformer that encodes the categoricals.

    `channel` is ordinal-encoded using the fixed order below; `region`,
    `device` and `payment_type` are each one-hot encoded with their first
    category dropped. All other columns pass through untouched.
    """
    channel_order = [
        'organic',
        'email-рассылки',
        'реклама у блогеров',
        'социальные сети',
        'контекстная реклама',
    ]

    transformers = [
        ('channel_encoder', OrdinalEncoder(categories=[channel_order]), ['channel']),
    ]
    # One independent one-hot encoder per nominal column.
    for column in ('region', 'device', 'payment_type'):
        transformers.append(
            (f'{column}_encoder',
             OneHotEncoder(drop='first', sparse_output=False),
             [column])
        )

    return ColumnTransformer(transformers=transformers, remainder='passthrough')

# Encode the outlier-replaced variant.
df_prepared = prepare_dataframe(df_replaced)
preprocessor = create_feature_encoder()
X_encoded = preprocessor.fit_transform(df_prepared)

# Labels for the one-hot columns. Each encoder drops its first category, so
# that category is skipped here as well.
one_hot_names = [
    f'{column}_{category}'
    for column in ('region', 'device', 'payment_type')
    for category in preprocessor.named_transformers_[f'{column}_encoder'].categories_[0][1:]
]

# Columns the transformer passes through, in their original order.
passthrough_names = [
    column for column in df_prepared.columns
    if column not in ('region', 'device', 'channel', 'payment_type')
]

feature_names = ['channel_encoded'] + one_hot_names + passthrough_names

df_encoded = pd.DataFrame(X_encoded, columns=feature_names, index=df_prepared.index)

def engineer_features(df_encoded):
    """Return a copy of `df_encoded` with derived feature columns added.

    Added columns:
      * ``is_weekend`` – 1 where ``day`` is 6 or 7, else 0.
      * ``is_peak_hour`` – 1 where 9 <= ``hour_of_day`` < 17, else 0.
      * ``session_duration_category`` – 0/1/2 bucket of ``sessiondurationsec``
        split at its 33rd and 67th percentiles.
      * ``<col>_<other>_interaction`` – product of ``channel_encoded`` and of
        every one-hot region column with ``hour_of_day`` and with ``day``.

    The input frame is left untouched, and the region columns are discovered
    from the frame itself (columns named ``region_*``), so the function does
    not depend on any notebook-level encoder object.

    Parameters
    ----------
    df_encoded : pandas.DataFrame
        Must contain ``day``, ``hour_of_day``, ``sessiondurationsec`` and
        ``channel_encoded``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the extra columns.
    """
    features = df_encoded.copy()

    # Collect region dummies before any interaction column is created;
    # previously generated interaction columns are excluded so the function
    # can be applied to its own output.
    region_columns = [
        column for column in features.columns
        if isinstance(column, str)
        and column.startswith('region_')
        and not column.endswith('_interaction')
    ]

    # Weekday vs weekend
    features['is_weekend'] = features['day'].isin([6, 7]).astype(int)

    # Peak hours: 9 inclusive to 17 exclusive
    hour = features['hour_of_day']
    features['is_peak_hour'] = ((hour >= 9) & (hour < 17)).astype(int)

    # Session duration buckets split at the 33rd / 67th percentiles
    duration_percentiles = features['sessiondurationsec'].quantile([0.33, 0.67])
    features['session_duration_category'] = pd.cut(
        features['sessiondurationsec'],
        bins=[-float('inf'), duration_percentiles[0.33],
              duration_percentiles[0.67], float('inf')],
        labels=[0, 1, 2]
    )

    # Interaction features
    for encoded_column in ['channel_encoded'] + region_columns:
        for base_column in ('hour_of_day', 'day'):
            features[f'{encoded_column}_{base_column}_interaction'] = (
                features[encoded_column] * features[base_column]
            )

    return features

# Rebind df_encoded to the frame returned by engineer_features.
df_encoded = engineer_features(df_encoded)
# Disabled feature ideas — none of the lines below run:
# df_encoded['hour_of_day'] = df_encoded['hour_of_day'].astype(int)
# # Average session duration per channel
# channel_avg_duration = df_encoded.groupby('channel_encoded')['sessiondurationsec'].transform('mean')
# df_encoded['channel_avg_session_duration'] = channel_avg_duration
# # Convert hour to sine/cosine to capture cyclical nature
# df_encoded['hour_sin'] = np.sin(df_encoded['hour_of_day'] * (2 * np.pi / 24))
# df_encoded['hour_cos'] = np.cos(df_encoded['hour_of_day'] * (2 * np.pi / 24))
# # Frequency of payers per channel
# channel_payer_freq = df_encoded.groupby('channel_encoded')['payer'].transform('mean')
# df_encoded['channel_payer_frequency'] = channel_payer_freq

In [None]:
# DataFrame.corr() no longer drops non-numeric columns on its own, and
# df_encoded still carries the session_date timestamps. Drop that column and
# cast the rest to float so the correlation matrix can be computed.
# NOTE(review): this assumes every remaining column holds numeric values.
corr_input = df_encoded.drop(columns=['session_date'], errors='ignore').astype(float)

plt.figure(figsize=(40, 40))
sns.heatmap(corr_input.corr(), annot=True, cmap='coolwarm', fmt='.2f', annot_kws={"fontsize": 20})
plt.title('Feature Correlation Heatmap')
plt.tick_params(axis='x', labelsize=20)
plt.tick_params(axis='y', labelsize=20)
plt.show()

In [5]:
# Total revenue per session_date, returned as a two-column frame
# (session_date, revenue) with one row per date.
daily_revenue = df_encoded.groupby('session_date')['revenue'].sum().reset_index()

In [6]:
def create_daily_features(df_encoded):
    """Aggregate session-level rows into one feature row per `session_date`.

    Output columns: ``session_date``, ``avg_session_duration``,
    ``total_session_duration``, ``avg_channel``, ``avg_hour``, ``avg_day``,
    ``paying_users``, ``prev_day_revenue``, ``day_of_week``.

    ``prev_day_revenue`` is the summed revenue of the previous row's date,
    computed from `df_encoded` itself, so the function no longer depends on a
    notebook-level ``daily_revenue`` frame being defined and in sync. Rows
    containing any missing value are dropped, which removes the first date
    because it has no predecessor.

    Parameters
    ----------
    df_encoded : pandas.DataFrame
        Must contain ``session_date``, ``sessiondurationsec``,
        ``channel_encoded``, ``hour_of_day``, ``day``, ``payer`` and
        ``revenue``.

    Returns
    -------
    pandas.DataFrame
    """
    sessions_by_date = df_encoded.groupby('session_date')

    # Daily aggregations
    daily_features = sessions_by_date.agg({
        'sessiondurationsec': ['mean', 'sum'],
        'channel_encoded': 'mean',
        'hour_of_day': 'mean',
        'day': 'mean',
        'payer': 'sum'  # Number of paying users
    }).reset_index()

    # Flatten column names
    daily_features.columns = ['session_date', 'avg_session_duration', 'total_session_duration',
                               'avg_channel', 'avg_hour', 'avg_day', 'paying_users']

    # Lagged revenue: both aggregations come from the same groupby, so their
    # rows are in the same date order and can be matched by position.
    # NOTE(review): "previous day" means the previous date present in the
    # data; a gap in the calendar is not detected.
    revenue_by_date = sessions_by_date['revenue'].sum()
    daily_features['prev_day_revenue'] = revenue_by_date.shift(1).to_numpy()

    # Day of week feature (Monday=0)
    daily_features['day_of_week'] = pd.to_datetime(daily_features['session_date']).dt.dayofweek

    return daily_features.dropna()  # Remove first row due to lag

In [7]:
# Build the per-date feature rows, then attach each date's summed revenue
# (the prediction target) by joining on session_date.
daily_features = create_daily_features(df_encoded)
daily_features = daily_features.merge(daily_revenue, on='session_date')

In [8]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Features are every daily column except the date key and the target.
X = daily_features.drop(['session_date', 'revenue'], axis=1)
y = daily_features['revenue']

# Hold out 20% of the daily rows; random_state fixes the split.
# NOTE(review): the rows are dates and include a lagged-revenue feature, but
# the split is not chronological — confirm a random split is intended.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardise features; the scaler is fitted on the training rows only and
# then applied to the test rows.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [9]:
def evaluate_model(model, X_train, X_test, y_train, y_test):
    """Fit `model` on the training data, print test-set metrics, return it.

    Prints the mean absolute error, mean squared error and R-squared of the
    predictions on ``X_test`` against ``y_test``. The returned object is the
    same `model` instance, now fitted.
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    # Compute every metric before printing anything.
    report = (
        ("Mean Absolute Error:", mean_absolute_error(y_test, predictions)),
        ("Mean Squared Error:", mean_squared_error(y_test, predictions)),
        ("R-squared Score:", r2_score(y_test, predictions)),
    )
    for label, value in report:
        print(label, value)

    return model

# Candidate regressors, keyed by the heading printed above their metrics.
models = {
    'Linear Regression': LinearRegression(),
    'Ridge Regression': Ridge(alpha=1.0),
    'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42)
}

# Running record of the fitted model with the highest test-set R-squared.
best_model, best_r2 = None, -float('inf')

for name, model in models.items():
    print(f"\n{name} Results:")
    trained_model = evaluate_model(model, X_train_scaled, X_test_scaled, y_train, y_test)

    # Re-score on the test set to compare against the current best.
    r2 = r2_score(y_test, trained_model.predict(X_test_scaled))
    if r2 <= best_r2:
        continue
    best_model, best_r2 = trained_model, r2


Linear Regression Results:
Mean Absolute Error: 1269.6609772249953
Mean Squared Error: 6669599.663132971
R-squared Score: 0.9358713849707991

Ridge Regression Results:
Mean Absolute Error: 1286.227750436165
Mean Squared Error: 6889170.429294408
R-squared Score: 0.9337601984159798

Random Forest Results:
Mean Absolute Error: 1222.4108108108107
Mean Squared Error: 6208740.8253999995
R-squared Score: 0.9403025713208806


In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Re-apply datetime conversion to both timestamp columns (they were already
# requested as datetimes via parse_dates when the CSV was read).
df['order_dt'] = pd.to_datetime(df['order_dt'])
df['session_start'] = pd.to_datetime(df['session_start'])

# Gap between order and session start, in seconds (NaN where either is missing).
time_diff_seconds = (df['order_dt'] - df['session_start']).dt.total_seconds()
# Print every distinct gap value.
print(time_diff_seconds.unique().tolist())
# Disabled histogram of the same gaps — none of the lines below run:
# sns.histplot(time_diff_seconds)  # kde=True adds a kernel density estimate for smoothness
# plt.title("Time Difference in Seconds between Order and Session Start")
# plt.xlabel("Time Difference (seconds)")
# plt.ylabel("Frequency")
# plt.show()


In [None]:
# Seven hours expressed in seconds.
7 * 3600

In [None]:
# Longest session expressed in hours. An hour is 3600 seconds; the previous
# divisor of 3660 understated every value. Series.max() also skips missing
# durations instead of returning NaN.
(df['sessiondurationsec'] / 3600).max()

In [None]:
# Distinct whole-day offsets between session_date and session_start.
(df['session_date'] - df['session_start']).dt.days.unique()


In [None]:
# All rows belonging to one specific user_id.
df[df['user_id'] == 324558127766]

In [None]:
# Histogram of the order-minus-session-start gap, in seconds.
(df['order_dt'] - df['session_start']).dt.total_seconds().hist()

In [None]:
# Session length in seconds, computed from `df`.
time_diff_seconds = (df['session_end'] - df['session_start']).dt.total_seconds()
# Rows whose length is not >= 0: negative lengths and missing values alike.
# The mask comes from `df` but selects rows of `df_replaced`.
df_replaced[~(time_diff_seconds >= 0)]

In [None]:
# Order-minus-session-start gap in seconds, computed from `df`
# (this rebinds time_diff_seconds).
time_diff_seconds = (df['order_dt'] - df['session_start']).dt.total_seconds()
# Rows with a gap that is present but not >= 0, i.e. the order precedes the
# session start. `~` binds tighter than `&`, so the negation applies only to
# the `>= 0` test.
(df_replaced[~(time_diff_seconds >= 0) & (time_diff_seconds.notna())])

In [None]:
# Flag rows where the order timestamp is earlier than the session start,
# then display them.
mask = df['order_dt'] < df['session_start']
df[mask]

In [None]:
# For every flagged row, move order_dt onto the calendar date of that row's
# own session_start while keeping the order's time of day. The earlier version
# took the year/month/day of the first flagged row and applied it to all of them.
# NOTE(review): an order whose time of day is earlier than its session's start
# time will still precede the session after this correction.
order_time_of_day = df.loc[mask, 'order_dt'] - df.loc[mask, 'order_dt'].dt.normalize()
df.loc[mask, 'order_dt'] = df.loc[mask, 'session_start'].dt.normalize() + order_time_of_day


In [None]:
# Show the flagged rows again (the mask itself is not recomputed here).
df[mask]

In [None]:
# Number of distinct values in time_diff_seconds (a missing value counts once).
# NOTE(review): time_diff_seconds was last assigned before order_dt was
# edited, so it may not reflect the corrected timestamps.
time_diff_seconds.unique().size

In [None]:
# Number of rows with a non-missing gap.
time_diff_seconds[time_diff_seconds.notna()].size