In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from collections import defaultdict
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import FunctionTransformer
from sklearn.calibration import CalibratedClassifierCV

# Apply seaborn's default plot theme to all matplotlib figures in this notebook.
sns.set()
# Remove the column-count cap so wide frames are displayed without truncation.
pd.set_option("display.max_columns", None)

In [None]:
import json
import pandas as pd

# Read the offer feature file. The encoding is set explicitly because JSON is
# UTF-8 by specification, while open() would otherwise fall back to the
# platform's locale encoding.
with open('data/offer_features.json', 'r', encoding='utf-8') as file:
    parsed_json = json.load(file)

# One row per entry of each record's 'features' list, with the record's 'id'
# carried along as a metadata column.
df_offer = pd.json_normalize(parsed_json, 'features', ['id'])

In [None]:
# Read the context history file. The encoding is set explicitly because JSON is
# UTF-8 by specification, while open() would otherwise fall back to the
# platform's locale encoding.
with open('data/context_history.json', 'r', encoding='utf-8') as file:
    parsed_json = json.load(file)

# Flatten nested keys into dotted column names (e.g. 'context.booking.param1').
df_context = pd.json_normalize(parsed_json)

In [None]:
# Analytical base table: every context row, enriched with the features of its
# offer (left join, so context rows without a matching offer are kept).
abt = df_context.merge(
    df_offer,
    how='left',
    left_on='offer_id',
    right_on='id',
)

In [None]:
logs_start = ""
# Name of the target column as it appears in the table.
target = 'cliked'

# Share of each target value relative to the total number of rows.
class_counts = abt[target].value_counts()
class_counts.div(len(abt))

Próba jest zbalansowana.

In [None]:
# Mean of the target for each offer.
abt.groupby("offer_id")[target].mean()

In [None]:
# (rows, columns) of the merged table.
abt.shape

In [None]:
# Names of all columns currently in the table.
zmienne = list(abt.columns)
zmienne

In [None]:
# zmienne

In [None]:
# Parse the request timestamp so the .dt accessor is available.
timestamp_format = '%Y-%m-%d %H:%M:%S'
abt['request_dttm'] = pd.to_datetime(abt['request_dttm'], format=timestamp_format)

# Mean of the target per calendar month ("YYYY-MM") of the request.
request_month = abt["request_dttm"].dt.strftime("%Y-%m")
data = abt.groupby(request_month)[target].mean()
data.plot.bar()

Zmienna celu jest stabilna w czasie.

In [None]:
# NOTE(review): this list of column names is only displayed, never assigned —
# presumably a shortlist of parameters for later use; confirm and bind it to a name.
['context.booking.param1','context.booking.param2','context.booking.param3','context.booking.param5','context.booking.param6',
'context.booking.param7','context.booking.param8','context.leg.param5','context.leg.param7','context.leg.param9']

In [None]:
# zmienna datowa
def _add_date_feature(df: pd.DataFrame) -> pd.DataFrame:
    """cechy dotyczące daty"""
    col ='request_dttm'
    df[col] = pd.to_datetime(abt[col], format='%Y-%m-%d %H:%M:%S')
    df["data_year"]= df[col].dt.year
    df["data_month"]= df[col].dt.month
    df["data_day"]= df[col].dt.day
    today = pd.to_datetime(datetime.now().strftime('%Y-%m-%d %H:%M:%S'), format='%Y-%m-%d %H:%M:%S')
    df['days_from_today'] = (df[col] - today).dt.days
    df = df.drop(columns=col)
    return df

In [None]:
# Not re-runnable: _add_date_feature drops 'request_dttm', so executing this
# cell a second time raises KeyError on the missing column.
abt = _add_date_feature(abt)

Zmienne context.booking.param

In [None]:
# Summary statistics and a histogram for context.booking.param1 .. param16.
booking_params = [f"context.booking.param{idx}" for idx in range(1, 17)]

for param_name in booking_params:
    column = abt[param_name]
    print(param_name)
    print(column.describe())
    column.hist(bins=20)
    plt.xlim(0, 15)  # restrict the x-axis to the 0-15 range
    plt.show()

In [None]:
# Overlaid histograms of log(context.booking.param5), one per target class.
class_colors = {0: "cornflowerblue", 1: "firebrick"}

for klass, colour in class_colors.items():
    log_param5 = abt.loc[abt["cliked"] == klass, 'context.booking.param5'].apply(np.log)
    sns.histplot(log_param5, bins=10, label=str(klass), color=colour)

plt.legend();

In [None]:
# Distribution of log(context.booking.param5) within each target class.
log_param5 = abt['context.booking.param5'].apply(np.log)
sns.boxplot(x=abt[target], y=log_param5)

Zmienne context.leg.param

In [None]:
# Summary statistics and a histogram for context.leg.param1 .. param15;
# index 11 is skipped.
leg_params = [f"context.leg.param{idx}" for idx in range(1, 16) if idx != 11]

for param_name in leg_params:
    column = abt[param_name]
    print(param_name)
    print(column.describe())
    column.hist(bins=20)
    plt.xlim(0, 15)  # restrict the x-axis to the 0-15 range
    plt.show()

Zmienne context.param

In [None]:
# Summary statistics and a histogram for context.param1 .. param3.
context_params = [f"context.param{idx}" for idx in range(1, 4)]

for param_name in context_params:
    column = abt[param_name]
    print(param_name)
    print(column.describe())
    column.hist(bins=20)
    plt.xlim(0, 15)  # restrict the x-axis to the 0-15 range
    plt.show()

Zmienne kategoryczne

In [None]:
def _encode_is_banner(df: pd.DataFrame) -> pd.DataFrame:
    col = 'is_banner'
    df[f"{col}_cat"] = df[col].map({'False': 0, 'True': 1})
    df = df.drop(columns=[col])
    return df

In [None]:
# Not re-runnable: _encode_is_banner drops 'is_banner', so executing this
# cell a second time raises KeyError on the missing column.
abt = _encode_is_banner(abt)

In [None]:
# Columns with fewer than three distinct non-null values, excluding the target.
n_levels = abt.nunique()
zm_binarne = [
    name
    for name in abt.columns[n_levels < 3].tolist()
    if name not in ['cliked']
]
zm_binarne

In [None]:
# Frequency of each level for every selected variable.
for zm in zm_binarne:
    level_counts = abt[zm].value_counts()
    print(level_counts)

In [None]:
# Excluded from the binary set: single-level variable.
to_drop = ['context.param2']
zm_binarne = [name for name in zm_binarne if name not in to_drop]

In [None]:
# Share of missing values per binary variable, ascending.
missing_share = abt[zm_binarne].isna().mean()
missing_share.sort_values()

In [None]:
# One bar chart per binary variable: mean of the target within each level.
# The guard skips plotting for an empty list, because matplotlib rejects nrows=0.
if zm_binarne:
    # squeeze=False always returns a 2-D array of Axes; without it a single
    # variable yields a bare Axes object that cannot be indexed.
    fig, axes = plt.subplots(
        nrows=len(zm_binarne),
        ncols=1,
        figsize=(8, 4 * len(zm_binarne)),
        squeeze=False,
    )
    axes = axes.ravel()  # flatten to 1-D, matching the multi-variable layout

    for ax, col in zip(axes, zm_binarne):
        abt.groupby(col)[target].mean().plot.bar(ax=ax)
        ax.set_title(f'Mean {target} by {col}')
        ax.set_ylabel(f'Mean {target}')

    plt.tight_layout()
    plt.show()

In [None]:
# czy istnieje statystycznie istotna zależność?
from scipy.stats import chi2_contingency

def cramers_v(x, y, correction=True):
    """Cramér's V association between two categorical variables.

    Parameters
    ----------
    x, y : array-like
        Categorical observations of equal length; they are cross-tabulated
        with ``pd.crosstab``.
    correction : bool, default True
        Forwarded to ``chi2_contingency``. The default keeps the previous
        results, where SciPy applies Yates' continuity correction to 2x2
        tables; pass ``False`` to base V on the uncorrected chi-square.

    Returns
    -------
    float
        V rounded to 4 decimals, or ``nan`` when the contingency table has
        fewer than two rows or columns (a constant or entirely missing
        variable), where the statistic is undefined.
    """
    contingency_table = pd.crosstab(x, y)
    rows, cols = contingency_table.shape

    # A degenerate table would make the denominator zero (or make
    # chi2_contingency fail on an empty table), so report nan explicitly.
    if min(rows, cols) < 2:
        return np.nan

    chi2, _, _, _ = chi2_contingency(contingency_table, correction=correction)
    n = contingency_table.sum().sum()
    v = np.sqrt(chi2 / (n * (min(rows, cols) - 1)))
    return np.round(v, 4)

In [None]:
# Cramér's V between each binary variable and the target.
for col in zm_binarne:
    association = cramers_v(abt[col], abt[target])
    print(col, association)

In [None]:
# odrzucamy hipotezę o niezależności między zmiennymi a targetem dla każdej zmiennej