In [1]:
import json
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report
from pandas import json_normalize
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib_venn import venn2



In [4]:

file_path = '/content/drive/MyDrive/Flockery/train.json'

# Open the file and load the data.
# The encoding is given explicitly so decoding does not depend on the
# platform's default locale.
with open(file_path, 'r', encoding='utf-8') as file:
    data = json.load(file)


In [5]:
# One row per entry of `data`: the dict keys become the 'user_id' column,
# the dict values supply the remaining columns.
df = (
    pd.DataFrame(data.values(), index=data.keys())
    .reset_index()
    .rename(columns={'index': 'user_id'})
)

df

Unnamed: 0,user_id,target,features
0,user_1,female,"{'orders': [{'site-id': 1, 'orders': [{'create..."
1,user_2,female,"{'visits': [{'site-id': 3, 'first-seen': 16962..."
2,user_3,male,"{'orders': [{'site-id': 21, 'orders': [{'creat..."
3,user_4,male,"{'orders': [{'site-id': 2, 'orders': [{'create..."
4,user_5,female,"{'orders': [{'site-id': 39, 'orders': [{'creat..."
...,...,...,...
127750,user_127751,male,"{'orders': [{'site-id': 307, 'orders': [{'crea..."
127751,user_127752,male,"{'orders': [{'site-id': 147, 'orders': [{'crea..."
127752,user_127753,female,"{'orders': [{'site-id': 20, 'orders': [{'creat..."
127753,user_127754,female,"{'orders': [{'site-id': 407, 'orders': [{'crea..."


In [6]:


def process_dataframe(df):
    """Expand the 'features' dicts into columns and add a 'length' column.

    'length' is computed per row from the 'features' dict: every value that
    is a list contributes its number of elements, every other value
    contributes 1.

    The input frame is left untouched; a new frame is returned. The expanded
    feature columns are aligned to ``df.index`` explicitly, so the result is
    correct even when ``df`` does not carry a default 0..n-1 index.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'features' column whose cells are dicts.

    Returns
    -------
    pandas.DataFrame
        The columns of ``df`` without 'features', followed by 'length' and
        one column per key produced by ``pd.json_normalize``.
    """
    def _count_items(features):
        # Lists count by their size, anything else counts as a single item.
        return sum(len(v) if isinstance(v, list) else 1 for v in features.values())

    features = df['features']

    # Expand the dicts, then re-attach the caller's index so the join below
    # matches rows by their real labels.
    expanded = pd.json_normalize(features.tolist())
    expanded.index = df.index

    return (
        df.drop(columns='features')
        .assign(length=features.apply(_count_items))
        .join(expanded)
    )

def remove_outliers(df, column='length', k=1.2, two_sided=False):
    """Drop rows whose value in `column` lies beyond an IQR-based fence.

    The fence width is ``k * (Q3 - Q1)``, where Q1 and Q3 are the 0.25 and
    0.75 quantiles of ``df[column]``. By default only the upper fence
    ``Q3 + k * IQR`` is applied, so unusually small values are kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter; it is not modified.
    column : str, default 'length'
        Column the quantiles and the filter are computed on.
    k : float, default 1.2
        Multiplier of the interquartile range.
    two_sided : bool, default False
        When True, rows below ``Q1 - k * IQR`` are dropped as well.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` that pass the filter, with their original index.
    """
    values = df[column]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    fence = k * (q3 - q1)

    keep = values <= q3 + fence
    if two_sided:
        keep = keep & (values >= q1 - fence)
    return df[keep]


def explode_column(df, level_labels):
    """Flatten nested list-of-dict columns, one nesting level per label.

    For each name in `level_labels`, in order:
      1. explode the column of that name so every list element gets its own
         row (the index is renumbered);
      2. expand the resulting values with ``pd.json_normalize`` and join the
         new columns in place of the exploded one.

    The exploded column is renamed with an '_old' suffix before the join, so
    an expanded key with the same name as the column (e.g. 'orders' nested
    inside 'orders') does not clash with it.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the first column named in `level_labels`.
    level_labels : iterable of str
        Column names to flatten, outermost level first.

    Returns
    -------
    pandas.DataFrame
        The flattened frame.
    """
    flat = df
    for label in level_labels:
        parked = label + '_old'
        exploded = flat.explode(label, ignore_index=True).rename(columns={label: parked})
        expanded = pd.json_normalize(exploded[parked])
        flat = exploded.join(expanded).drop(columns=parked)
    return flat

# Expand the raw features, drop outliers separately within each 'target'
# group, then renumber the rows so the index is a plain 0..n-1 range again.
df_filtered = (
    process_dataframe(df)
    .groupby('target', group_keys=False)
    .apply(remove_outliers)
    .reset_index(drop=True)
)

df_filtered


Unnamed: 0,user_id,target,length,orders,site-meta,visits,last-visits-in-categories,exchange-sessions
0,user_1,female,3,"[{'site-id': 1, 'orders': [{'created-at': 1634...",[{'site-id': 2}],,,
1,user_2,female,11,,"[{'site-id': 4}, {'site-id': 5}, {'site-id': 6...","[{'site-id': 3, 'first-seen': 1696277805, 'las...","[{'category': 'other', 'last-visit-at': 169627...",
2,user_5,female,35,"[{'site-id': 39, 'orders': [{'created-at': 164...","[{'site-id': 42}, {'site-id': 43}, {'site-id':...","[{'site-id': 3, 'first-seen': 1693796766, 'las...","[{'category': 'insurance', 'last-visit-at': 16...",
3,user_6,female,53,"[{'site-id': 123, 'orders': [{'created-at': 16...","[{'site-id': 146, 'recency': 1, 'frequency': 1...","[{'site-id': 21, 'first-seen': 1698263297, 'la...","[{'category': 'furniture', 'last-visit-at': 16...","[{'landed-at': 1698263812, 'sites': [169, 214,..."
4,user_7,female,20,,[{'site-id': 20}],"[{'site-id': 225, 'first-seen': 1690061422, 'l...","[{'category': 'hypermarket', 'last-visit-at': ...",
...,...,...,...,...,...,...,...,...
119546,user_127746,male,10,"[{'site-id': 173, 'orders': [{'created-at': 14...","[{'site-id': 173, 'recency': 1, 'frequency': 1...","[{'site-id': 3, 'first-seen': 1697040818, 'las...","[{'category': 'entertainment_services', 'last-...",
119547,user_127748,male,16,"[{'site-id': 348, 'orders': [{'created-at': 16...","[{'site-id': 348}, {'site-id': 7}]","[{'site-id': 3, 'first-seen': 1698553273, 'las...","[{'category': 'insurance', 'last-visit-at': 16...",
119548,user_127750,male,11,"[{'site-id': 41, 'orders': [{'created-at': 169...",[{'site-id': 41}],"[{'site-id': 206, 'first-seen': 1698818208, 'l...","[{'category': 'hypermarket', 'last-visit-at': ...",
119549,user_127751,male,11,"[{'site-id': 307, 'orders': [{'created-at': 16...","[{'site-id': 16, 'recency': 1, 'frequency': 1,...","[{'site-id': 3, 'first-seen': 1697627366, 'las...","[{'category': 'other', 'last-visit-at': 169762...",


In [7]:
# Persist the filtered per-user frame for later stages.
df_filtered.to_parquet(path='/content/drive/MyDrive/Flockery/train.parquet')

In [8]:
# Keep only the columns needed for the orders table
df_orders = df_filtered.loc[:, ['user_id', 'target', 'orders', 'length']].copy()

# Nesting levels to unfold, outermost first:
# 'orders' -> nested 'orders' -> 'items'
levels_labels = ['orders', 'orders', 'items']

# Flatten the nested order structure level by level
df_orders_flat = explode_column(df_orders, levels_labels)

df_orders_flat

Unnamed: 0,user_id,target,length,site-id,created-at,id,count,general-category-path,brand-id
0,user_1,female,3,1.0,1.634292e+09,,,,
1,user_1,female,3,2.0,1.639827e+09,,,,
2,user_1,female,3,2.0,1.639828e+09,,,,
3,user_2,female,11,,,,,,
4,user_5,female,35,39.0,1.644809e+09,,,,
...,...,...,...,...,...,...,...,...,...
2122025,user_127751,male,11,16.0,1.648560e+09,item_5409133,1.0,"[90590, 12327586, 90579, 198118]",1446.0
2122026,user_127751,male,11,16.0,1.649580e+09,item_56817,1.0,"[14333188, 10604398, 91009]",3503.0
2122027,user_127752,male,13,147.0,1.612968e+09,,,,
2122028,user_127752,male,13,147.0,1.612969e+09,,,,


In [9]:
# Persist the flattened orders table for later stages.
df_orders_flat.to_parquet(path='/content/drive/MyDrive/Flockery/df_orders_flat.parquet')