In [1]:
import json
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report
from pandas import json_normalize
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib_venn import venn2



In [2]:

file_path = '/content/drive/MyDrive/Flockery/train.json'

# Open the file and load the data.
# The encoding is given explicitly so the result does not depend on the
# platform's default text encoding (JSON text is UTF-8 by specification).
with open(file_path, 'r', encoding='utf-8') as file:
    data = json.load(file)


In [3]:
# Turn the loaded mapping into a table: the mapping's values become the rows,
# its keys become the index, and the index is then moved into a regular
# column that is renamed to 'user_id'.
df = (
    pd.DataFrame(data.values(), index=data.keys())
    .reset_index()
    .rename(columns={'index': 'user_id'})
)

df

Unnamed: 0,user_id,target,features
0,user_1,female,"{'orders': [{'site-id': 1, 'orders': [{'create..."
1,user_2,female,"{'visits': [{'site-id': 3, 'first-seen': 16962..."
2,user_3,male,"{'orders': [{'site-id': 21, 'orders': [{'creat..."
3,user_4,male,"{'orders': [{'site-id': 2, 'orders': [{'create..."
4,user_5,female,"{'orders': [{'site-id': 39, 'orders': [{'creat..."
...,...,...,...
127750,user_127751,male,"{'orders': [{'site-id': 307, 'orders': [{'crea..."
127751,user_127752,male,"{'orders': [{'site-id': 147, 'orders': [{'crea..."
127752,user_127753,female,"{'orders': [{'site-id': 20, 'orders': [{'creat..."
127753,user_127754,female,"{'orders': [{'site-id': 407, 'orders': [{'crea..."


In [4]:


def process_dataframe(df):
    """Expand the dicts stored in the 'features' column into separate columns.

    A 'length' column is added as well: for every row it is the sum, over the
    values of that row's 'features' dict, of ``len(value)`` when the value is
    a list and ``1`` otherwise.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'features' column whose cells are dicts.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same index as ``df``, without 'features', with
        'length' and with one column per key produced by
        ``pd.json_normalize``. The input frame is left unmodified.
    """
    features = df['features'].tolist()

    # Number of elements held by each row's feature dict.
    lengths = [
        sum(len(v) if isinstance(v, list) else 1 for v in feature_dict.values())
        for feature_dict in features
    ]

    # Work on a positionally indexed copy: json_normalize returns rows
    # numbered 0..n-1, so joining on the caller's own index labels would
    # misalign (or blank out) rows whenever that index is not the default one.
    result = df.drop(columns='features').reset_index(drop=True)
    result['length'] = lengths
    result = result.join(pd.json_normalize(features))

    # Restore the caller's index labels.
    result.index = df.index
    return result

def explode_column(df, level_labels):
    """Unfold nested list-of-dict columns, one level per entry of ``level_labels``.

    For each label in order: the column of that name is exploded so every
    list element gets its own row (rows are renumbered from zero), the
    resulting cells are expanded into columns with ``pd.json_normalize``, and
    the exploded source column is dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the column named by the first label.
    level_labels : iterable of str
        Column names to unfold, outermost level first. A name may repeat when
        an unfolded level itself yields a column with that name.

    Returns
    -------
    pandas.DataFrame
        The flattened frame.
    """
    flat = df
    for label in level_labels:
        nested = label + '_old'
        # One row per list element; park the exploded cells under a temporary
        # name so the expanded columns may reuse the original label.
        flat = flat.explode(label, ignore_index=True).rename(columns={label: nested})
        expanded = pd.json_normalize(flat[nested])
        flat = flat.join(expanded).drop(columns=nested)
    return flat


def remove_outliers(df, iqr_multiplier=1.2):
    """Keep the rows whose 'length' does not exceed the upper IQR fence.

    The fence is ``Q3 + iqr_multiplier * (Q3 - Q1)``, where Q1 and Q3 are the
    0.25 and 0.75 quantiles of ``df['length']``. Only the upper fence is
    applied; small values are never removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a numeric 'length' column.
    iqr_multiplier : float, default 1.2
        How many interquartile ranges above Q3 the fence is placed.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` with ``length <= fence``.
    """
    q1, q3 = df['length'].quantile([0.25, 0.75])
    upper_bound = q3 + iqr_multiplier * (q3 - q1)
    return df[df['length'] <= upper_bound]

def with_outliers(df, iqr_multiplier=1.2):
    """Keep only the rows whose 'length' exceeds the upper IQR fence.

    The fence is ``Q3 + iqr_multiplier * (Q3 - Q1)``, where Q1 and Q3 are the
    0.25 and 0.75 quantiles of ``df['length']``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a numeric 'length' column.
    iqr_multiplier : float, default 1.2
        How many interquartile ranges above Q3 the fence is placed.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` with ``length > fence``.
    """
    q1, q3 = df['length'].quantile([0.25, 0.75])
    upper_bound = q3 + iqr_multiplier * (q3 - q1)
    return df[df['length'] > upper_bound]

# Step 1: expand 'features' into columns and compute 'length' on the original frame.
df_filtered = process_dataframe(df)

# Step 2 (comment this step out to work with the full DataFrame): within each
# 'target' group keep the rows that remove_outliers lets through, then
# renumber the rows from zero.
df_orders_flat = (
    df_filtered
    .groupby('target', group_keys=False)
    .apply(remove_outliers)
    .reset_index(drop=True)
)

# Step 3: the complementary per-'target' selection made by with_outliers.
df_orders_flat_out = (
    df_filtered
    .groupby('target', group_keys=False)
    .apply(with_outliers)
    .reset_index(drop=True)
)

# Step 4: keep only the columns needed to unfold the orders.
df_orders = df_orders_flat.loc[:, ['user_id', 'target', 'orders', 'length']].copy()
df_orders1 = df_orders_flat_out.loc[:, ['user_id', 'target', 'orders', 'length']].copy()

# Step 5: column names to unfold, outermost level first. 'orders' appears
# twice because a column of that name is unfolded at two successive levels
# before 'items'.
levels_labels = ['orders', 'orders', 'items']

# Step 6: flatten both selections.
df_orders_flat1 = explode_column(df_orders, level_labels=levels_labels)
df_orders_flat_out1 = explode_column(df_orders1, level_labels=levels_labels)





In [5]:
# Display the flattened orders table (pandas truncates the rendering of large frames).
df_orders_flat1

Unnamed: 0,user_id,target,length,site-id,created-at,id,count,general-category-path,brand-id
0,user_1,female,3,1.0,1.634292e+09,,,,
1,user_1,female,3,2.0,1.639827e+09,,,,
2,user_1,female,3,2.0,1.639828e+09,,,,
3,user_2,female,11,,,,,,
4,user_5,female,35,39.0,1.644809e+09,,,,
...,...,...,...,...,...,...,...,...,...
2122025,user_127751,male,11,16.0,1.648560e+09,item_5409133,1.0,"[90590, 12327586, 90579, 198118]",1446.0
2122026,user_127751,male,11,16.0,1.649580e+09,item_56817,1.0,"[14333188, 10604398, 91009]",3503.0
2122027,user_127752,male,13,147.0,1.612968e+09,,,,
2122028,user_127752,male,13,147.0,1.612969e+09,,,,


In [6]:
# Display the flattened orders table built from the rows selected by with_outliers.
df_orders_flat_out1

Unnamed: 0,user_id,target,length,site-id,created-at,id,count,general-category-path,brand-id
0,user_21,female,116,391.0,1.683183e+09,item_1642,34.0,,
1,user_21,female,116,391.0,1.683283e+09,item_1642,34.0,,
2,user_21,female,116,391.0,1.686145e+09,item_1643,40.0,,
3,user_21,female,116,391.0,1.686592e+09,item_1642,27.0,,
4,user_21,female,116,391.0,1.686592e+09,item_1644,2.0,,
...,...,...,...,...,...,...,...,...,...
1348609,user_127743,male,80,112.0,1.682406e+09,item_1660616,,,
1348610,user_127743,male,80,112.0,1.682520e+09,item_3850967,,,
1348611,user_127743,male,80,112.0,1.682520e+09,item_2811078,,,
1348612,user_127743,male,80,112.0,1.682520e+09,item_1248095,,,


In [7]:
# Write df_orders_flat1 to a Parquet file in the 18_12 folder.
df_orders_flat1.to_parquet('/content/drive/MyDrive/Flockery/18_12/df_orders_flat.parquet')

In [8]:
# Write df_orders_flat_out1 to a Parquet file in the 18_12 folder.
df_orders_flat_out1.to_parquet('/content/drive/MyDrive/Flockery/18_12/df_orders_flat_out.parquet')

In [None]:
# NOTE(review): df_orders_flat is assigned in more than one cell of this
# notebook (once as the per-target filtered, still nested frame and once as
# the fully exploded orders table), so what is displayed here depends on the
# order in which the cells were run - TODO confirm which frame is intended.
df_orders_flat

Unnamed: 0,user_id,target,length,site-id,created-at,id,count,general-category-path,brand-id
0,user_1,female,3,1.0,1.634292e+09,,,,
1,user_1,female,3,2.0,1.639827e+09,,,,
2,user_1,female,3,2.0,1.639828e+09,,,,
3,user_2,female,11,,,,,,
4,user_5,female,35,39.0,1.644809e+09,,,,
...,...,...,...,...,...,...,...,...,...
3189421,user_127751,male,11,16.0,1.648560e+09,item_5409133,1.0,"[90590, 12327586, 90579, 198118]",1446.0
3189422,user_127751,male,11,16.0,1.649580e+09,item_56817,1.0,"[14333188, 10604398, 91009]",3503.0
3189423,user_127752,male,13,147.0,1.612968e+09,,,,
3189424,user_127752,male,13,147.0,1.612969e+09,,,,


In [None]:


def process_dataframe(df):
    """Expand the dicts stored in the 'features' column into separate columns.

    A 'length' column is added as well: for every row it is the sum, over the
    values of that row's 'features' dict, of ``len(value)`` when the value is
    a list and ``1`` otherwise.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'features' column whose cells are dicts.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same index as ``df``, without 'features', with
        'length' and with one column per key produced by
        ``pd.json_normalize``. The input frame is left unmodified.
    """
    features = df['features'].tolist()

    # Number of elements held by each row's feature dict.
    lengths = [
        sum(len(v) if isinstance(v, list) else 1 for v in feature_dict.values())
        for feature_dict in features
    ]

    # Work on a positionally indexed copy: json_normalize returns rows
    # numbered 0..n-1, so joining on the caller's own index labels would
    # misalign (or blank out) rows whenever that index is not the default one.
    result = df.drop(columns='features').reset_index(drop=True)
    result['length'] = lengths
    result = result.join(pd.json_normalize(features))

    # Restore the caller's index labels.
    result.index = df.index
    return result

def explode_column(df, level_labels):
    """Unfold nested list-of-dict columns, one level per entry of ``level_labels``.

    For each label in order: the column of that name is exploded so every
    list element gets its own row (rows are renumbered from zero), the
    resulting cells are expanded into columns with ``pd.json_normalize``, and
    the exploded source column is dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the column named by the first label.
    level_labels : iterable of str
        Column names to unfold, outermost level first. A name may repeat when
        an unfolded level itself yields a column with that name.

    Returns
    -------
    pandas.DataFrame
        The flattened frame.
    """
    flat = df
    for label in level_labels:
        nested = label + '_old'
        # One row per list element; park the exploded cells under a temporary
        # name so the expanded columns may reuse the original label.
        flat = flat.explode(label, ignore_index=True).rename(columns={label: nested})
        expanded = pd.json_normalize(flat[nested])
        flat = flat.join(expanded).drop(columns=nested)
    return flat


def remove_outliers(df, iqr_multiplier=1.2):
    """Keep the rows whose 'length' does not exceed the upper IQR fence.

    The fence is ``Q3 + iqr_multiplier * (Q3 - Q1)``, where Q1 and Q3 are the
    0.25 and 0.75 quantiles of ``df['length']``. Only the upper fence is
    applied; small values are never removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a numeric 'length' column.
    iqr_multiplier : float, default 1.2
        How many interquartile ranges above Q3 the fence is placed.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` with ``length <= fence``.
    """
    q1, q3 = df['length'].quantile([0.25, 0.75])
    upper_bound = q3 + iqr_multiplier * (q3 - q1)
    return df[df['length'] <= upper_bound]

def with_outliers(df, iqr_multiplier=1.2):
    """Keep only the rows whose 'length' exceeds the upper IQR fence.

    The fence is ``Q3 + iqr_multiplier * (Q3 - Q1)``, where Q1 and Q3 are the
    0.25 and 0.75 quantiles of ``df['length']``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a numeric 'length' column.
    iqr_multiplier : float, default 1.2
        How many interquartile ranges above Q3 the fence is placed.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` with ``length > fence``.
    """
    q1, q3 = df['length'].quantile([0.25, 0.75])
    upper_bound = q3 + iqr_multiplier * (q3 - q1)
    return df[df['length'] > upper_bound]

# Step 1: expand 'features' into columns and compute 'length' on the original frame.
df_filtered = process_dataframe(df)

# No per-'target' outlier filtering is applied in this cell: every row of
# df_filtered goes on to be flattened.

# Step 2: keep only the columns needed to unfold the orders.
df_orders = df_filtered.loc[:, ['user_id', 'target', 'orders', 'length']].copy()

# Step 3: column names to unfold, outermost level first. 'orders' appears
# twice because a column of that name is unfolded at two successive levels
# before 'items'.
levels_labels = ['orders', 'orders', 'items']

# Step 4: flatten the orders.
df_orders_flat1 = explode_column(df_orders, level_labels=levels_labels)

In [None]:


def process_dataframe(df):
    """Expand the dicts stored in the 'features' column into separate columns.

    A 'length' column is added as well: for every row it is the sum, over the
    values of that row's 'features' dict, of ``len(value)`` when the value is
    a list and ``1`` otherwise.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'features' column whose cells are dicts.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same index as ``df``, without 'features', with
        'length' and with one column per key produced by
        ``pd.json_normalize``. The input frame is left unmodified.
    """
    features = df['features'].tolist()

    # Number of elements held by each row's feature dict.
    lengths = [
        sum(len(v) if isinstance(v, list) else 1 for v in feature_dict.values())
        for feature_dict in features
    ]

    # Work on a positionally indexed copy: json_normalize returns rows
    # numbered 0..n-1, so joining on the caller's own index labels would
    # misalign (or blank out) rows whenever that index is not the default one.
    result = df.drop(columns='features').reset_index(drop=True)
    result['length'] = lengths
    result = result.join(pd.json_normalize(features))

    # Restore the caller's index labels.
    result.index = df.index
    return result

def remove_outliers(df, iqr_multiplier=1.2):
    """Keep the rows whose 'length' does not exceed the upper IQR fence.

    The fence is ``Q3 + iqr_multiplier * (Q3 - Q1)``, where Q1 and Q3 are the
    0.25 and 0.75 quantiles of ``df['length']``. Only the upper fence is
    applied; small values are never removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a numeric 'length' column.
    iqr_multiplier : float, default 1.2
        How many interquartile ranges above Q3 the fence is placed.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` with ``length <= fence``.
    """
    q1, q3 = df['length'].quantile([0.25, 0.75])
    upper_bound = q3 + iqr_multiplier * (q3 - q1)
    return df[df['length'] <= upper_bound]


def explode_column(df, level_labels):
    """Unfold nested list-of-dict columns, one level per entry of ``level_labels``.

    For each label in order: the column of that name is exploded so every
    list element gets its own row (rows are renumbered from zero), the
    resulting cells are expanded into columns with ``pd.json_normalize``, and
    the exploded source column is dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the column named by the first label.
    level_labels : iterable of str
        Column names to unfold, outermost level first. A name may repeat when
        an unfolded level itself yields a column with that name.

    Returns
    -------
    pandas.DataFrame
        The flattened frame.
    """
    flat = df
    for label in level_labels:
        nested = label + '_old'
        # One row per list element; park the exploded cells under a temporary
        # name so the expanded columns may reuse the original label.
        flat = flat.explode(label, ignore_index=True).rename(columns={label: nested})
        expanded = pd.json_normalize(flat[nested])
        flat = flat.join(expanded).drop(columns=nested)
    return flat

# Apply the first processing step to the original DataFrame:
# expand 'features' into columns and add the 'length' column.
df_filtered = process_dataframe(df)

# Per-'target' outlier removal is disabled in this cell (kept for reference).
# Apply the function to each group in 'target'
#df_filtered = df_filtered.groupby('target', group_keys=False).apply(remove_outliers)
# Reset the index to get a DataFrame analogous to the original one
#df_filtered = df_filtered.reset_index(drop=True)

df_filtered


Unnamed: 0,user_id,target,length,orders,site-meta,visits,last-visits-in-categories,exchange-sessions
0,user_1,female,3,"[{'site-id': 1, 'orders': [{'created-at': 1634...",[{'site-id': 2}],,,
1,user_2,female,11,,"[{'site-id': 4}, {'site-id': 5}, {'site-id': 6...","[{'site-id': 3, 'first-seen': 1696277805, 'las...","[{'category': 'other', 'last-visit-at': 169627...",
2,user_3,male,16,"[{'site-id': 21, 'orders': [{'created-at': 168...","[{'site-id': 24}, {'site-id': 25}, {'site-id':...","[{'site-id': 3, 'first-seen': 1696320204, 'las...","[{'category': 'electronics', 'last-visit-at': ...",
3,user_4,male,27,"[{'site-id': 2, 'orders': [{'created-at': 1600...","[{'site-id': 33}, {'site-id': 2}, {'site-id': ...","[{'site-id': 3, 'first-seen': 1693468067, 'las...","[{'category': 'hypermarket', 'last-visit-at': ...",
4,user_5,female,35,"[{'site-id': 39, 'orders': [{'created-at': 164...","[{'site-id': 42}, {'site-id': 43}, {'site-id':...","[{'site-id': 3, 'first-seen': 1693796766, 'las...","[{'category': 'insurance', 'last-visit-at': 16...",
...,...,...,...,...,...,...,...,...
127750,user_127751,male,11,"[{'site-id': 307, 'orders': [{'created-at': 16...","[{'site-id': 16, 'recency': 1, 'frequency': 1,...","[{'site-id': 3, 'first-seen': 1697627366, 'las...","[{'category': 'other', 'last-visit-at': 169762...",
127751,user_127752,male,13,"[{'site-id': 147, 'orders': [{'created-at': 16...","[{'site-id': 351}, {'site-id': 147, 'recency':...","[{'site-id': 3, 'first-seen': 1697686281, 'las...","[{'category': 'other', 'last-visit-at': 169768...",
127752,user_127753,female,24,"[{'site-id': 20, 'orders': [{'created-at': 169...","[{'site-id': 20, 'recency': 5, 'frequency': 1,...","[{'site-id': 3, 'first-seen': 1682330930, 'las...","[{'category': 'sport', 'last-visit-at': 168425...","[{'landed-at': 1696523158, 'sites': [13, 217, ..."
127753,user_127754,female,8,"[{'site-id': 407, 'orders': [{'created-at': 16...","[{'site-id': 407}, {'site-id': 76}]","[{'site-id': 3, 'first-seen': 1698697647, 'las...","[{'category': 'other', 'last-visit-at': 169869...","[{'landed-at': 1698697649, 'sites': [169, 214,..."


In [None]:
# Write df_filtered (the expanded, still nested per-user table) to a Parquet file.
df_filtered.to_parquet('/content/drive/MyDrive/Flockery/train.parquet')

In [None]:
# Keep only the columns needed to unfold the orders.
df_orders = df_filtered.loc[:, ['user_id', 'target', 'orders', 'length']].copy()

# Column names to unfold, outermost level first. 'orders' appears twice
# because a column of that name is unfolded at two successive levels before
# 'items'.
levels_labels = ['orders', 'orders', 'items']

# Flatten the orders of df_orders.
df_orders_flat = explode_column(df_orders, level_labels=levels_labels)


df_orders_flat

Unnamed: 0,user_id,target,length,site-id,created-at,id,count,general-category-path,brand-id
0,user_1,female,3,1.0,1.634292e+09,,,,
1,user_1,female,3,2.0,1.639827e+09,,,,
2,user_1,female,3,2.0,1.639828e+09,,,,
3,user_2,female,11,,,,,,
4,user_3,male,16,21.0,1.684248e+09,,,,
...,...,...,...,...,...,...,...,...,...
3470639,user_127755,female,69,58.0,1.690265e+09,item_162398,1.0,"[15685787, 12699910, 90813]",26642.0
3470640,user_127755,female,69,58.0,1.690265e+09,item_1896437,1.0,"[4922657, 16087732, 90813]",14019.0
3470641,user_127755,female,69,58.0,1.690265e+09,item_3739266,1.0,"[12718223, 90813]",32654.0
3470642,user_127755,female,69,58.0,1.690265e+09,item_57560,1.0,"[12718223, 90813]",5783.0


In [None]:
# Write df_orders_flat to a Parquet file.
# NOTE(review): the file name ends in '_out' although the frame written is
# df_orders_flat; another cell writes df_orders_flat_out1 to
# 18_12/df_orders_flat_out.parquet - confirm that this name is intended.
df_orders_flat.to_parquet('/content/drive/MyDrive/Flockery/df_orders_flat_out.parquet')