# Кварацхелия - сессия 1

Цель работы - подготовить датасет с посещениями сайтов для дальнейшего анализа и обучения модели.

План работы:

1. Загрузка исходных таблиц и сайтов
2. Очистка данных от пустых значений
3. Получение разреженной матрицы с мешком сайтов

In [14]:
# Загрузка словаря с сайтами

import pickle

with open("sites.pkl", "rb") as f:
        sites = pickle.load(f)
    
sites

{1: {'url': 'google.com'},
 2: {'url': 'worldoftanks.ru'},
 3: {'url': 'habr.ru'},
 4: {'url': 'news.ru'}}

In [15]:
# загрузка датасета для обучения

import pandas as pd

train_ses = pd.read_csv("train_ses.csv")
train_ses.head()

Unnamed: 0,site_1,time_1,site_2,time_2,site_3,time_3,target
0,1,01.01.2020 01:02:04,2,01.01.2020 01:03:04,3,01.01.2020 02:04:04,0
1,1,01.01.2020 01:02:04,4,01.01.2020 01:03:04,3,01.01.2020 02:04:04,0
2,2,01.01.2020 01:02:04,3,01.01.2020 01:03:04,2,01.01.2020 02:04:04,1
3,2,01.01.2020 01:02:04,3,01.01.2020 01:03:04,1,01.01.2020 02:04:04,1
4,1,01.01.2020 01:02:04,2,01.01.2020 01:03:04,3,01.01.2020 02:04:04,0


In [16]:
# загрузка датасета для тестирования

test_ses = pd.read_csv("test_ses.csv")
test_ses.head()

Unnamed: 0,site_1,time_1,site_2,time_2,site_3,time_3,target
0,1,01.01.2020 01:02:04,2,01.01.2020 01:03:04,3,01.01.2020 02:04:04,
1,1,01.01.2020 01:02:04,4,01.01.2020 01:03:04,3,01.01.2020 02:04:04,
2,2,01.01.2020 01:02:04,3,01.01.2020 01:03:04,2,01.01.2020 02:04:04,


In [17]:
# подготовка пустой разреженной матрицы мешка сайтов (все нули)

data = []
for i in range(len(train_ses)):
    data.append([0]*(len(sites) + 2))

data

[[0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0]]

In [18]:
# названия столбцов в таблице с мешком сайтов. в название столбца вставляется идентификатор сайта

columns = ["date"]
for i in range(len(sites)):
    columns.append(str(i+1))
columns.append('target')

columns

['date', '1', '2', '3', '4', 'target']

In [19]:
# подсчет количества посещенных сайтов для 

train_ses_prep = pd.DataFrame(data, columns=columns)
sites_columns = ["site_1", "site_2", "site_3"]

for i in range(len(train_ses)):
    for c in sites_columns:
        prep_col = str(train_ses[c][i])
        train_ses_prep[prep_col][i] += 1
    train_ses_prep["target"][i] = train_ses["target"][i]
    train_ses_prep["date"][i] = str(train_ses["time_1"][i])[0:10]
        
train_ses_prep

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_ses_prep[prep_col][i] += 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_ses_prep["target"][i] = train_ses["target"][i]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_ses_prep["date"][i] = str(train_ses["time_1"][i])[0:10]


Unnamed: 0,date,1,2,3,4,target
0,01.01.2020,1,1,1,0,0
1,01.01.2020,1,0,1,1,0
2,01.01.2020,0,2,1,0,1
3,01.01.2020,1,1,1,0,1
4,01.01.2020,1,1,1,0,0
5,01.01.2020,1,0,1,1,0
6,01.01.2020,0,2,1,0,1
7,01.01.2020,1,1,1,0,1
8,01.01.2020,1,1,1,0,0
9,01.01.2020,1,0,1,1,0


In [26]:
# сохранение подготовленной матрицы мешка сайтов в csv

train_ses_prep.to_csv("train_ses_prep.csv", index=False)

In [29]:
# аналогичные действия для датасета test_ses. здесь нет столбца target

data = []
for i in range(len(test_ses)):
    data.append([0]*(len(sites)+1))
    
columns = ["date"]
for i in range(len(sites)):
    columns.append(str(i+1))
    
test_ses_prep = pd.DataFrame(data, columns=columns)
sites_columns = ["site_1", "site_2", "site_3"]

for i in range(len(test_ses)):
    for c in sites_columns:
        prep_col = str(test_ses[c][i])
        test_ses_prep[prep_col][i] += 1
        test_ses_prep["date"][i] = str(test_ses["time_1"][i])[0:10]
        
test_ses_prep.to_csv("test_ses_prep.csv", index=False)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_ses_prep[prep_col][i] += 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_ses_prep["date"][i] = str(test_ses["time_1"][i])[0:10]
