In [1]:
import pandas as pd
import time
from datetime import datetime

In [2]:
# Load the tab-separated input file into a DataFrame.
data = pd.read_csv('data.tsv', delimiter='\t')

In [3]:
# Preview the first five rows of the loaded frame.
data.head(n=5)

Unnamed: 0,id,date,user_id,duration,medium,source,cost,order_id,amount_paid
0,40443,05.10.2016 23:18,1010,0.000926,seo,google,0.0,6243,20.2
1,35044,09.10.2016 21:40,1036,0.006493,sem,yandex,0.07,6145,15.6
2,40177,05.10.2016 3:23,1041,0.00338,email,promo,0.0,6128,13.2
3,39401,05.10.2016 23:19,1041,0.000463,sem,yandex,0.03,6697,9.8
4,41545,01.10.2016 4:57,1042,0.006493,sem,google,0.06,4510,14.8


In [4]:
def convert_to_datetime(row):
    """Parse the row's 'date' string into a ``datetime``.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) with a 'date' entry
            formatted as day.month.year hour:minute.

    Returns:
        The ``datetime`` parsed from ``row['date']``.

    Raises:
        ValueError: If the string does not match the expected format.
    """
    raw_date = row['date']
    parsed = datetime.strptime(raw_date, '%d.%m.%Y %H:%M')
    return parsed

In [5]:
# Parse the whole 'date' column (day.month.year hour:minute strings) in one
# vectorised call instead of invoking a Python function once per row.
# Malformed strings still raise; missing values become NaT.
data['datetime'] = pd.to_datetime(data['date'], format='%d.%m.%Y %H:%M')

In [6]:
# Preview the frame now that the parsed 'datetime' column is present.
data.head(n=5)

Unnamed: 0,id,date,user_id,duration,medium,source,cost,order_id,amount_paid,datetime
0,40443,05.10.2016 23:18,1010,0.000926,seo,google,0.0,6243,20.2,2016-10-05 23:18:00
1,35044,09.10.2016 21:40,1036,0.006493,sem,yandex,0.07,6145,15.6,2016-10-09 21:40:00
2,40177,05.10.2016 3:23,1041,0.00338,email,promo,0.0,6128,13.2,2016-10-05 03:23:00
3,39401,05.10.2016 23:19,1041,0.000463,sem,yandex,0.03,6697,9.8,2016-10-05 23:19:00
4,41545,01.10.2016 4:57,1042,0.006493,sem,google,0.06,4510,14.8,2016-10-01 04:57:00


In [7]:
def make_unix_time(row):
    """Convert the row's 'datetime' value to seconds since the epoch.

    The conversion uses ``time.mktime``, which interprets the time tuple as
    local time, so the absolute value depends on the machine's timezone.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) whose 'datetime'
            entry provides a ``timetuple()`` method.

    Returns:
        Float number of seconds since the epoch.
    """
    local_struct = row['datetime'].timetuple()
    return time.mktime(local_struct)

In [8]:
# Add an epoch-seconds column by applying make_unix_time to every row,
# then preview the result.
data['unixtime'] = data.apply(make_unix_time, axis='columns')
data.head(n=5)

Unnamed: 0,id,date,user_id,duration,medium,source,cost,order_id,amount_paid,datetime,unixtime
0,40443,05.10.2016 23:18,1010,0.000926,seo,google,0.0,6243,20.2,2016-10-05 23:18:00,1475699000.0
1,35044,09.10.2016 21:40,1036,0.006493,sem,yandex,0.07,6145,15.6,2016-10-09 21:40:00,1476038000.0
2,40177,05.10.2016 3:23,1041,0.00338,email,promo,0.0,6128,13.2,2016-10-05 03:23:00,1475627000.0
3,39401,05.10.2016 23:19,1041,0.000463,sem,yandex,0.03,6697,9.8,2016-10-05 23:19:00,1475699000.0
4,41545,01.10.2016 4:57,1042,0.006493,sem,google,0.06,4510,14.8,2016-10-01 04:57:00,1475287000.0


In [9]:
# Earliest and latest 'unixtime' per user; the resulting columns are the
# two-level pairs ('unixtime', 'min') and ('unixtime', 'max').
data_df = data.groupby('user_id')[['unixtime']].agg(['min', 'max'])

In [10]:
# Seconds elapsed between each user's earliest and latest timestamp.
data_df['diff'] = data_df[('unixtime', 'max')].sub(data_df[('unixtime', 'min')])

In [11]:
# Preview the per-user min / max / diff table.
data_df.head(n=5)

Unnamed: 0_level_0,unixtime,unixtime,diff
Unnamed: 0_level_1,min,max,Unnamed: 3_level_1
user_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
1010,1475699000.0,1475699000.0,0.0
1036,1476038000.0,1476038000.0,0.0
1041,1475627000.0,1475699000.0,71760.0
1042,1475287000.0,1475934000.0,647340.0
1047,1475318000.0,1475693000.0,374520.0


In [12]:
# Keep only users whose earliest and latest timestamps differ.
data_df_filtered = data_df.loc[data_df['diff'].ne(0)]

In [13]:
# Preview up to twenty of the remaining users.
data_df_filtered.head(n=20)

Unnamed: 0_level_0,unixtime,unixtime,diff
Unnamed: 0_level_1,min,max,Unnamed: 3_level_1
user_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
1041,1475627000.0,1475699000.0,71760.0
1042,1475287000.0,1475934000.0,647340.0
1047,1475318000.0,1475693000.0,374520.0
1052,1475824000.0,1475945000.0,121020.0
1057,1475569000.0,1475970000.0,401700.0
1100,1475321000.0,1475929000.0,608340.0
1108,1475285000.0,1475991000.0,705780.0
1112,1475634000.0,1475961000.0,326280.0
1113,1475738000.0,1476004000.0,266400.0
1114,1475680000.0,1475766000.0,86280.0


In [14]:
# data_df_filtered is already indexed by user_id with one row per user (it
# comes from a groupby on 'user_id'), so grouping by user_id again and taking
# the mean would average single values and return the same numbers. Take an
# independent copy instead, so columns added to `result` later do not touch
# data_df_filtered.
result = data_df_filtered.copy()

In [15]:
# Whole days in 'diff': floor-divide the seconds by 24 * 60 * 60 (= 86400).
result['days'] = result['diff'].floordiv(24 * 60 * 60)

In [16]:
# Preview the table with the added 'days' column.
result.head(n=5)

Unnamed: 0_level_0,unixtime,unixtime,diff,days
Unnamed: 0_level_1,min,max,Unnamed: 3_level_1,Unnamed: 4_level_1
user_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1041,1475627000.0,1475699000.0,71760.0,0.0
1042,1475287000.0,1475934000.0,647340.0,7.0
1047,1475318000.0,1475693000.0,374520.0,4.0
1052,1475824000.0,1475945000.0,121020.0,1.0
1057,1475569000.0,1475970000.0,401700.0,4.0


In [17]:
# Show only user_id and days: drop the helper columns and turn the user_id
# index back into a regular column.
result.drop(columns=['unixtime', 'diff']).reset_index()

Unnamed: 0,user_id,days
,,
0.0,1041.0,0.0
1.0,1042.0,7.0
2.0,1047.0,4.0
3.0,1052.0,1.0
4.0,1057.0,4.0
5.0,1100.0,7.0
6.0,1108.0,8.0
7.0,1112.0,3.0
8.0,1113.0,3.0
