# Ex01 Basic operations

In [28]:
import pandas as pd

## create dataframe

In [29]:
# Load the tab-separated view log. Column names are supplied explicitly and
# the timestamp column is parsed to datetime while reading.
data = pd.read_csv(
    "data/feed-views.log",
    sep="\t",
    names=["datetime", "user"],
    parse_dates=["datetime"],
)

In [30]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1076 entries, 0 to 1075
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   datetime  1076 non-null   datetime64[ns]
 1   user      1076 non-null   object        
dtypes: datetime64[ns](1), object(1)
memory usage: 16.9+ KB


In [31]:
data.count()

datetime    1076
user        1076
dtype: int64

In [32]:
data.dtypes

datetime    datetime64[ns]
user                object
dtype: object

In [33]:
# Break the parsed timestamp into one column per calendar/clock component;
# each new column is named after the `.dt` attribute it is read from.
for component in ('year', 'month', 'day', 'hour', 'minute', 'second'):
    data[component] = getattr(data['datetime'].dt, component)

In [34]:
data.head()

Unnamed: 0,datetime,user,year,month,day,hour,minute,second
0,2020-04-17 12:01:08.463179,artem,2020,4,17,12,1,8
1,2020-04-17 12:01:23.743946,artem,2020,4,17,12,1,23
2,2020-04-17 12:27:30.646665,artem,2020,4,17,12,27,30
3,2020-04-17 12:35:44.884757,artem,2020,4,17,12,35,44
4,2020-04-17 12:35:52.735016,artem,2020,4,17,12,35,52


In [54]:
# NOTE(review): this cell carries execution count 54, out of sequence with its
# neighbours, and its saved output lists `day_time`, a column that is only
# created by the pd.cut cell further down. On a fresh top-to-bottom run this
# call would report the eight columns that exist at this point.
data.count()

datetime    1076
user        1076
year        1076
month       1076
day         1076
hour        1076
minute      1076
second      1076
day_time    1076
dtype: int64

## New column: `day_time` (time-of-day category)

In [35]:
# Hour boundaries for the time-of-day buckets. With right=False every bucket
# is half-open, [start, end): the left edge belongs to the bucket and the
# right edge to the next one.
cut_bins = [0, 4, 7, 11, 17, 20, 24]
labels_bins = [
    'night',          # [0, 4)
    'early morning',  # [4, 7)
    'morning',        # [7, 11)
    'day',            # [11, 17)
    'early evening',  # [17, 20)
    'evening',        # [20, 24)
]
data['day_time'] = pd.cut(
    data['hour'],
    bins=cut_bins,
    labels=labels_bins,
    right=False,
)

In [36]:
# Same binning as above but without labels, to display the raw half-open
# interval each hour falls into. The result is only shown, not stored.
pd.cut(data.hour, bins = cut_bins, right = False)

0       [11, 17)
1       [11, 17)
2       [11, 17)
3       [11, 17)
4       [11, 17)
          ...   
1071    [17, 20)
1072    [20, 24)
1073    [20, 24)
1074    [20, 24)
1075     [7, 11)
Name: hour, Length: 1076, dtype: category
Categories (6, interval[int64, left]): [[0, 4) < [4, 7) < [7, 11) < [11, 17) < [17, 20) < [20, 24)]

In [37]:
# Displays a copy of the frame indexed by user. The result is not assigned,
# so `data` itself keeps `user` as a regular column (later cells still read
# `data.user`).
data.set_index('user')

Unnamed: 0_level_0,datetime,year,month,day,hour,minute,second,day_time
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
artem,2020-04-17 12:01:08.463179,2020,4,17,12,1,8,day
artem,2020-04-17 12:01:23.743946,2020,4,17,12,1,23,day
artem,2020-04-17 12:27:30.646665,2020,4,17,12,27,30,day
artem,2020-04-17 12:35:44.884757,2020,4,17,12,35,44,day
artem,2020-04-17 12:35:52.735016,2020,4,17,12,35,52,day
...,...,...,...,...,...,...,...,...
valentina,2020-05-21 18:45:20.441142,2020,5,21,18,45,20,early evening
maxim,2020-05-21 23:03:06.457819,2020,5,21,23,3,6,evening
pavel,2020-05-21 23:23:49.995349,2020,5,21,23,23,49,evening
artem,2020-05-21 23:49:22.386789,2020,5,21,23,49,22,evening


In [38]:
data.day_time.value_counts()

day_time
evening          509
day              252
early evening    145
night            129
morning           36
early morning      5
Name: count, dtype: int64

## count samples

In [39]:
data.count()

datetime    1076
user        1076
year        1076
month       1076
day         1076
hour        1076
minute      1076
second      1076
day_time    1076
dtype: int64

In [40]:
data.day_time.value_counts()

day_time
evening          509
day              252
early evening    145
night            129
morning           36
early morning      5
Name: count, dtype: int64

## Sort by time of day (hour, minute, second)

In [41]:
# Order rows by clock time only (the date is ignored); ascending by default.
# Returns a sorted copy for display; `data` itself is left in file order.
data.sort_values(by=['hour', 'minute', 'second'])

Unnamed: 0,datetime,user,year,month,day,hour,minute,second,day_time
944,2020-05-15 00:00:13.222265,valentina,2020,5,15,0,0,13,night
945,2020-05-15 00:01:05.153738,valentina,2020,5,15,0,1,5,night
563,2020-05-12 00:01:27.764025,pavel,2020,5,12,0,1,27,night
564,2020-05-12 00:01:38.444917,pavel,2020,5,12,0,1,38,night
565,2020-05-12 00:01:55.395042,pavel,2020,5,12,0,1,55,night
...,...,...,...,...,...,...,...,...,...
1074,2020-05-21 23:49:22.386789,artem,2020,5,21,23,49,22,evening
246,2020-05-09 23:53:55.599821,anatoliy,2020,5,9,23,53,55,evening
247,2020-05-09 23:54:54.260791,pavel,2020,5,9,23,54,54,evening
942,2020-05-14 23:58:56.754866,valentina,2020,5,14,23,58,56,evening


## Latest night hour and earliest morning hour

In [42]:
# Latest hour observed in the "night" bucket and earliest hour observed in the
# "morning" bucket, selected with boolean masks on the categorical column.
max_night = data.loc[data['day_time'] == 'night', 'hour'].max()
min_morning = data.loc[data['day_time'] == 'morning', 'hour'].min()
print(f"max hour night: {max_night} \nmin hour morning: {min_morning}")

max hour night: 3 
min hour morning: 8


### who visited at max_night and min_morning

In [43]:
# Users of every view logged during the latest night hour.
data.loc[data['hour'] == max_night, 'user']

46    konstantin
47    konstantin
48    konstantin
Name: user, dtype: object

In [44]:
# Users of every view logged during the earliest morning hour.
data.loc[data['hour'] == min_morning, 'user']

963    alexander
964    alexander
Name: user, dtype: object

### day and hour mode

In [45]:
data.hour.mode()

0    22
Name: hour, dtype: int32

In [46]:
data.day.mode()

0    11
Name: day, dtype: int32

In [47]:
data.day_time.mode()

0    evening
Name: day_time, dtype: category
Categories (6, object): ['night' < 'early morning' < 'morning' < 'day' < 'early evening' < 'evening']

### nsmallest and nlargest hours for morning

In [48]:
# Distinct hours that occur in the "morning" bucket, three lowest first.
smallest_morning = (
    data.loc[data['day_time'] == 'morning', 'hour']
    .drop_duplicates()
    .nsmallest(3)
)
smallest_morning

963     8
88      9
12     10
Name: hour, dtype: int32

In [49]:
# Distinct hours that occur in the "morning" bucket, three highest first.
largest_morning = (
    data.loc[data['day_time'] == 'morning', 'hour']
    .drop_duplicates()
    .nlargest(3)
)
largest_morning

12     10
88      9
963     8
Name: hour, dtype: int32

In [50]:
# Unique users with at least one view in any of the three earliest morning hours.
data.loc[data['hour'].isin(smallest_morning), 'user'].unique()

array(['konstantin', 'maxim', 'aleksey', 'artem', 'anatoliy', 'alexander',
       'ekaterina'], dtype=object)

In [51]:
# Unique users with at least one view in any of the three latest morning hours.
data.loc[data['hour'].isin(largest_morning), 'user'].unique()

array(['konstantin', 'maxim', 'aleksey', 'artem', 'anatoliy', 'alexander',
       'ekaterina'], dtype=object)

## describe

In [52]:
data.describe()

Unnamed: 0,datetime,year,month,day,hour,minute,second
count,1076,1076.0,1076.0,1076.0,1076.0,1076.0,1076.0
mean,2020-05-10 09:00:41.211420672,2020.0,4.870818,13.552974,16.249071,29.629182,29.500929
min,2020-04-17 12:01:08.463179,2020.0,4.0,1.0,0.0,0.0,0.0
25%,2020-05-10 01:13:49.857472,2020.0,5.0,11.0,13.0,14.0,14.0
50%,2020-05-11 22:48:35.302552832,2020.0,5.0,13.0,19.0,29.0,30.0
75%,2020-05-14 14:44:34.749530624,2020.0,5.0,15.0,22.0,46.0,45.0
max,2020-05-22 10:36:14.662600,2020.0,5.0,30.0,23.0,59.0,59.0
std,,0.0,0.335557,4.906567,6.95549,17.689388,17.405506


In [53]:
# First and third quartiles of `hour`, each truncated to int.
# NOTE(review): despite the name, `iqr` holds the [Q1, Q3] pair rather than
# the width Q3 - Q1 — confirm which of the two is intended.
iqr = [int(q) for q in data['hour'].quantile([0.25, 0.75])]
iqr


[13, 22]