In [1]:
import pandas as pd

## Create a dataframe views with two columns: datetime and user by reading feed-views.log
- convert the datetime to the datetime64[ns] Dtype
- extract the year, month, day, hour, minute, and second from the values of that column to the new columns

In [2]:
views = pd.read_csv("../data/feed_views.log", sep='\t', names=['datetime', 'user'], skiprows=[2, 3], skipfooter=2, engine='python')

views['datetime'] = pd.to_datetime(views['datetime'])

views['year'] = views['datetime'].dt.year
views['month'] = views['datetime'].dt.month
views['day'] = views['datetime'].dt.day
views['hour'] = views['datetime'].dt.hour
views['minute'] = views['datetime'].dt.minute
views['second'] = views['datetime'].dt.second

views.dtypes

datetime    datetime64[ns]
user                object
year                 int32
month                int32
day                  int32
hour                 int32
minute               int32
second               int32
dtype: object

## Create the new column daytime
- you need to assign the particular time of day value if an hour is within a particular interval, for example, afternoon if the hour is larger than 11 and less or equal to 17
- 0 – 3.59 night, 4 – 6.59 early morning, 7 – 10.59 morning, 11 – 16.59 afternoon, 17 – 19.59 early evening, 20 – 23.59 evening
- use the method cut to solve this subtask
- assign the column user as the index

In [3]:
bins = [0, 4, 7, 11, 17, 20, 24]
labels = ['night', 'early morning', 'morning', 'afternoon', 'early evening', 'evening']

views['daytime'] = pd.cut(views['hour'], bins=bins, labels=labels, right=False)
views.set_index('user', inplace=True)
views

Unnamed: 0_level_0,datetime,year,month,day,hour,minute,second,daytime
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
artem,2020-04-17 12:01:08.463179,2020,4,17,12,1,8,afternoon
artem,2020-04-17 12:01:23.743946,2020,4,17,12,1,23,afternoon
artem,2020-04-17 12:35:52.735016,2020,4,17,12,35,52,afternoon
oksana,2020-04-17 12:36:21.401412,2020,4,17,12,36,21,afternoon
oksana,2020-04-17 12:36:22.023355,2020,4,17,12,36,22,afternoon
...,...,...,...,...,...,...,...,...
ekaterina,2020-05-21 16:36:40.915488,2020,5,21,16,36,40,afternoon
maxim,2020-05-21 17:49:36.429237,2020,5,21,17,49,36,early evening
valentina,2020-05-21 18:45:20.441142,2020,5,21,18,45,20,early evening
maxim,2020-05-21 23:03:06.457819,2020,5,21,23,3,6,evening


## Calculate the number of elements in your dataframe
- use the method count()
- calculate the number of elements in each time of day category using the method value_counts()

In [4]:
views.count()

datetime    1072
year        1072
month       1072
day         1072
hour        1072
minute      1072
second      1072
daytime     1072
dtype: int64

In [5]:
views['daytime'].value_counts()

daytime
evening          508
afternoon        250
early evening    145
night            129
morning           35
early morning      5
Name: count, dtype: int64

## Sort values in your dataframe by hour, minute, and second in ascending order (simultaneously and not one by one)

In [6]:
views.sort_values(by=['hour', 'minute', 'second'], ascending=True)
views

Unnamed: 0_level_0,datetime,year,month,day,hour,minute,second,daytime
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
artem,2020-04-17 12:01:08.463179,2020,4,17,12,1,8,afternoon
artem,2020-04-17 12:01:23.743946,2020,4,17,12,1,23,afternoon
artem,2020-04-17 12:35:52.735016,2020,4,17,12,35,52,afternoon
oksana,2020-04-17 12:36:21.401412,2020,4,17,12,36,21,afternoon
oksana,2020-04-17 12:36:22.023355,2020,4,17,12,36,22,afternoon
...,...,...,...,...,...,...,...,...
ekaterina,2020-05-21 16:36:40.915488,2020,5,21,16,36,40,afternoon
maxim,2020-05-21 17:49:36.429237,2020,5,21,17,49,36,early evening
valentina,2020-05-21 18:45:20.441142,2020,5,21,18,45,20,early evening
maxim,2020-05-21 23:03:06.457819,2020,5,21,23,3,6,evening


## Calculate the minimum and maximum for the hours and the mode for the daytime categories
- calculate the maximum of hour for the rows where the time of day is night
- calculate the minimum of hour for the rows where the time of day is morning
- In addition to this, find out who visited the page at those hours (make one example from that)
- calculate the mode for the hour and daytime

In [7]:
max_hour_night = views[views['daytime'] == 'night']['hour'].max()
max_hour_night

np.int32(3)

In [8]:
min_hour_morning = views[views['daytime'] == 'morning']['hour'].min()
min_hour_morning

np.int32(8)

In [9]:
views[(views['daytime'] == 'night') & (views['hour'] == max_hour_night)]

Unnamed: 0_level_0,datetime,year,month,day,hour,minute,second,daytime
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
konstantin,2020-04-19 03:23:35.471598,2020,4,19,3,23,35,night
konstantin,2020-04-19 03:23:55.473926,2020,4,19,3,23,55,night
konstantin,2020-04-19 03:33:07.757714,2020,4,19,3,33,7,night


In [10]:
views[(views['daytime'] == 'morning') & (views['hour'] == min_hour_morning)]

Unnamed: 0_level_0,datetime,year,month,day,hour,minute,second,daytime
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
alexander,2020-05-15 08:16:03.918402,2020,5,15,8,16,3,morning
alexander,2020-05-15 08:35:01.471463,2020,5,15,8,35,1,morning


In [17]:
views['hour'].mode()[0]

np.int32(22)

In [15]:
views['daytime'].mode()[0]

'evening'

## Show the 3 earliest hours in the morning and the corresponding usernames and the 3 latest hours and the usernames using nsmallest() and nlargest()

In [22]:
views[views['daytime'] == 'morning'].nsmallest(3, 'hour')

Unnamed: 0_level_0,datetime,year,month,day,hour,minute,second,daytime
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
alexander,2020-05-15 08:16:03.918402,2020,5,15,8,16,3,morning
alexander,2020-05-15 08:35:01.471463,2020,5,15,8,35,1,morning
artem,2020-04-24 09:42:47.598208,2020,4,24,9,42,47,morning


In [24]:
views[views['daytime'] == 'morning'].nlargest(3, 'hour')

Unnamed: 0_level_0,datetime,year,month,day,hour,minute,second,daytime
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
konstantin,2020-04-18 10:53:52.623447,2020,4,18,10,53,52,morning
maxim,2020-04-18 10:56:55.833899,2020,4,18,10,56,55,morning
konstantin,2020-04-18 10:57:37.331258,2020,4,18,10,57,37,morning


## Use the method describe() to get the basic statistics for the columns
- to find out what the most popular interval for visiting the page is, calculate the interquartile range for the hour by extracting values from the result of the describe() method and store it in the variable iqr


In [25]:
stat = views.describe()
stat

Unnamed: 0,datetime,year,month,day,hour,minute,second
count,1072,1072.0,1072.0,1072.0,1072.0,1072.0,1072.0
mean,2020-05-10 09:30:16.241518592,2020.0,4.872201,13.531716,16.25653,29.602612,29.508396
min,2020-04-17 12:01:08.463179,2020.0,4.0,1.0,0.0,0.0,0.0
25%,2020-05-10 01:13:53.448644096,2020.0,5.0,11.0,13.0,14.0,14.0
50%,2020-05-11 22:48:35.302552832,2020.0,5.0,13.0,19.0,29.0,30.0
75%,2020-05-14 14:30:41.062071296,2020.0,5.0,14.0,22.0,46.0,45.0
max,2020-05-21 23:23:49.995349,2020.0,5.0,30.0,23.0,59.0,59.0
std,,0.0,0.334021,4.901354,6.960369,17.710471,17.424398


In [29]:
iqr = stat.loc['75%', 'hour'] - stat.loc['25%', 'hour']
iqr

np.float64(9.0)