### Import the required library

In [2]:
import pandas as pd

### Load the CSV file into a pandas DataFrame and display the first rows

In [50]:
# Read the raw call log from disk and preview the first five rows.
csv_path = "call_logging.csv"
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,time,is_solved,userId,channelName
0,1556374000.0,0,26643,channel_0
1,1546934000.0,0,636656,channel_1
2,1544429000.0,0,485014,channel_2
3,1547461000.0,0,772798,channel_1
4,1546843000.0,0,877701,channel_3


### Clean the dataset

In [53]:
# `time` is loaded as float epoch seconds (e.g. 1556374000.0), but the
# duration cells further down compare it against a date string and expect
# Timestamp results. Convert it here so the notebook works on a fresh kernel.
# The dtype guard keeps this cell safe to re-run.
if not pd.api.types.is_datetime64_any_dtype(df["time"]):
    df["time"] = pd.to_datetime(df["time"], unit="s")

# Checking null values (missing epochs become NaT and are still counted as null)
df.isnull().sum()

time           146601
is_solved           0
userId              0
channelName    219224
dtype: int64

#### How many channels are typically required for a single call?

Count how many times each customer used each channel.
Pass `as_index=False` so the result is a flat DataFrame rather than a MultiIndex, which is easier to work with.

In [4]:
# Group the channel column by customer, then tally how often each channel
# appears for that customer. as_index=False keeps userId / channelName as
# ordinary columns next to the count instead of building a MultiIndex.
channel_by_user = df.groupby(["userId"], as_index=False)["channelName"]
channels_used = channel_by_user.value_counts()
channels_used

Unnamed: 0,userId,channelName,count
0,1,channel_5,4
1,1,channel_3,2
2,1,channel_0,1
3,1,channel_4,1
4,1,channel_2,1
...,...,...,...
7050162,10001000,channel_19,23
7050163,10001000,channel_15,19
7050164,10001000,channel_18,13
7050165,10001000,channel_21,9


Count how many channels were used only once by each customer

In [5]:
# Total the per-channel counts for each customer (result is indexed by userId).
# Only the numeric `count` column is selected: summing the whole frame also
# tries to aggregate the string `channelName` column, which raises a warning
# on older pandas and concatenates the strings on pandas >= 2.0.
# NOTE(review): despite the variable name, this is the total number of logged
# channel uses per customer, not the number of channels used exactly once --
# confirm which quantity the analysis actually needs.
counts_of_channels_used_only_once = channels_used.groupby("userId")[["count"]].sum()
counts_of_channels_used_only_once

  counts_of_channels_used_only_once = channels_used.groupby(["userId"]).sum()


Unnamed: 0_level_0,count
userId,Unnamed: 1_level_1
1,14
2,6
3,15
4,12
5,15
...,...
10000996,3711
10000997,3712
10000998,3690
10000999,3665


Convert the grouped result to a DataFrame and reset the index so that `userId` becomes a regular column

In [6]:
# Move userId out of the index and into an ordinary column.
# reset_index() already hands back a new DataFrame, so no extra wrapper is needed.
user_df = counts_of_channels_used_only_once.reset_index()
user_df

Unnamed: 0,userId,count
0,1,14
1,2,6
2,3,15
3,4,12
4,5,15
...,...,...
1000966,10000996,3711
1000967,10000997,3712
1000968,10000998,3690
1000969,10000999,3665


Rename the "count" column

In [7]:
# Give the aggregated column a descriptive name (modifies user_df in place).
column_renames = {"count": "counts_of_channels_used_only_once"}
user_df.rename(columns=column_renames, inplace=True)

Use the arithmetic mean to estimate the typical number of channels required for a single call

In [8]:
# Average the per-customer totals, then round to the nearest whole number.
mean_count_per_user = user_df["counts_of_channels_used_only_once"].mean(numeric_only=True)
channels_typically_required_for_a_single_call = int(mean_count_per_user.round())
channels_typically_required_for_a_single_call

15

#### Can you provide a range for the duration of calls?

In [15]:
# Earliest timestamp present in the log.
earliest_time = df["time"].min()
earliest_time

Timestamp('2018-11-30 17:00:00')

In [17]:
# Latest timestamp present in the log.
latest_time = df["time"].max()
latest_time

Timestamp('2019-04-30 16:59:58')

In [18]:
# Span between the first and last logged timestamps.
# NOTE(review): this is the period covered by the whole log, not the duration
# of an individual call -- the columns shown (time, is_solved, userId,
# channelName) carry no call end time, so confirm how duration should be derived.
time_col = df["time"]
time_col.max() - time_col.min()

Timedelta('150 days 23:59:58')

In [23]:
# Keep the rows whose timestamp is at or before the start of 2019
# (the string "2019" is compared against the datetime column).
is_up_to_2019 = df["time"] <= "2019"
df.loc[is_up_to_2019]

Unnamed: 0,time,is_solved,userId,channelName
2,2018-12-10 08:08:49,0,485014,channel_2
7,2018-12-07 09:16:41,0,667681,channel_5
9,2018-12-19 12:27:11,0,679094,channel_6
11,2018-12-15 09:09:14,0,355940,channel_3
12,2018-12-15 12:21:06,0,234963,channel_1
...,...,...,...,...
14772636,2018-12-27 04:58:51,1,10000818,channel_8
14772646,2018-12-27 15:15:20,1,10000050,channel_9
14772649,2018-12-12 11:31:50,0,10000324,channel_7
14772654,2018-12-08 08:23:53,0,10000415,channel_7


In [46]:
# Print a concise summary of df: number of entries, index range, column dtypes
# and memory usage.
# NOTE(review): the recorded output shows an index running from 1285571 to
# 9999989, so df appears to have been reordered by a step that is not in the
# notebook -- confirm and restore that cell.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 14772712 entries, 1285571 to 9999989
Data columns (total 4 columns):
 #   Column       Dtype         
---  ------       -----         
 0   time         datetime64[ns]
 1   is_solved    int64         
 2   userId       int64         
 3   channelName  object        
dtypes: datetime64[ns](1), int64(2), object(1)
memory usage: 563.5+ MB
