# **Data Collection and Preparation - Charging Sessions**

In [1]:
import pandas as pd
import numpy as np
import json

In [2]:
try:
    df = pd.read_csv('resources/csv_files/charging_sessions.csv')
    print("Successfully imported charging_sessions.csv")
except FileNotFoundError:
    print("Error: charging_sessions.csv not found.")

Successfully imported charging_sessions.csv


In [3]:
df

Unnamed: 0.1,Unnamed: 0,id,connectionTime,disconnectTime,doneChargingTime,kWhDelivered,sessionID,siteID,spaceID,stationID,timezone,userID,userInputs
0,0,5e23b149f9af8b5fe4b973cf,2020-01-02 13:08:54+00:00,2020-01-02 19:11:15+00:00,2020-01-02 17:31:35+00:00,25.016,1_1_179_810_2020-01-02 13:08:53.870034,1,AG-3F30,1-1-179-810,America/Los_Angeles,194.0,"[{'WhPerMile': 250, 'kWhRequested': 25.0, 'mil..."
1,1,5e23b149f9af8b5fe4b973d0,2020-01-02 13:36:50+00:00,2020-01-02 22:38:21+00:00,2020-01-02 20:18:05+00:00,33.097,1_1_193_825_2020-01-02 13:36:49.599853,1,AG-1F01,1-1-193-825,America/Los_Angeles,4275.0,"[{'WhPerMile': 280, 'kWhRequested': 70.0, 'mil..."
2,2,5e23b149f9af8b5fe4b973d1,2020-01-02 13:56:35+00:00,2020-01-03 00:39:22+00:00,2020-01-02 16:35:06+00:00,6.521,1_1_193_829_2020-01-02 13:56:35.214993,1,AG-1F03,1-1-193-829,America/Los_Angeles,344.0,"[{'WhPerMile': 400, 'kWhRequested': 8.0, 'mile..."
3,3,5e23b149f9af8b5fe4b973d2,2020-01-02 13:59:58+00:00,2020-01-02 16:38:39+00:00,2020-01-02 15:18:45+00:00,2.355,1_1_193_820_2020-01-02 13:59:58.309319,1,AG-1F04,1-1-193-820,America/Los_Angeles,1117.0,"[{'WhPerMile': 400, 'kWhRequested': 8.0, 'mile..."
4,4,5e23b149f9af8b5fe4b973d3,2020-01-02 14:00:01+00:00,2020-01-02 22:08:40+00:00,2020-01-02 18:17:30+00:00,13.375,1_1_193_819_2020-01-02 14:00:00.779967,1,AG-1F06,1-1-193-819,America/Los_Angeles,334.0,"[{'WhPerMile': 400, 'kWhRequested': 16.0, 'mil..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...
66445,10083,5d574ad2f9af8b4c10c03652,2019-07-31 18:08:04+00:00,2019-07-31 23:29:18+00:00,2019-07-31 23:30:18+00:00,28.787,1_1_179_809_2019-07-31 18:08:04.432654,1,AG-3F27,1-1-179-809,America/Los_Angeles,393.0,"[{'WhPerMile': 240, 'kWhRequested': 31.2, 'mil..."
66446,10084,5d574ad2f9af8b4c10c03653,2019-07-31 18:40:41+00:00,2019-08-01 00:59:42+00:00,2019-07-31 21:44:23+00:00,7.787,1_1_179_810_2019-07-31 18:40:40.900203,1,AG-3F30,1-1-179-810,America/Los_Angeles,220.0,"[{'WhPerMile': 333, 'kWhRequested': 6.66, 'mil..."
66447,10085,5d574ad2f9af8b4c10c03654,2019-07-31 19:04:40+00:00,2019-07-31 22:44:22+00:00,2019-07-31 22:45:21+00:00,11.274,1_1_191_795_2019-07-31 19:04:40.098273,1,AG-4F51,1-1-191-795,America/Los_Angeles,1974.0,"[{'WhPerMile': 333, 'kWhRequested': 19.98, 'mi..."
66448,10086,5d574ad2f9af8b4c10c03655,2019-07-31 19:19:47+00:00,2019-08-01 00:34:51+00:00,2019-07-31 21:25:30+00:00,11.589,1_1_191_778_2019-07-31 19:19:46.919358,1,AG-4F43,1-1-191-778,America/Los_Angeles,942.0,"[{'WhPerMile': 275, 'kWhRequested': 22.0, 'mil..."


### Format
We first observe that the .csv file's first column is an unnamed column with integers. Since there is no description in the .pdf file for the team assignment, we will drop these incoherent values and get rid of the very first column.

In [4]:
df = df.drop(df.columns[0], axis=1)

Furthermore, if we take a look at the ***userInputs*** column, we see that the provided data is in *json*-format.
For easier handling, we want to transform this column to an appropriate format for our analysis.
We will thus create a new column for each field in the json.

In [5]:
def parse_json(value):
    if isinstance(value, str):
        json_value = value.replace("'", '"')
        json_value = json_value.replace("True", "true").replace("False", "false")

        try:
            return json.loads(json_value)
        except json.JSONDecodeError as e:
            print(f"Error parsing JSON: {e}")
            return None
    return None


df['parsedJson'] = df['userInputs'].apply(parse_json)


all_keys = set()
for row in df['parsedJson'].dropna():
    if isinstance(row, list) and row:
        all_keys.update(row[0].keys())

for field in all_keys:
    df[field] = df['parsedJson'].apply(
        lambda x: x[0].get(field) if isinstance(x, list) and x else np.nan
    )

df = df.drop(columns=['userInputs', 'parsedJson'])
df

Unnamed: 0,id,connectionTime,disconnectTime,doneChargingTime,kWhDelivered,sessionID,siteID,spaceID,stationID,timezone,userID,milesRequested,requestedDeparture,WhPerMile,minutesAvailable,paymentRequired,kWhRequested,modifiedAt
0,5e23b149f9af8b5fe4b973cf,2020-01-02 13:08:54+00:00,2020-01-02 19:11:15+00:00,2020-01-02 17:31:35+00:00,25.016,1_1_179_810_2020-01-02 13:08:53.870034,1,AG-3F30,1-1-179-810,America/Los_Angeles,194.0,100.0,"Thu, 02 Jan 2020 20:51:54 GMT",250.0,463.0,True,25.00,"Thu, 02 Jan 2020 13:09:39 GMT"
1,5e23b149f9af8b5fe4b973d0,2020-01-02 13:36:50+00:00,2020-01-02 22:38:21+00:00,2020-01-02 20:18:05+00:00,33.097,1_1_193_825_2020-01-02 13:36:49.599853,1,AG-1F01,1-1-193-825,America/Los_Angeles,4275.0,250.0,"Thu, 02 Jan 2020 23:31:50 GMT",280.0,595.0,True,70.00,"Thu, 02 Jan 2020 13:37:11 GMT"
2,5e23b149f9af8b5fe4b973d1,2020-01-02 13:56:35+00:00,2020-01-03 00:39:22+00:00,2020-01-02 16:35:06+00:00,6.521,1_1_193_829_2020-01-02 13:56:35.214993,1,AG-1F03,1-1-193-829,America/Los_Angeles,344.0,20.0,"Thu, 02 Jan 2020 14:56:35 GMT",400.0,60.0,True,8.00,"Thu, 02 Jan 2020 13:57:17 GMT"
3,5e23b149f9af8b5fe4b973d2,2020-01-02 13:59:58+00:00,2020-01-02 16:38:39+00:00,2020-01-02 15:18:45+00:00,2.355,1_1_193_820_2020-01-02 13:59:58.309319,1,AG-1F04,1-1-193-820,America/Los_Angeles,1117.0,20.0,"Thu, 02 Jan 2020 15:04:58 GMT",400.0,65.0,True,8.00,"Thu, 02 Jan 2020 14:00:03 GMT"
4,5e23b149f9af8b5fe4b973d3,2020-01-02 14:00:01+00:00,2020-01-02 22:08:40+00:00,2020-01-02 18:17:30+00:00,13.375,1_1_193_819_2020-01-02 14:00:00.779967,1,AG-1F06,1-1-193-819,America/Los_Angeles,334.0,40.0,"Thu, 02 Jan 2020 22:24:01 GMT",400.0,504.0,True,16.00,"Thu, 02 Jan 2020 14:00:13 GMT"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
66445,5d574ad2f9af8b4c10c03652,2019-07-31 18:08:04+00:00,2019-07-31 23:29:18+00:00,2019-07-31 23:30:18+00:00,28.787,1_1_179_809_2019-07-31 18:08:04.432654,1,AG-3F27,1-1-179-809,America/Los_Angeles,393.0,130.0,"Thu, 01 Aug 2019 00:03:04 GMT",240.0,355.0,True,31.20,"Wed, 31 Jul 2019 18:08:23 GMT"
66446,5d574ad2f9af8b4c10c03653,2019-07-31 18:40:41+00:00,2019-08-01 00:59:42+00:00,2019-07-31 21:44:23+00:00,7.787,1_1_179_810_2019-07-31 18:40:40.900203,1,AG-3F30,1-1-179-810,America/Los_Angeles,220.0,20.0,"Thu, 01 Aug 2019 02:15:41 GMT",333.0,455.0,True,6.66,"Wed, 31 Jul 2019 18:41:02 GMT"
66447,5d574ad2f9af8b4c10c03654,2019-07-31 19:04:40+00:00,2019-07-31 22:44:22+00:00,2019-07-31 22:45:21+00:00,11.274,1_1_191_795_2019-07-31 19:04:40.098273,1,AG-4F51,1-1-191-795,America/Los_Angeles,1974.0,60.0,"Wed, 31 Jul 2019 22:08:40 GMT",333.0,184.0,True,19.98,"Wed, 31 Jul 2019 19:04:57 GMT"
66448,5d574ad2f9af8b4c10c03655,2019-07-31 19:19:47+00:00,2019-08-01 00:34:51+00:00,2019-07-31 21:25:30+00:00,11.589,1_1_191_778_2019-07-31 19:19:46.919358,1,AG-4F43,1-1-191-778,America/Los_Angeles,942.0,80.0,"Wed, 31 Jul 2019 20:19:47 GMT",275.0,60.0,True,22.00,"Wed, 31 Jul 2019 19:20:10 GMT"


Now we take a look at the datatypes in our dataframe:

In [6]:
df.dtypes

id                     object
connectionTime         object
disconnectTime         object
doneChargingTime       object
kWhDelivered          float64
sessionID              object
siteID                  int64
spaceID                object
stationID              object
timezone               object
userID                float64
milesRequested        float64
requestedDeparture     object
WhPerMile             float64
minutesAvailable      float64
paymentRequired        object
kWhRequested          float64
modifiedAt             object
dtype: object

For easier handling during our analysis, it might be benificial to convert the values of **connectionTime**, **disconnectTime**, **doneChargingtime**, **modifiedAt** and **requestedDeparture** from type object to datetime. Further, we know by the supplemented document of the team assignment that all datetimes are in UTC. For easier use, we will convert all datetimes from Timezone *UTC* to *America/Los Angeles*.

In [7]:
df['connectionTime'] = pd.to_datetime(df['connectionTime'], utc=True, errors='coerce').dt.tz_convert('America/Los_Angeles')
df['disconnectTime'] = pd.to_datetime(df['disconnectTime'], utc=True, errors='coerce').dt.tz_convert('America/Los_Angeles')
df['doneChargingTime'] = pd.to_datetime(df['doneChargingTime'], utc=True, errors='coerce').dt.tz_convert('America/Los_Angeles')
df['modifiedAt'] = pd.to_datetime(df['modifiedAt'], utc=True, errors='coerce').dt.tz_convert('America/Los_Angeles')
df['requestedDeparture'] = pd.to_datetime(df['requestedDeparture'], utc=True, errors='coerce').dt.tz_convert('America/Los_Angeles')
df.dtypes

id                                                 object
connectionTime        datetime64[ns, America/Los_Angeles]
disconnectTime        datetime64[ns, America/Los_Angeles]
doneChargingTime      datetime64[ns, America/Los_Angeles]
kWhDelivered                                      float64
sessionID                                          object
siteID                                              int64
spaceID                                            object
stationID                                          object
timezone                                           object
userID                                            float64
milesRequested                                    float64
requestedDeparture    datetime64[ns, America/Los_Angeles]
WhPerMile                                         float64
minutesAvailable                                  float64
paymentRequired                                    object
kWhRequested                                      float64
modifiedAt    

In [8]:
df.head(3)

Unnamed: 0,id,connectionTime,disconnectTime,doneChargingTime,kWhDelivered,sessionID,siteID,spaceID,stationID,timezone,userID,milesRequested,requestedDeparture,WhPerMile,minutesAvailable,paymentRequired,kWhRequested,modifiedAt
0,5e23b149f9af8b5fe4b973cf,2020-01-02 05:08:54-08:00,2020-01-02 11:11:15-08:00,2020-01-02 09:31:35-08:00,25.016,1_1_179_810_2020-01-02 13:08:53.870034,1,AG-3F30,1-1-179-810,America/Los_Angeles,194.0,100.0,2020-01-02 12:51:54-08:00,250.0,463.0,True,25.0,2020-01-02 05:09:39-08:00
1,5e23b149f9af8b5fe4b973d0,2020-01-02 05:36:50-08:00,2020-01-02 14:38:21-08:00,2020-01-02 12:18:05-08:00,33.097,1_1_193_825_2020-01-02 13:36:49.599853,1,AG-1F01,1-1-193-825,America/Los_Angeles,4275.0,250.0,2020-01-02 15:31:50-08:00,280.0,595.0,True,70.0,2020-01-02 05:37:11-08:00
2,5e23b149f9af8b5fe4b973d1,2020-01-02 05:56:35-08:00,2020-01-02 16:39:22-08:00,2020-01-02 08:35:06-08:00,6.521,1_1_193_829_2020-01-02 13:56:35.214993,1,AG-1F03,1-1-193-829,America/Los_Angeles,344.0,20.0,2020-01-02 06:56:35-08:00,400.0,60.0,True,8.0,2020-01-02 05:57:17-08:00


Let's also take a look at the amount of distinct values for each column.

In [9]:
df.nunique(dropna=True)

id                    65037
connectionTime        64839
disconnectTime        64906
doneChargingTime      60637
kWhDelivered          25629
sessionID             65037
siteID                    2
spaceID                 107
stationID               107
timezone                  1
userID                 1006
milesRequested          106
requestedDeparture    47773
WhPerMile               178
minutesAvailable        842
paymentRequired           1
kWhRequested           1201
modifiedAt            47704
dtype: int64

We observe that for the columns **timezone** and **paymentRequired**, there is only **one** unique value which is not null.
Thus, we can drop them as they don't add any value to our analysis since they can't be used to compare / differentiate rows.

In [10]:
df = df.drop(columns=['timezone','paymentRequired'])

Further, we see that there is the same amount of spaceIDs as stationIDs (both 107). Let's take a look at how many different combinations there are.

In [11]:
df.drop_duplicates(subset=['stationID', 'spaceID']).shape[0]

107

We observe that there are also only 107 possible combinations. This means that there is a 1:1 mapping between stationIDs and spaceIDs. Keeping both is unnecessary as the information would be redundant, thus we will drop the *spaceID*.

In [12]:
df = df.drop(columns='spaceID', axis=1)

We also see that there are 2 siteIDs (which we would also expect). Let's just check in case it could be Null.

In [13]:
len(df[df.siteID.isnull()])

0

Perfect! This means that we can encode them for easier handling. Instead of values [1, 2], we will encode those in a column *isSiteOne* with values [0, 1]. This is better suited for our later machine learning algorithms.

In [14]:
df.loc[:, 'isSiteOne'] = (df['siteID'] == 1).astype(int)
df = df.drop(columns=['siteID'])

### Duplicates
First we want to find out whether any duplicates exist.
If so, those duplicates shall be removed.

In [15]:
len(df[df.duplicated()])

1413

As we can see, there are **1413** duplicated rows. We will thus remove them for further analysis.

In [16]:
df = df.drop_duplicates()

### Missing Data
We now want to check the data for missing values

In [17]:
df.isnull().sum()

id                        0
connectionTime            0
disconnectTime            0
doneChargingTime       4087
kWhDelivered              0
sessionID                 0
stationID                 0
userID                17215
milesRequested        17215
requestedDeparture    17215
WhPerMile             17215
minutesAvailable      17215
kWhRequested          17215
modifiedAt            17215
isSiteOne                 0
dtype: int64

As we can see, there are **4087** rows with a missing **doneChargingTime** value as well as **17215** rows with missing **userID** and **userInputs** value. The latter is expected as charging sessions may also be carried out by unregistered users. Thus, we only have to take care of the **doneChargingTime** values. We will do this with the help of a new variable.

## New features

We will also add some new features which we consider useful for our further analysis

In [18]:
df['dayOfWeek'] = df['connectionTime'].dt.day_name()
df['month'] = df['connectionTime'].dt.month_name()
df['connectionDuration'] = (df['disconnectTime'] - df['connectionTime']).dt.total_seconds() / 3600
df['chargingDuration'] = (df['doneChargingTime'] - df['connectionTime']).dt.total_seconds() / 3600
#df['idleDuration'] = abs(df['chargingDuration'] - df['connectionDuration'])
#df['kWh_diff'] = np.where(
#    df['userID'].notnull(),
#    df['kWhRequested'] - df['kWhDelivered'],
#    np.nan
#)

cols = df.columns.tolist()
df = df[cols[:2] + ['connectionDuration', 'chargingDuration'] + cols[2:-2]]

df

Unnamed: 0,id,connectionTime,connectionDuration,chargingDuration,disconnectTime,doneChargingTime,kWhDelivered,sessionID,stationID,userID,milesRequested,requestedDeparture,WhPerMile,minutesAvailable,kWhRequested,modifiedAt,isSiteOne,dayOfWeek,month
0,5e23b149f9af8b5fe4b973cf,2020-01-02 05:08:54-08:00,6.039167,4.378056,2020-01-02 11:11:15-08:00,2020-01-02 09:31:35-08:00,25.016,1_1_179_810_2020-01-02 13:08:53.870034,1-1-179-810,194.0,100.0,2020-01-02 12:51:54-08:00,250.0,463.0,25.00,2020-01-02 05:09:39-08:00,1,Thursday,January
1,5e23b149f9af8b5fe4b973d0,2020-01-02 05:36:50-08:00,9.025278,6.687500,2020-01-02 14:38:21-08:00,2020-01-02 12:18:05-08:00,33.097,1_1_193_825_2020-01-02 13:36:49.599853,1-1-193-825,4275.0,250.0,2020-01-02 15:31:50-08:00,280.0,595.0,70.00,2020-01-02 05:37:11-08:00,1,Thursday,January
2,5e23b149f9af8b5fe4b973d1,2020-01-02 05:56:35-08:00,10.713056,2.641944,2020-01-02 16:39:22-08:00,2020-01-02 08:35:06-08:00,6.521,1_1_193_829_2020-01-02 13:56:35.214993,1-1-193-829,344.0,20.0,2020-01-02 06:56:35-08:00,400.0,60.0,8.00,2020-01-02 05:57:17-08:00,1,Thursday,January
3,5e23b149f9af8b5fe4b973d2,2020-01-02 05:59:58-08:00,2.644722,1.313056,2020-01-02 08:38:39-08:00,2020-01-02 07:18:45-08:00,2.355,1_1_193_820_2020-01-02 13:59:58.309319,1-1-193-820,1117.0,20.0,2020-01-02 07:04:58-08:00,400.0,65.0,8.00,2020-01-02 06:00:03-08:00,1,Thursday,January
4,5e23b149f9af8b5fe4b973d3,2020-01-02 06:00:01-08:00,8.144167,4.291389,2020-01-02 14:08:40-08:00,2020-01-02 10:17:30-08:00,13.375,1_1_193_819_2020-01-02 14:00:00.779967,1-1-193-819,334.0,40.0,2020-01-02 14:24:01-08:00,400.0,504.0,16.00,2020-01-02 06:00:13-08:00,1,Thursday,January
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
65032,5d2fbdd3f9af8b4d0dd0d54a,2019-07-01 14:49:12-07:00,3.801667,1.422500,2019-07-01 18:37:18-07:00,2019-07-01 16:14:33-07:00,8.399,1_1_179_798_2019-07-01 21:49:11.873404,1-1-179-798,1346.0,80.0,2019-07-01 15:25:12-07:00,400.0,36.0,32.00,2019-07-01 14:49:37-07:00,1,Monday,July
65033,5d2fbdd3f9af8b4d0dd0d54b,2019-07-01 14:58:45-07:00,2.684167,2.693333,2019-07-01 17:39:48-07:00,2019-07-01 17:40:21-07:00,16.864,1_1_179_794_2019-07-01 21:58:44.571011,1-1-179-794,364.0,100.0,2019-07-01 18:52:45-07:00,400.0,234.0,40.00,2019-07-01 14:59:09-07:00,1,Monday,July
65034,5d2fbdd3f9af8b4d0dd0d54c,2019-07-01 15:02:21-07:00,2.941389,2.950556,2019-07-01 17:58:50-07:00,2019-07-01 17:59:23-07:00,18.335,1_1_191_807_2019-07-01 22:02:20.810735,1-1-191-807,2050.0,90.0,2019-07-01 20:11:21-07:00,333.0,309.0,29.97,2019-07-01 15:02:37-07:00,1,Monday,July
65035,5d2fbdd3f9af8b4d0dd0d54d,2019-07-01 15:23:44-07:00,3.664444,3.672778,2019-07-01 19:03:36-07:00,2019-07-01 19:04:06-07:00,22.815,1_1_179_781_2019-07-01 22:23:32.496137,1-1-179-781,1626.0,170.0,2019-07-01 17:02:44-07:00,200.0,99.0,34.00,2019-07-01 15:23:43-07:00,1,Monday,July


# Dimensionality Reduction

To reduce the dimensionality of our dataset, we will remove certain variables.

## Domain Knowledge

We will drop columns from which we don't expect to gain any insights from the data for our analysis. For example, we don't expect unique identifiers for the records (*id*, *sessionID*) to be useful. Further, since we don't want to analyze per *user* or *station*, we can also drop the columns *userID* as well as *stationID*. The date *modifiedAt* also seems to be useless. We have also derived the *connectionDuration* and *chargingDuration* from the *disconnectTime* and *doneChargingTime*, so we can drop the latter two since their information would be redundant.

In [19]:
df = df.drop(columns=["id", "sessionID", "stationID", "userID", "modifiedAt", "disconnectTime", "doneChargingTime"])

Let us also observe whether the *requestedDeparture* column is redundant:

In [20]:
print("Records where requestedDeparture deviates more than 1min from connectionTime + minutesAvailable: ",
      len(df[(df['requestedDeparture'].notnull()) & (abs(df['requestedDeparture'] - (df['connectionTime'] + pd.to_timedelta(df['minutesAvailable'], unit='m'))) > pd.Timedelta('1 minute'))]))
print("Records where requestedDeparture deviates less than (or equal to) 1min from connectionTime + minutesAvailable: ",
      len(df[(df['requestedDeparture'].notnull()) & (abs(df['requestedDeparture'] - (df['connectionTime'] + pd.to_timedelta(df['minutesAvailable'], unit='m'))) <= pd.Timedelta('1 minute'))]))

Records where requestedDeparture deviates more than 1min from connectionTime + minutesAvailable:  0
Records where requestedDeparture deviates less than (or equal to) 1min from connectionTime + minutesAvailable:  47822


We see that we can derive the requestedDeparture from the connectionTime and minutesAvailable, so we will drop it as well.

In [21]:
df = df.drop(columns=["requestedDeparture"])

Similarly, we also suspect that we can derive the *milesRequested* by computing kWhRequested * 1000 / WhPerMile.

In [22]:
print("Records where milesRequested deviates more than 1mile from kWhRequested * 1000 / WhPerMile: ",
      len(df[(df['milesRequested'].notnull()) & (abs(df['milesRequested'] - (df['kWhRequested'] * 1000 / df['WhPerMile'])) > 1)]))
print("Records where milesRequested deviates less than (or equal to) 1mile from kWhRequested * 1000 / WhPerMile: ",
      len(df[(df['milesRequested'].notnull()) & (abs(df['milesRequested'] - (df['kWhRequested'] * 1000 / df['WhPerMile'])) <= 1)]))

Records where milesRequested deviates more than 1mile from kWhRequested * 1000 / WhPerMile:  0
Records where milesRequested deviates less than (or equal to) 1mile from kWhRequested * 1000 / WhPerMile:  47822


Again, we can drop this column

In [23]:
df = df.drop(columns=["milesRequested"])

In [24]:
df

Unnamed: 0,connectionTime,connectionDuration,chargingDuration,kWhDelivered,WhPerMile,minutesAvailable,kWhRequested,isSiteOne,dayOfWeek,month
0,2020-01-02 05:08:54-08:00,6.039167,4.378056,25.016,250.0,463.0,25.00,1,Thursday,January
1,2020-01-02 05:36:50-08:00,9.025278,6.687500,33.097,280.0,595.0,70.00,1,Thursday,January
2,2020-01-02 05:56:35-08:00,10.713056,2.641944,6.521,400.0,60.0,8.00,1,Thursday,January
3,2020-01-02 05:59:58-08:00,2.644722,1.313056,2.355,400.0,65.0,8.00,1,Thursday,January
4,2020-01-02 06:00:01-08:00,8.144167,4.291389,13.375,400.0,504.0,16.00,1,Thursday,January
...,...,...,...,...,...,...,...,...,...,...
65032,2019-07-01 14:49:12-07:00,3.801667,1.422500,8.399,400.0,36.0,32.00,1,Monday,July
65033,2019-07-01 14:58:45-07:00,2.684167,2.693333,16.864,400.0,234.0,40.00,1,Monday,July
65034,2019-07-01 15:02:21-07:00,2.941389,2.950556,18.335,333.0,309.0,29.97,1,Monday,July
65035,2019-07-01 15:23:44-07:00,3.664444,3.672778,22.815,200.0,99.0,34.00,1,Monday,July


## Invalid Data

Now we also want to check for records that are invalid (and don't make any sense).

In [25]:
print(f"Amount of records with negative connectionDuration: {len(df[df['connectionDuration'] <= 0])}")
print(f"Amount of records with negative chargingDuration: {len(df[df['chargingDuration'] <= 0])}")
print(f"Amount of records with negative kWhDelivered: {len(df[df['kWhDelivered'] <= 0])}")
print(f"Amount of records with negative kWhRequested: {len(df[df['WhPerMile'] <= 0])}")
print(f"Amount of records with negative kWhRequested: {len(df[df['kWhRequested'] <= 0])}")

Amount of records with negative connectionDuration: 0
Amount of records with negative chargingDuration: 29
Amount of records with negative kWhDelivered: 0
Amount of records with negative kWhRequested: 0
Amount of records with negative kWhRequested: 17


Let's we got so few of them, let's get rid of them:

In [26]:
df = df[((df['chargingDuration'] > 0) | (df['chargingDuration'].isnull())) & ((df['kWhRequested'] > 0) | (df['kWhRequested'].isnull()))]
df

Unnamed: 0,connectionTime,connectionDuration,chargingDuration,kWhDelivered,WhPerMile,minutesAvailable,kWhRequested,isSiteOne,dayOfWeek,month
0,2020-01-02 05:08:54-08:00,6.039167,4.378056,25.016,250.0,463.0,25.00,1,Thursday,January
1,2020-01-02 05:36:50-08:00,9.025278,6.687500,33.097,280.0,595.0,70.00,1,Thursday,January
2,2020-01-02 05:56:35-08:00,10.713056,2.641944,6.521,400.0,60.0,8.00,1,Thursday,January
3,2020-01-02 05:59:58-08:00,2.644722,1.313056,2.355,400.0,65.0,8.00,1,Thursday,January
4,2020-01-02 06:00:01-08:00,8.144167,4.291389,13.375,400.0,504.0,16.00,1,Thursday,January
...,...,...,...,...,...,...,...,...,...,...
65032,2019-07-01 14:49:12-07:00,3.801667,1.422500,8.399,400.0,36.0,32.00,1,Monday,July
65033,2019-07-01 14:58:45-07:00,2.684167,2.693333,16.864,400.0,234.0,40.00,1,Monday,July
65034,2019-07-01 15:02:21-07:00,2.941389,2.950556,18.335,333.0,309.0,29.97,1,Monday,July
65035,2019-07-01 15:23:44-07:00,3.664444,3.672778,22.815,200.0,99.0,34.00,1,Monday,July


## Missing Data (again)

Remember that the *doneChargingTime* was missing sometimes? Guess what, with the help of the chargingDuration and connectionDuration, we will now impute this value. The thought process behind it is this: We can't really compute the *chargingDuration* by a mean of all values or something as this could lead to a lot of wrong values (chargingDurations might be longer than the connectionDuration). We come up with another approach: We calculate the average ratio of chargingDuration / connectionDuration. Since we always got the connectionDuration, we can impute the chargingDuration by multiplying it with the computed ratio.

In [29]:
connectionDurationSum = df[df['chargingDuration'].notnull()].connectionDuration.sum()
chargingDurationSum = df[df['chargingDuration'].notnull()].chargingDuration.sum()
print(chargingDurationSum / connectionDurationSum)
df.loc[df['chargingDuration'].isnull(), 'chargingDuration'] = df.connectionDuration * (chargingDurationSum / connectionDurationSum)
df.describe()

0.5529452055320566


Unnamed: 0,connectionDuration,chargingDuration,kWhDelivered,WhPerMile,minutesAvailable,kWhRequested,isSiteOne
count,64991.0,64991.0,64991.0,47799.0,47799.0,47799.0,64991.0
mean,6.275194,3.469838,11.737721,361.491621,357.413021,25.09885,0.517179
std,4.882291,3.028205,10.279735,101.926041,196.295697,20.785285,0.499709
min,0.034444,0.000833,0.501,50.0,1.0,0.256,0.0
25%,2.833889,1.558611,5.063,288.0,192.0,12.0,0.0
50%,6.161667,2.735278,9.103,357.0,360.0,18.78,1.0
75%,9.206944,4.719028,14.1275,400.0,497.0,32.0,1.0
max,245.269167,200.015833,108.797242,2000.0,10062.0,215.32,1.0


In [30]:
df.to_csv('resources/csv_files/charging_sessions_prepared.csv', index=False)
df.to_pickle('resources/pickle_files/charging_sessions_prepared.pkl')