# FINAL PROJECT
## CHAPTER 1 - DATA  PREPARATION - OBTAINING MINEABLE VIEW

# STEP 1 - PREPARE THE ENVIRONMENT

In [3]:
# swich to english 
Sys.setenv(LANG = "en")

# prepare environmental variables for the project
WORKING_DIRECTORY <- "C:\\Users\\User\\Desktop\\Studia\\ERASMUS\\DATA_SCIENCE\\FINAL PROJECT\\"
DATASET_PATH <- "datasets\\"

# Files names 
RACE_2021 <- "RACETRACK_2021_DATA\\18-09-2021_THE_RACE_24H.csv"
FIRST_TESTS <- "RACETRACK_2021_DATA\\16-09-2021_FIRST_TEST_2H_NO_GEOMETRY.csv"
#SECOND_TEST <- "DATA NOT AVAILABLE YET"

# import important libraries
library(ggplot2)
library(gridExtra)
library(reshape2)
library(plyr)

_____________________________
# STEP 2 - LOAD THE DATASET

In [4]:
race_2021_df <- read.csv(
    paste(WORKING_DIRECTORY, DATASET_PATH, RACE_2021, sep=''),
    header=TRUE, 
    sep=';')
head(race_2021_df)

X_id,X_t,General.Timestamp,General.ReceivedTime,General.ThrottlePosition,General.MotorController,General.MainSwitch,General.MotorDirection,General.PowerEco,General.RegenerationBrake,...,RawBytes.285,RawBytes.286,RawBytes.287,RawBytes.288,RawBytes.289,RawBytes.290,RawBytes.291,RawBytes.292,RawBytes.293,RawBytes.294
6145c6b1f02fbf4eb5a5759c,RecentCar,18.09.2021 13:00,18.09.2021 13:00,0,5,PRAWDA,FAŁSZ,PRAWDA,0,...,215,163,112,61,10,215,211,63,3,2
6145c6b2f02fbf4eb5a5759d,RecentCar,18.09.2021 13:00,18.09.2021 13:00,0,5,PRAWDA,FAŁSZ,PRAWDA,0,...,215,163,112,61,10,215,211,63,3,2
6145c6b3f02fbf4eb5a5759e,RecentCar,18.09.2021 13:00,18.09.2021 13:00,0,5,PRAWDA,FAŁSZ,PRAWDA,0,...,215,163,112,61,10,215,211,63,3,2
6145c6b4f02fbf4eb5a5759f,RecentCar,18.09.2021 13:00,18.09.2021 13:00,0,5,PRAWDA,FAŁSZ,PRAWDA,0,...,215,163,112,61,10,215,211,63,3,2
6145c6b5f02fbf4eb5a575a0,RecentCar,18.09.2021 13:00,18.09.2021 13:00,0,5,PRAWDA,FAŁSZ,PRAWDA,0,...,215,163,112,61,10,215,211,63,3,2
6145c6b6f02fbf4eb5a575a1,RecentCar,18.09.2021 13:00,18.09.2021 13:00,0,5,PRAWDA,FAŁSZ,PRAWDA,0,...,215,163,112,61,10,215,211,63,3,2


___________________
# STEP 3 - DATA CLEANSING - BASED ON KNOWLEDGE

## >>> 3.1 - Cleaning the data based on the experts knowledge

    Based on the experience of the team, we are able to drop lots of columns 
    which we know will do not influence the final result beacuese they:
        1. Were not implemented in the vehicle correctly
        2. Hardware was malfunctioning
        3. Were collected for software development purposes
        4. Does not influence the final result

In [5]:
ncol(race_2021_df)

Our initial number of columns is __490__. <br>
Almost half of them will not be usefull at all and we are dropping them.

In [6]:
names <- colnames(race_2021_df)
cat(names, sep="\n")

X_id
X_t
General.Timestamp
General.ReceivedTime
General.ThrottlePosition
General.MotorController
General.MainSwitch
General.MotorDirection
General.PowerEco
General.RegenerationBrake
General.CruiseThrottle
General.CruiseDesiredSpeed
General.BatteryError
General.EngineError
General.DriveMode
General.CruiseEngaged
General.Horn
General.HandBrake
General.Temperatures.0
General.Temperatures.1
General.Temperatures.2
General.Temperatures.3
General.Rpm
General.SolarRadiance
General.Speed
General.Mileage
General.MotorTemperature
Battery.RemainingChargeTime
Battery.ChargerEnabled
Battery.SystemState
Battery.InputOutputState
Battery.PackCRate
Battery.StateOfCharge
Battery.StateOfHealth
Battery.NumberOfCellsConnected
Battery.RemainingEnergy
Battery.DeviationOfVoltageInCells
Battery.PackTemperatureMax
Battery.LMUNumberWithMaxTemperature
Battery.PackTemperatureMin
Battery.LMUNumberWithMinTemperature
Battery.CellVoltageMax
Battery.CellNumberWithMaxVoltage
Battery.CellVoltageMin
Battery.CellNumberWithM

In [7]:
# Drop columns with Row Bytes (it is just a backup in case of data parsing failure)
race_2021_clean_df = race_2021_df[,!grepl("^RawBytes",names(race_2021_df))]

# Drop columns with Tyres (due to lack of hardware)
race_2021_clean_df = race_2021_clean_df[,!grepl("^Tires",names(race_2021_clean_df))]

# Drop GPS data (due to failure during 18-19 September ONLY)
race_2021_clean_df = race_2021_clean_df[,!grepl("^Gps",names(race_2021_clean_df))]

# Drop columns with warnings - as they are based on numerical values received in the data
race_2021_clean_df = race_2021_clean_df[,!grepl("^Battery.Warnings",names(race_2021_clean_df))]

# Drop columns with errors - as they are based on numerical values received in the data
race_2021_clean_df = race_2021_clean_df[,!grepl("^Battery.Errors",names(race_2021_clean_df))]

# Drop columns with lights - as they does not influence the final result due to negligible power impact
race_2021_clean_df = race_2021_clean_df[,!grepl("^Lights",names(race_2021_clean_df))]

# Other columns containing misleading / failed information
cols_to_drop <- c(
    "X_id",  # DB ID
    "X_t",    # DB Info
    "General.ReceivedTime",          # Redundant with General.Timestamp
    "General.CruiseThrottle",        # Not used during the race
    "General.CruiseDesiredSpeed",    # Not used during the race
    "General.EngineError",           # Not implemented
    "General.CruiseEngaged",         # Not used during the race
    "General.HandBrake",             # Probably not working - Verify
    "General.Temperatures.0",        # Not implemented           
    "General.Temperatures.1",        # Not implemented
    "General.Temperatures.2",        # Not implemented
    "General.Temperatures.3",        # Not implemented
    "General.SolarRadiance",         # Not implemented
    "General.BatteryError",          # Not implemented
    "LeftEngine",                    # Not implemented
    "RightEngine",                   # Not implemented
    "Solar.MpptPcbTemperature.0",    # Not implemented
    "Solar.MpptPcbTemperature.1",    # Not implemented
    "Solar.MpptPcbTemperature.2",    # Not implemented
    "Solar.MpptPcbTemperature.3",    # Not implemented
    "Solar.MpptMosfetTemperature.0", # Not implemented
    "Solar.MpptMosfetTemperature.1", # Not implemented
    "Solar.MpptMosfetTemperature.2", # Not implemented
    "Solar.MpptMosfetTemperature.3"  # Not implemented
)

race_2021_clean_df = race_2021_clean_df[,!(names(race_2021_clean_df) %in% cols_to_drop)]

In [8]:
# Print the available columns
ncol(race_2021_clean_df)
names_clean <- colnames(race_2021_clean_df)

### THE MOST USELESS COLUMNS HAS BEEN DROPPED
Now we need to investigate the issues with other columns (if exist)

____________________________________________________

# STEP 4 - DATA CLEANSING - BASED ON DATA

## >>> 4.1 - Timestamp
Timestamp was imported as a factor - we want to have it in a good for data mining way - as a date

In [36]:
str(race_2021_clean_df$General.Timestamp)

 POSIXct[1:72471], format: "2021-09-18 13:00:00" "2021-09-18 13:00:00" "2021-09-18 13:00:00" ...


In [33]:
# "18.09.2021 13:00"
race_2021_clean_df$General.Timestamp <- strptime(
    x=as.character(race_2021_clean_df$General.Timestamp),
    format = "%d.%m.%Y %H:%M")
race_2021_clean_df$General.Timestamp <- as.POSIXct(x=race_2021_clean_df$General.Timestamp)

In [35]:
str(race_2021_clean_df$General.Timestamp)

 POSIXct[1:72471], format: "2021-09-18 13:00:00" "2021-09-18 13:00:00" "2021-09-18 13:00:00" ...


## >>> 4.2 - Data translation 
As the data was collected in Polish - all boolean variables must be translated to obtain a clear minable view

In [9]:
str(race_2021_clean_df[,0:5])

'data.frame':	72471 obs. of  5 variables:
 $ General.Timestamp       : Factor w/ 1392 levels "18.09.2021 13:00",..: 1 1 1 1 1 1 1 1 1 1 ...
 $ General.ThrottlePosition: int  0 0 0 0 0 0 0 0 0 0 ...
 $ General.MotorController : int  5 5 5 5 5 5 5 5 5 5 ...
 $ General.MainSwitch      : Factor w/ 2 levels "FAŁSZ","PRAWDA": 2 2 2 2 2 2 2 2 2 2 ...
 $ General.MotorDirection  : Factor w/ 2 levels "FAŁSZ","PRAWDA": 1 1 1 1 1 1 1 1 1 1 ...


In [10]:
head(race_2021_clean_df)

General.Timestamp,General.ThrottlePosition,General.MotorController,General.MainSwitch,General.MotorDirection,General.PowerEco,General.RegenerationBrake,General.DriveMode,General.Horn,General.Rpm,...,Solar.MpptInputCurrent.3,Solar.MpptOutputVoltage.0,Solar.MpptOutputVoltage.1,Solar.MpptOutputVoltage.2,Solar.MpptOutputVoltage.3,Solar.MpptOutputPower.0,Solar.MpptOutputPower.1,Solar.MpptOutputPower.2,Solar.MpptOutputPower.3,Solar.PowerMppt
18.09.2021 13:00,0,5,PRAWDA,FAŁSZ,PRAWDA,0,2,0,0,...,1784.162,124,124,123,125,154761,787,2190,75253,232991
18.09.2021 13:00,0,5,PRAWDA,FAŁSZ,PRAWDA,0,2,0,0,...,1784.162,124,124,123,125,154761,787,2190,75253,232991
18.09.2021 13:00,0,5,PRAWDA,FAŁSZ,PRAWDA,0,2,0,0,...,1784.162,124,124,123,125,154761,787,2190,75253,232991
18.09.2021 13:00,0,5,PRAWDA,FAŁSZ,PRAWDA,0,2,0,0,...,1784.162,124,124,123,125,154761,787,2190,75253,232991
18.09.2021 13:00,0,5,PRAWDA,FAŁSZ,PRAWDA,0,2,0,0,...,1784.162,124,124,123,125,154761,787,2190,75253,232991
18.09.2021 13:00,0,5,PRAWDA,FAŁSZ,PRAWDA,0,2,0,0,...,1784.162,124,124,123,125,154761,787,2190,75253,232991


In [11]:
race_2021_clean_df[] <- lapply(
    race_2021_clean_df, 
    function(x) 
        if(all(x %in% c('PRAWDA', 'FAŁSZ'), na.rm = TRUE)) x == 'PRAWDA' else x)

In [12]:
str(race_2021_clean_df[,0:5])

'data.frame':	72471 obs. of  5 variables:
 $ General.Timestamp       : Factor w/ 1392 levels "18.09.2021 13:00",..: 1 1 1 1 1 1 1 1 1 1 ...
 $ General.ThrottlePosition: int  0 0 0 0 0 0 0 0 0 0 ...
 $ General.MotorController : int  5 5 5 5 5 5 5 5 5 5 ...
 $ General.MainSwitch      : logi  TRUE TRUE TRUE TRUE TRUE TRUE ...
 $ General.MotorDirection  : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...


In [13]:
head(race_2021_clean_df)

General.Timestamp,General.ThrottlePosition,General.MotorController,General.MainSwitch,General.MotorDirection,General.PowerEco,General.RegenerationBrake,General.DriveMode,General.Horn,General.Rpm,...,Solar.MpptInputCurrent.3,Solar.MpptOutputVoltage.0,Solar.MpptOutputVoltage.1,Solar.MpptOutputVoltage.2,Solar.MpptOutputVoltage.3,Solar.MpptOutputPower.0,Solar.MpptOutputPower.1,Solar.MpptOutputPower.2,Solar.MpptOutputPower.3,Solar.PowerMppt
18.09.2021 13:00,0,5,True,False,True,0,2,0,0,...,1784.162,124,124,123,125,154761,787,2190,75253,232991
18.09.2021 13:00,0,5,True,False,True,0,2,0,0,...,1784.162,124,124,123,125,154761,787,2190,75253,232991
18.09.2021 13:00,0,5,True,False,True,0,2,0,0,...,1784.162,124,124,123,125,154761,787,2190,75253,232991
18.09.2021 13:00,0,5,True,False,True,0,2,0,0,...,1784.162,124,124,123,125,154761,787,2190,75253,232991
18.09.2021 13:00,0,5,True,False,True,0,2,0,0,...,1784.162,124,124,123,125,154761,787,2190,75253,232991
18.09.2021 13:00,0,5,True,False,True,0,2,0,0,...,1784.162,124,124,123,125,154761,787,2190,75253,232991


## >>> 4.3 -  Data cleaning - verification of the data

In [37]:
summary(race_2021_clean_df)

 General.Timestamp             General.ThrottlePosition General.MotorController
 Min.   :2021-09-18 13:00:00   Min.   :  0.00           Min.   :0.000          
 1st Qu.:2021-09-18 18:44:00   1st Qu.:  0.00           1st Qu.:1.000          
 Median :2021-09-19 00:43:00   Median : 82.00           Median :1.000          
 Mean   :2021-09-19 00:43:35   Mean   : 99.77           Mean   :2.705          
 3rd Qu.:2021-09-19 06:34:00   3rd Qu.:255.00           3rd Qu.:5.000          
 Max.   :2021-09-19 13:00:00   Max.   :255.00           Max.   :5.000          
 General.MainSwitch General.MotorDirection General.PowerEco
 Mode :logical      Mode :logical          Mode :logical   
 FALSE:640          FALSE:70026            FALSE:42641     
 TRUE :71831        TRUE :2445             TRUE :29830     
                                                           
                                                           
                                                           
 General.Regeneratio

In [38]:
# Besed on information from summary drop columns with no value
cols_to_drop <- c(
    "Battery.ChargerEnabled",  # only falses - car was not charging
    "Battery.StateOfCharge",   # data corrupted - values should be from 100 to 0, are from 17 to 0
    "Battery.NumberOfCellsConnected", # no data
    "Battery.RemainingChargeTime", # data not distributed correctly
    "Battery.RemainingEnergy" # data not distributed correctly
)

race_2021_clean_df = race_2021_clean_df[,!(names(race_2021_clean_df) %in% cols_to_drop)]
ncol(race_2021_clean_df)

# STEP 5 - DATA TRANSFORMATION
Some data can be better extracted from the data


## >>> 5.1 - Accurate battery voltage 
The variable in the dataset "PackVoltage" was not precise, it was expressed as integer. <br>
Having voltage of each cell, we are able to create a new, better variable

In [39]:
cols_to_sum <- c(
    'Battery.BatteryCells.CellsGroups.0.Voltages.0',
    'Battery.BatteryCells.CellsGroups.0.Voltages.1',
    'Battery.BatteryCells.CellsGroups.0.Voltages.2',
    'Battery.BatteryCells.CellsGroups.0.Voltages.3',
    'Battery.BatteryCells.CellsGroups.0.Voltages.4',
    'Battery.BatteryCells.CellsGroups.0.Voltages.5',
    'Battery.BatteryCells.CellsGroups.0.Voltages.6',
    'Battery.BatteryCells.CellsGroups.0.Voltages.7',
    
    'Battery.BatteryCells.CellsGroups.1.Voltages.0',
    'Battery.BatteryCells.CellsGroups.1.Voltages.1',
    'Battery.BatteryCells.CellsGroups.1.Voltages.2',
    'Battery.BatteryCells.CellsGroups.1.Voltages.3',
    'Battery.BatteryCells.CellsGroups.1.Voltages.4',
    'Battery.BatteryCells.CellsGroups.1.Voltages.5',
    'Battery.BatteryCells.CellsGroups.1.Voltages.6',
    'Battery.BatteryCells.CellsGroups.1.Voltages.7',
    
    'Battery.BatteryCells.CellsGroups.2.Voltages.0',
    'Battery.BatteryCells.CellsGroups.2.Voltages.1',
    'Battery.BatteryCells.CellsGroups.2.Voltages.2',
    'Battery.BatteryCells.CellsGroups.2.Voltages.3',
    'Battery.BatteryCells.CellsGroups.2.Voltages.4',
    'Battery.BatteryCells.CellsGroups.2.Voltages.5',
    'Battery.BatteryCells.CellsGroups.2.Voltages.6',
    'Battery.BatteryCells.CellsGroups.2.Voltages.7',
    
    'Battery.BatteryCells.CellsGroups.3.Voltages.0',
    'Battery.BatteryCells.CellsGroups.3.Voltages.1',
    'Battery.BatteryCells.CellsGroups.3.Voltages.2',
    'Battery.BatteryCells.CellsGroups.3.Voltages.3',
    'Battery.BatteryCells.CellsGroups.3.Voltages.4',
    'Battery.BatteryCells.CellsGroups.3.Voltages.5',
    'Battery.BatteryCells.CellsGroups.3.Voltages.6',
    'Battery.BatteryCells.CellsGroups.3.Voltages.7')

race_2021_clean_df$Calc.BatteryVoltage <- rowSums(race_2021_clean_df[, cols_to_sum]) / 1000 # to get Volts
head(race_2021_clean_df$Calc.BatteryVoltage)
tail(race_2021_clean_df$Calc.BatteryVoltage)

## >>> 5.2 - Accurate Timestamp
Due to malfunction during data extraction the precise data about the timestamp has been lost. We are going to improve TimeStamp by adding the information about seconds and miliseconds of the measuremants, knowing that they has been collected with the constant frequency 

In [40]:
head(race_2021_clean_df,10)

General.Timestamp,General.ThrottlePosition,General.MotorController,General.MainSwitch,General.MotorDirection,General.PowerEco,General.RegenerationBrake,General.DriveMode,General.Horn,General.Rpm,...,Solar.MpptOutputVoltage.0,Solar.MpptOutputVoltage.1,Solar.MpptOutputVoltage.2,Solar.MpptOutputVoltage.3,Solar.MpptOutputPower.0,Solar.MpptOutputPower.1,Solar.MpptOutputPower.2,Solar.MpptOutputPower.3,Solar.PowerMppt,Calc.BatteryVoltage
2021-09-18 13:00:00,0,5,True,False,True,0,2,0,0,...,124,124,123,125,154761,787,2190,75253,232991,124.773
2021-09-18 13:00:00,0,5,True,False,True,0,2,0,0,...,124,124,123,125,154761,787,2190,75253,232991,124.772
2021-09-18 13:00:00,0,5,True,False,True,0,2,0,0,...,124,124,123,125,154761,787,2190,75253,232991,124.773
2021-09-18 13:00:00,0,5,True,False,True,0,2,0,0,...,124,124,123,125,154761,787,2190,75253,232991,124.773
2021-09-18 13:00:00,0,5,True,False,True,0,2,0,0,...,124,124,123,125,154761,787,2190,75253,232991,124.772
2021-09-18 13:00:00,0,5,True,False,True,0,2,0,0,...,124,124,123,125,154761,787,2190,75253,232991,124.773
2021-09-18 13:00:00,0,5,True,False,True,0,2,0,0,...,124,124,123,125,154761,787,2190,75253,232991,124.774
2021-09-18 13:00:00,0,5,True,False,True,0,2,0,0,...,124,124,123,125,154761,787,2190,75253,232991,124.773
2021-09-18 13:00:00,0,5,True,False,True,0,2,0,0,...,124,124,123,125,154761,787,2190,75253,232991,124.773
2021-09-18 13:00:00,0,5,True,False,True,0,2,0,0,...,124,124,123,125,154761,787,2190,75253,232991,124.773


In [77]:
initialTimestamp <- race_2021_clean_df[1, 'General.Timestamp']

rowIndex <- 1
for(i in 1:nrow(race_2021_clean_df)){
    tempIndex <- i
    tempTimestamp <- race_2021_clean_df[i, 'General.Timestamp']
    time_difference <- tempTimestamp - initialTimestamp
    
    # if we find time difference
    if(time_difference == 1)
    {
        data_rows = tempIndex - rowIndex
        time_delta = 60 / data_rows
#         print(time_delta)
        for(j in rowIndex: i)
        {
#             print(race_2021_clean_df[j, 'General.Timestamp'] + j * time_delta)
        }
        rowIndex <- i
        initialTimestamp <- tempTimestamp
    }
    
    if (time_difference > 1)
        {
        print("cipa")
        print(i)
        print(time_difference)
        break
    }
}

[1] "cipa"
[1] 6264
Time difference of 3 mins


In [67]:
race_2021_clean_df$Calc.Timestamp

    [1] "2021-09-18 13:00:00 CEST" "2021-09-18 13:00:00 CEST"
    [3] "2021-09-18 13:00:00 CEST" "2021-09-18 13:00:00 CEST"
    [5] "2021-09-18 13:00:00 CEST" "2021-09-18 13:00:00 CEST"
    [7] "2021-09-18 13:00:00 CEST" "2021-09-18 13:00:00 CEST"
    [9] "2021-09-18 13:00:00 CEST" "2021-09-18 13:00:00 CEST"
   [11] "2021-09-18 13:00:00 CEST" "2021-09-18 13:00:00 CEST"
   [13] "2021-09-18 13:00:00 CEST" "2021-09-18 13:00:00 CEST"
   [15] "2021-09-18 13:00:00 CEST" "2021-09-18 13:00:00 CEST"
   [17] "2021-09-18 13:00:00 CEST" "2021-09-18 13:00:00 CEST"
   [19] "2021-09-18 13:00:00 CEST" "2021-09-18 13:00:00 CEST"
   [21] "2021-09-18 13:00:00 CEST" "2021-09-18 13:00:00 CEST"
   [23] "2021-09-18 13:00:00 CEST" "2021-09-18 13:00:00 CEST"
   [25] "2021-09-18 13:00:00 CEST" "2021-09-18 13:00:00 CEST"
   [27] "2021-09-18 13:00:00 CEST" "2021-09-18 13:00:00 CEST"
   [29] "2021-09-18 13:00:00 CEST" "2021-09-18 13:00:00 CEST"
   [31] "2021-09-18 13:00:00 CEST" "2021-09-18 13:00:00 CEST"
   [33] 

In [68]:
str(race_2021_clean_df$General.Timestamp)

 POSIXct[1:72471], format: "2021-09-18 13:00:00" "2021-09-18 13:00:00" "2021-09-18 13:00:00" ...


# DATA CLEANING DONE - NOW IT IS TIME TO DO SOME MAGIC

In [14]:
race_2021_clean_df_2 <- race_2021_clean_df
race_2021_clean_df_2$General.Timestamp  <- from.timestamp(race_2021_clean_df_2$General.Timestamp)

ERROR: Error in from.timestamp(race_2021_clean_df_2$General.Timestamp): could not find function "from.timestamp"


### Może weźmy dane:
#### DO OPISU BATERII:
1. State of Charge ---> battery voltage
2. Min Cell Voltage
3. Max Cell Voltage
4. Pack Temperature Max
5. Pack Temperature Min


### DO OPISU STYLU JAZDY KIEROWCY
1. Throttle Position
2. Power Balance
3. Regenerative breaking?
4. General.DriveMode
5. General.Speed


### DO OPISU DOŁADOWAŃ
1. MPPT output
2. Regenerative breaking

In [40]:
# read in the packages we'll use
library(keras) # for deep learning
library(tidyverse) # general utility functions
library(caret) # machine learning utility functions

# Estimate battery voltage after serious of data 

Registered S3 method overwritten by 'rvest':
  method            from
  read_xml.response xml2
-- [1mAttaching packages[22m --------------------------------------- tidyverse 1.2.1 --
[32mv[39m [34mtibble [39m 3.1.1       [32mv[39m [34mpurrr  [39m 0.3.4  
[32mv[39m [34mtidyr  [39m 0.8.3       [32mv[39m [34mdplyr  [39m 0.8.0.[31m1[39m
[32mv[39m [34mreadr  [39m 1.3.1       [32mv[39m [34mstringr[39m 1.4.0  
[32mv[39m [34mtibble [39m 3.1.1       [32mv[39m [34mforcats[39m 0.4.0  
"package 'purrr' was built under R version 3.6.3"-- [1mConflicts[22m ------------------------------------------ tidyverse_conflicts() --
[31mx[39m [34mdplyr[39m::[32marrange()[39m   masks [34mplyr[39m::arrange()
[31mx[39m [34mdplyr[39m::[32mcombine()[39m   masks [34mgridExtra[39m::combine()
[31mx[39m [34mpurrr[39m::[32mcompact()[39m   masks [34mplyr[39m::compact()
[31mx[39m [34mdplyr[39m::[32mcount()[39m     masks [34mplyr[39m::count()
[31mx[39m

In [None]:
head(race_2021_clean_df)

In [15]:
colnames(race_2021_clean_df)

In [56]:
library(tensorflow)
library(keras)

MAX_LEN <- 500 # the number of previous examples we will look at to predict next item
BATCH_SIZE <- 32 # the number of distinct sequences to look at at one time during training
TOTAL_EPOCHS <- 5 # how many times we will look at the whole dataset while training

set.seed(123) # for reproductability

In [34]:
cols_to_use <- c(
    "Battery.PackVoltage",
    "Battery.CellVoltageMax",
    "Battery.CellVoltageMax",
    "Battery.DeviationOfVoltageInCells",
    "Battery.CellAvgVoltage",
    "Battery.PackTemperatureMax",
    "Battery.PackTemperatureMin",
    "Battery.PowerBalance")

racing_rnn_battery_df <- race_2021_clean_df[,(names(race_2021_clean_df) %in% cols_to_use)]
head(racing_rnn_battery_df)

Battery.DeviationOfVoltageInCells,Battery.PackTemperatureMax,Battery.PackTemperatureMin,Battery.CellVoltageMax,Battery.CellAvgVoltage,Battery.PackVoltage,Battery.PowerBalance
22,19,18,4.179,4.167,122,97.6
22,19,18,4.179,4.167,122,97.6
22,19,18,4.179,4.167,122,97.6
22,19,18,4.179,4.167,122,97.6
22,19,18,4.179,4.167,122,97.6
22,19,18,4.179,4.167,122,97.6


In [35]:
avg_voltage <- racing_rnn_battery_df$Battery.CellAvgVoltage
summary(avg_voltage)

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  0.000   3.495   3.716   3.674   3.949   5.000 

In [36]:
start_indexes <- seq(1, length(avg_voltage) - (MAX_LEN + 1), by = 3)
avg_matrix_voltage <- matrix(nrow = length(start_indexes), ncol = MAX_LEN + 1)
for (i in 1:length(start_indexes)){
  avg_matrix_voltage[i,] <- avg_voltage[start_indexes[i]:(start_indexes[i] + MAX_LEN)]
}

In [37]:
# make sure it's numeric
avg_matrix_voltage <- avg_matrix_voltage * 1

# remove na's if you have them
if(anyNA(avg_matrix_voltage)){
    avg_matrix_voltage <- na.omit(avg_matrix_voltage)
}

In [38]:
# split our data into the day we're predict (y), and the 
# sequence of days leading up to it (X)

X <- avg_matrix_voltage[,-ncol(avg_matrix_voltage)]
y <- avg_matrix_voltage[,ncol(avg_matrix_voltage)]

In [41]:
# create an index to split our data into testing & training sets
library(tensorflow)
training_index <- createDataPartition(y, p = .9, 
                                  list = FALSE, 
                                  times = 1)

# training data
X_train <- array(X[training_index,], dim = c(length(training_index), MAX_LEN, 1))
y_train <- y[training_index]

# testing data
X_test <- array(X[-training_index,], dim = c(length(y) - length(training_index), MAX_LEN, 1))
y_test <- y[-training_index]

In [45]:
# initialize our model



In [46]:
# dimensions of our input data
dim(X_train)

In [54]:
# our input layer
model <- keras_model_sequential()

model %>% 
    layer_dense(units=MAX_LEN, input_shape = dim(X_train)[2:3]) %>%
    layer_activation('relu') %>%
    layer_dense(units = 10) %>%
    layer_activation('softmax')

In [52]:
summary(model)

Model: "sequential_3"
________________________________________________________________________________
Layer (type)                        Output Shape                    Param #     
dense_10 (Dense)                    (None, 500, 500)                1000        
________________________________________________________________________________
activation_5 (Activation)           (None, 500, 500)                0           
________________________________________________________________________________
dense_9 (Dense)                     (None, 500, 10)                 5010        
________________________________________________________________________________
activation_4 (Activation)           (None, 500, 10)                 0           
Total params: 6,010
Trainable params: 6,010
Non-trainable params: 0
________________________________________________________________________________


In [67]:
# create metric using backend tensor functions
metric_mean_pred <- custom_metric("mean_pred", function(y_true, y_pred) {
  k_mean(y_pred) 
})

model %>% compile( 
  optimizer = optimizer_rmsprop(),
  loss = loss_binary_crossentropy,
  metrics = c('accuracy', metric_mean_pred)
)

ERROR: Error in py_call_impl(callable, dots$args, dots$keywords): ValueError: ('`tf.compat.v1.keras` Optimizer (', <tensorflow.python.keras.optimizers.RMSprop object at 0x0000000049989A60>, ') is not supported when eager execution is enabled. Use a `tf.keras` Optimizer instead, or disable eager execution.')

Detailed traceback: 
  File "D:\Oprogramowanie_studia\Anaconda_Files\lib\site-packages\tensorflow\python\keras\engine\training.py", line 538, in compile
    self._validate_compile(optimizer, metrics, **kwargs)
  File "D:\Oprogramowanie_studia\Anaconda_Files\lib\site-packages\tensorflow\python\keras\engine\training.py", line 2477, in _validate_compile
    raise ValueError(



In [63]:
# Actually train our model! This step will take a while
trained_model <- model %>% 
    fit(
        x = X_train, 
        y = y_train,
        batch_size = BATCH_SIZE, 
        epochs = TOTAL_EPOCHS, 
        validation_split = 0.1)

ERROR: Error in py_call_impl(callable, dots$args, dots$keywords): RuntimeError: Evaluation error: invalid argument type.

Detailed traceback: 
  File "D:\Oprogramowanie_studia\Anaconda_Files\lib\site-packages\tensorflow\python\keras\engine\training.py", line 108, in _method_wrapper
    return method(self, *args, **kwargs)
  File "D:\Oprogramowanie_studia\Anaconda_Files\lib\site-packages\tensorflow\python\keras\engine\training.py", line 1079, in fit
    callbacks.on_train_begin()
  File "D:\Oprogramowanie_studia\Anaconda_Files\lib\site-packages\tensorflow\python\keras\callbacks.py", line 501, in on_train_begin
    callback.on_train_begin(numpy_logs)
  File "D:\Oprogramowanie_studia\Anaconda_Files\Lib\R\library\keras\python\kerastools\callback.py", line 42, in on_train_begin
    self.r_on_train_begin(logs)
  File "D:/Oprogramowanie_studia/Anaconda_Files/Lib/R/library/reticulate/python\rpytools\call.py", line 21, in python_function
    raise RuntimeError(res[kErrorKey])



In [62]:
head(y_train)

In [None]:
y_train

In [None]:
# get a list of start indexes for our (overlapping) chunks
# start_indexes <- seq(
#     from=1,
#     to=nrow(racing_rnn_battery_df) - (MAX_LEN + 1),
#     by=3)

# fill our matrix with the overlapping slices of our dataset
racing_rnn_battery_mx <- data.matrix(racing_rnn_battery_df)


start_indexes <- seq(
    from=1,
    to=length(racing_rnn_battery_mx) - (MAX_LEN + 1),
    by=3)

# create an empty matrix to store our data in
battery_matrix <- matrix(
    nrow = length(start_indexes), 
    ncol = MAX_LEN + 1)


for (i in 1:length(start_indexes)){
    print(i)
  battery_matrix[i,] <- 
    split(racing_rnn_battery_mx[start_indexes[i]:(start_indexes[i] + MAX_LEN),], 
      seq(nrow(racing_rnn_battery_mx[start_indexes[1]:(start_indexes[i] + MAX_LEN),])))  
}

In [None]:
# fill our matrix with the overlapping slices of our dataset
racing_rnn_battery_df[1,]

In [None]:
racing_rnn_battery_mx[start_indexes[2]:(start_indexes[2] + MAX_LEN),]

In [None]:
ncol(battery_matrix)
nrow(battery_matrix)

In [None]:
split(racing_rnn_battery_mx[start_indexes[1]:(start_indexes[1] + MAX_LEN),], 
      seq(nrow(racing_rnn_battery_mx[start_indexes[1]:(start_indexes[1] + MAX_LEN),])))  



In [None]:
ncol(racing_rnn_battery_df)

In [None]:
length(start_indexes) 

In [None]:
class(battery_matrix)

In [None]:
length(start_indexes)

In [None]:
battery_matrix[5,]

In [None]:
str(race_2021_clean_df)

In [None]:
head(race_2021_clean_df)

# Some nice plots

In [None]:
race_2021_clean_df$newtimestamp <- strptime(race_2021_clean_df$General.Timestamp,format="%dT.%m.%Y %k:%M:%S")

In [None]:
qplot(x=1:nrow(race_2021_clean_df), y=Battery.BatteryCells.CellsGroups.0.Voltages.0, data = race_2021_clean_df, geom = c("point"))

In [None]:
qplot(x=1:nrow(race_2021_clean_df), y=General.Rpm, data = race_2021_clean_df, geom = c("point"))

In [None]:
qplot(x=General.Timestamp, y=General.Rpm, data = race_2021_clean_df[1:3300,], geom = c("point"))
#from first 3300 rows so 1 hour of driving and staying in a pit

In [None]:
qplot(x=General.Timestamp, y=General.ThrottlePosition, data = race_2021_clean_df[1:3300,], geom = c("point"))
#from first 3300 rows so 1 hour of driving and staying in a pit
# lower constant level means ECO-Full throttle; high levels are POWER-Full Throttle

In [None]:
qplot(x=1:nrow(race_2021_clean_df), y=Battery.RemainingEnergy, data = race_2021_clean_df, geom = c("point"))

In [None]:
qplot(x=1:nrow(race_2021_clean_df), y=Battery.PackTemperatureMax, data = race_2021_clean_df, geom = c("point"))

In [None]:
qplot(x=1:nrow(race_2021_clean_df[1:500,]), y=Battery.PowerBalance, data = race_2021_clean_df[1:500,], geom = c("line"))

In [None]:
summary(race_2021_df$Lights.StopLights)

In [None]:
summary(race_2021_df$General.RegenerationBrake)

In [None]:
qplot(x=1:nrow(race_2021_df[1:500,]), y=General.RegenerationBrake, data = race_2021_df[1:500,], geom = c("line"))

In [28]:
library(keras)
model <- keras_model_sequential() 

"path[1]="D:\Oprogramowanie_studia\Anaconda_files\envs\R-env/python.exe": Nie można odnaleźć określonego pliku"

In [29]:
model %>% 
  layer_dense(units = 32, input_shape = c(784)) %>% 
  layer_activation('relu') %>% 
  layer_dense(units = 10) %>% 
  layer_activation('softmax')

In [30]:
summary(model)

Model: "sequential"
________________________________________________________________________________
Layer (type)                        Output Shape                    Param #     
dense_1 (Dense)                     (None, 32)                      25120       
________________________________________________________________________________
activation_1 (Activation)           (None, 32)                      0           
________________________________________________________________________________
dense (Dense)                       (None, 10)                      330         
________________________________________________________________________________
activation (Activation)             (None, 10)                      0           
Total params: 25,450
Trainable params: 25,450
Non-trainable params: 0
________________________________________________________________________________


In [68]:
head(race_2021_clean_df)

General.Timestamp,General.ThrottlePosition,General.MotorController,General.MainSwitch,General.MotorDirection,General.PowerEco,General.RegenerationBrake,General.DriveMode,General.Horn,General.Rpm,...,Solar.MpptInputCurrent.3,Solar.MpptOutputVoltage.0,Solar.MpptOutputVoltage.1,Solar.MpptOutputVoltage.2,Solar.MpptOutputVoltage.3,Solar.MpptOutputPower.0,Solar.MpptOutputPower.1,Solar.MpptOutputPower.2,Solar.MpptOutputPower.3,Solar.PowerMppt
18.09.2021 13:00,0,5,True,False,True,0,2,0,0,...,1784.162,124,124,123,125,154761,787,2190,75253,232991
18.09.2021 13:00,0,5,True,False,True,0,2,0,0,...,1784.162,124,124,123,125,154761,787,2190,75253,232991
18.09.2021 13:00,0,5,True,False,True,0,2,0,0,...,1784.162,124,124,123,125,154761,787,2190,75253,232991
18.09.2021 13:00,0,5,True,False,True,0,2,0,0,...,1784.162,124,124,123,125,154761,787,2190,75253,232991
18.09.2021 13:00,0,5,True,False,True,0,2,0,0,...,1784.162,124,124,123,125,154761,787,2190,75253,232991
18.09.2021 13:00,0,5,True,False,True,0,2,0,0,...,1784.162,124,124,123,125,154761,787,2190,75253,232991


In [69]:
write.csv(race_2021_clean_df, paste(WORKING_DIRECTORY, "cleanRace.csv") , row.names = TRUE)