## R Basics 4: DataFrames

In [2]:
# A data frame is a table: rows are observations and columns are features
# (in data science speak). Each column may hold a different type, but all
# elements within one column share the same type.
# Many of the datasets that ship with R are data frames.

# build a data frame from equal-length vectors
temps <- c(21, 18, 24, 15)
outlook <- c("sunny", "overcast", "cloudy", "rainy")
days <- c("Sat", "Sun", "Mon", "Tue")

# unnamed arguments: the variable names are reused as the column names
df <- data.frame(days, temps, outlook)
df

days,temps,outlook
Sat,21,sunny
Sun,18,overcast
Mon,24,cloudy
Tue,15,rainy


In [3]:
# column labels can be replaced after the data frame has been built
colnames(df) <- c("Day", "Temperature", "Forecast")
df

Day,Temperature,Forecast
Sat,21,sunny
Sun,18,overcast
Mon,24,cloudy
Tue,15,rainy


In [6]:
# Column names can also be given when the data frame is created.
# "Temp Deg C" is not a syntactically valid name, so data.frame() (with its
# default check.names = TRUE) rewrites it to Temp.Deg.C.
# stringsAsFactors = TRUE is spelled out because the default changed to FALSE
# in R 4.0; being explicit keeps the text columns as factors on every R
# version, which is what the str(df2) step relies on.
df2 <- data.frame(
  Day = days,
  "Temp Deg C" = temps,
  Forecast = outlook,
  stringsAsFactors = TRUE
)
df2

Day,Temp.Deg.C,Forecast
Sat,21,sunny
Sun,18,overcast
Mon,24,cloudy
Tue,15,rainy


In [8]:
# str() summarises the data frame: its dimensions and the type of each column.
# Text columns are reported as factors when the data frame was built with
# stringsAsFactors = TRUE (the default before R 4.0), otherwise as chr.
str(df2)

'data.frame':	4 obs. of  3 variables:
 $ Day       : Factor w/ 4 levels "Mon","Sat","Sun",..: 2 3 1 4
 $ Temp.Deg.C: num  21 18 24 15
 $ Forecast  : Factor w/ 4 levels "cloudy","overcast",..: 4 2 1 3


In [37]:
# The string handling can be overridden: stringsAsFactors = FALSE keeps the
# text columns as plain character vectors instead of factors.
df3 <- data.frame(
  Day = days,
  "Temp Deg C" = temps,
  Forecast = outlook,
  stringsAsFactors = FALSE
)
str(df3)

'data.frame':	4 obs. of  3 variables:
 $ Day       : chr  "Sat" "Sun" "Mon" "Tue"
 $ Temp.Deg.C: num  21 18 24 15
 $ Forecast  : chr  "sunny" "overcast" "cloudy" "rainy"


In [38]:
# subsetting data frames: display df3 first as the reference table
df3

Day,Temp.Deg.C,Forecast
Sat,21,sunny
Sun,18,overcast
Mon,24,cloudy
Tue,15,rainy


In [39]:
# single cell: row 3, column 2
df3[3, 2]
# the whole Day column, extracted by name with $
df3$Day
# row 3 with every column (the column index is left empty)
df3[3, ]
# column 3 with every row (the row index is left empty)
df3[, 3]
# the Day column again, selected by name inside [ , ]
df3[, "Day"]
# column 2 via a single index; the result stays a one-column data frame
df3[2]

Unnamed: 0,Day,Temp.Deg.C,Forecast
3,Mon,24,cloudy


Temp.Deg.C
21
18
24
15


In [40]:
# select several rows and columns at once:
# rows 1, 3 and 4 by position, columns Day and Forecast by name
df3[c(1, 3, 4), c("Day", "Forecast")]

Unnamed: 0,Day,Forecast
1,Sat,sunny
3,Mon,cloudy
4,Tue,rainy


In [41]:
# the same column comes back as a different type depending on the operator:
# [[ ]] and $ extract the column itself (a vector),
# while [ ] returns a one-column data frame
df3[[1]]
df3[1]
df3["Day"]
df3$Day
df3[["Day"]]

# class() makes the difference explicit
class(df3[1])
class(df3[[1]])

Day
Sat
Sun
Mon
Tue


Day
Sat
Sun
Mon
Tue


In [42]:
# extending data frames
# a column is added by assigning a vector to a new column name
humidity <- c(45, 40, 34, 55)
df3$RelativeHumidity <- humidity
df3

Day,Temp.Deg.C,Forecast,RelativeHumidity
Sat,21,sunny,45
Sun,18,overcast,40
Mon,24,cloudy,34
Tue,15,rainy,55


In [43]:
# cbind() appends columns (rbind() appends rows)
wind <- c("S", "W", "SW", "NW")
df4 <- cbind(df3, wind = wind)
df4
# cbind() returned a new data frame, so df3 itself is unchanged
df3

Day,Temp.Deg.C,Forecast,RelativeHumidity,wind
Sat,21,sunny,45,S
Sun,18,overcast,40,W
Mon,24,cloudy,34,SW
Tue,15,rainy,55,NW


Day,Temp.Deg.C,Forecast,RelativeHumidity
Sat,21,sunny,45
Sun,18,overcast,40
Mon,24,cloudy,34
Tue,15,rainy,55


In [44]:
# Adding a row: build a one-row data frame with the same columns, then rbind().
# "Temp Deg C" is rewritten to Temp.Deg.C by data.frame(), matching df3.
# stringsAsFactors = FALSE keeps Day and Forecast as character, consistent
# with how df3 was created, whatever the default of the running R version is.
newRow <- data.frame(
  Day = "Wed",
  "Temp Deg C" = 22,
  Forecast = "snowy",
  RelativeHumidity = 48,
  stringsAsFactors = FALSE
)
# note: df4 is reassigned here and now holds df3 plus the new row
df4 <- rbind(df3, newRow)
df4

Day,Temp.Deg.C,Forecast,RelativeHumidity
Sat,21,sunny,45
Sun,18,overcast,40
Mon,24,cloudy,34
Tue,15,rainy,55
Wed,22,snowy,48


In [47]:
# sorting and ranking
# sort() returns the column's values in ascending order
sort(df4$RelativeHumidity)

In [49]:
# order() returns the positions that would sort the vector in ascending order
# (a permutation usable as a row index). Despite the variable name this is not
# the rank of each value; rank() computes that.
ranks <- order(df4$RelativeHumidity)
ranks

In [50]:
# decreasing = TRUE reverses the sort direction (largest value first)
sort(df4$RelativeHumidity, decreasing = TRUE)

In [52]:
# subset rows by a condition: the logical vector goes in the row position,
# and the trailing comma (empty column index) keeps every column
df4[df4$RelativeHumidity > 40, ]

Unnamed: 0,Day,Temp.Deg.C,Forecast,RelativeHumidity
1,Sat,21,sunny,45
4,Tue,15,rainy,55
5,Wed,22,snowy,48


In [54]:
# display the whole data frame sorted by humidity, highest first:
# order() supplies the row positions, the empty column index keeps all columns
df4[order(df4$RelativeHumidity, decreasing = TRUE), ]

Unnamed: 0,Day,Temp.Deg.C,Forecast,RelativeHumidity
4,Tue,15,rainy,55
5,Wed,22,snowy,48
1,Sat,21,sunny,45
2,Sun,18,overcast,40
3,Mon,24,cloudy,34
