# Overview of Data Frame Operations

## Creating Data Frames


In [1]:
# A data frame with no rows and no columns.
empty <- data.frame()

# Column sources: the integers 1 through 10 ...
c1 <- seq_len(10)

# ... and the first ten lowercase letters.
c2 <- head(letters, 10)

# Combine the two vectors into a data frame, one named column per vector.
df <- data.frame(col.name.1 = c1, col.name.2 = c2)

In [2]:
# Show the data frame: a bare object name at top level displays its value.
df


col.name.1,col.name.2
1,a
2,b
3,c
4,d
5,e
6,f
7,g
8,h
9,i
10,j


## Importing and Exporting Data

In [None]:
# CSV import: read a comma-separated file into a data frame.
d2 <- read.csv(file = "some.file.name.csv")

# Excel import: the readxl package provides read_excel(), which reads a
# single worksheet (selected by name here) from a workbook.
# NOTE: this assigns to `df`, replacing any data frame already stored there.
library(readxl)
df <- read_excel(path = "Sample-Sales-Data.xlsx", sheet = "Sheet1")

# CSV export: write the data frame out to a CSV file.
write.csv(x = df, file = "some.file.csv")

## Getting Information about Data Frame

In [3]:
# Size of the data frame: number of rows, then number of columns.
nrow(x = df)
ncol(x = df)

In [4]:
# Character vector holding the name of every column.
colnames(x = df)

In [5]:
# Character vector of row names; if none were set explicitly this is
# just the row index.
rownames(x = df)

## Referencing Cells

In [6]:
# Rule of thumb: double brackets [[row, col]] address exactly one cell,
# while single brackets [rows, cols] select a block of several cells.

# One cell, addressed by row number and column number.
vec <- df[[5, 2]]

# Several cells (rows 1-5, columns 1-2), returned as a new data frame.
newdf <- df[seq_len(5), seq_len(2)]

# Overwrite a single cell, addressing the column by name.
df[[2, "col.name.1"]] <- 99999

In [7]:
# Show df again: row 2 of col.name.1 now holds the reassigned value 99999.
df

col.name.1,col.name.2
1,a
99999,b
3,c
4,d
5,e
6,f
7,g
8,h
9,i
10,j


## Referencing Rows

In [8]:
# The common way to pull out a row is [row, ] with the column slot left
# empty; the result is a one-row data frame.
rowdf <- df[1L, ]
rowdf

col.name.1,col.name.2
1,a


In [9]:
# To get a row as a plain vector, flatten the one-row data frame with
# unlist(); the result is a named atomic vector with one element per column.
# This row mixes a number with a string, so every value is coerced to a
# common type (character). Wrapping the row in as.numeric() instead would
# turn the non-numeric cell into NA with a coercion warning; as.numeric()
# is only appropriate when every selected column is numeric.
vrow <- unlist(df[1, ])
vrow

## Referencing Columns

In [10]:
# Work on a copy of the built-in mtcars data set for the column examples;
# most ways of referencing a single column return a vector.
# (The name `cars` masks the built-in data set of the same name.)
cars <- mtcars
head(cars, n = 6L)

Unnamed: 0,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
Mazda RX4,21.0,6,160,110,3.9,2.62,16.46,0,1,4,4
Mazda RX4 Wag,21.0,6,160,110,3.9,2.875,17.02,0,1,4,4
Datsun 710,22.8,4,108,93,3.85,2.32,18.61,1,1,4,1
Hornet 4 Drive,21.4,6,258,110,3.08,3.215,19.44,1,0,3,1
Hornet Sportabout,18.7,8,360,175,3.15,3.44,17.02,0,0,3,2
Valiant,18.1,6,225,105,2.76,3.46,20.22,1,0,3,1


In [11]:
# Four equivalent ways to extract one column as a vector.

# 1. Dollar sign with the literal column name.
colv1 <- cars$mpg
colv1

# 2. Single brackets, all rows, column chosen by name.
colv2 <- cars[, "mpg"]
colv2

# 3. Single brackets, all rows, column chosen by position
#    (the column selector may be an integer index or a name string).
colv3 <- cars[, 1L]
colv3

# 4. Double brackets with the column name.
colv4 <- cars[["mpg"]]
colv4

In [13]:
# Single brackets with only one index (no comma) keep the data frame
# structure, so each of these yields a one-column data frame.

# Column chosen by name.
mpgdf <- cars["mpg"]
head(mpgdf)

# Column chosen by position.
mpgdf2 <- cars[1L]
head(mpgdf2)

Unnamed: 0,mpg
Mazda RX4,21.0
Mazda RX4 Wag,21.0
Datsun 710,22.8
Hornet 4 Drive,21.4
Hornet Sportabout,18.7
Valiant,18.1


Unnamed: 0,mpg
Mazda RX4,21.0
Mazda RX4 Wag,21.0
Datsun 710,22.8
Hornet 4 Drive,21.4
Hornet Sportabout,18.7
Valiant,18.1


## Adding Rows

In [14]:
# Both arguments handed to rbind() below are data frames, so the new row
# is first built as a one-row data frame with matching column names.
df2 <- data.frame(col.name.1 = 2000, col.name.2 = "new")
df2

# Bind the new row onto the bottom of df.
dfnew <- rbind(df, df2)

col.name.1,col.name.2
2000,new


In [15]:
# Show the combined data frame produced by rbind().
dfnew

col.name.1,col.name.2
1,a
99999,b
3,c
4,d
5,e
6,f
7,g
8,h
9,i
10,j
2000,new


## Adding Columns

In [16]:
# Add a column filled entirely with NA, one value per existing row.
df$newcol <- rep(NA, times = nrow(df))
df

col.name.1,col.name.2,newcol
1,a,
99999,b,
3,c,
4,d,
5,e,
6,f,
7,g,
8,h,
9,i,
10,j,


In [17]:
# Duplicate an existing column under a new name.
df[, "copy.of.col2"] <- df[["col.name.2"]]
df

col.name.1,col.name.2,newcol,copy.of.col2
1,a,,a
99999,b,,b
3,c,,c
4,d,,d
5,e,,e
6,f,,f
7,g,,g
8,h,,h
9,i,,i
10,j,,j


In [18]:
# A new column can be computed from an existing one; the arithmetic is
# applied to the whole column at once.
df[["col1.times.2"]] <- 2 * df$col.name.1
df

col.name.1,col.name.2,newcol,copy.of.col2,col1.times.2
1,a,,a,2
99999,b,,b,199998
3,c,,c,6
4,d,,d,8
5,e,,e,10
6,f,,f,12
7,g,,g,14
8,h,,h,16
9,i,,i,18
10,j,,j,20


In [19]:
# cbind() appends a vector as a new column on the right of df.
# Because the vector is passed unnamed, the new column is labelled with the
# text of the expression itself (it shows up as "df$col.name.1" below).
df3 <- cbind(df, df$col.name.1)
df3

col.name.1,col.name.2,newcol,copy.of.col2,col1.times.2,df$col.name.1
1,a,,a,2,1
99999,b,,b,199998,99999
3,c,,c,6,3
4,d,,d,8,4
5,e,,e,10,5
6,f,,f,12,6
7,g,,g,14,7
8,h,,h,16,8
9,i,,i,18,9
10,j,,j,20,10


## Setting Column Names

In [20]:
# Rename one column by position (here the second one).
colnames(df)[2L] <- "SECOND COLUMN NEW NAME"
df

# Rename every column in one step by assigning a full character vector.
colnames(df) <- c(
  "col.name.1",
  "col.name.2",
  "newcol",
  "copy.of.col2",
  "col1.times.2"
)
df

col.name.1,SECOND COLUMN NEW NAME,newcol,copy.of.col2,col1.times.2
1,a,,a,2
99999,b,,b,199998
3,c,,c,6
4,d,,d,8
5,e,,e,10
6,f,,f,12
7,g,,g,14
8,h,,h,16
9,i,,i,18
10,j,,j,20


col.name.1,col.name.2,newcol,copy.of.col2,col1.times.2
1,a,,a,2
99999,b,,b,199998
3,c,,c,6
4,d,,d,8
5,e,,e,10
6,f,,f,12
7,g,,g,14
8,h,,h,16
9,i,,i,18
10,j,,j,20


## Selecting Multiple Rows

In [21]:
# Rows 1 through 10 with every column; for this ten-row df the result
# matches head(df, 10).
first.ten.rows <- df[seq_len(10), ]
first.ten.rows

col.name.1,col.name.2,newcol,copy.of.col2,col1.times.2
1,a,,a,2
99999,b,,b,199998
3,c,,c,6
4,d,,d,8
5,e,,e,10
6,f,,f,12
7,g,,g,14
8,h,,h,16
9,i,,i,18
10,j,,j,20


In [22]:
# A negative row index excludes that row and keeps all the others.
everything.but.row.two <- df[-2L, ]
everything.but.row.two

Unnamed: 0,col.name.1,col.name.2,newcol,copy.of.col2,col1.times.2
1,1,a,,a,2
3,3,c,,c,6
4,4,d,,d,8
5,5,e,,e,10
6,6,f,,f,12
7,7,g,,g,14
8,8,h,,h,16
9,9,i,,i,18
10,10,j,,j,20


In [23]:
# Conditional selection, two ways.

# 1. Logical mask in the row slot: keep rows where both conditions hold.
sub1 <- df[df$col.name.1 > 8 & df$col1.times.2 > 10, ]
sub1

# 2. subset(): same condition, with column names written bare.
sub2 <- subset(df, subset = col.name.1 > 8 & col1.times.2 > 10)
sub2

Unnamed: 0,col.name.1,col.name.2,newcol,copy.of.col2,col1.times.2
2,99999,b,,b,199998
9,9,i,,i,18
10,10,j,,j,20


Unnamed: 0,col.name.1,col.name.2,newcol,copy.of.col2,col1.times.2
2,99999,b,,b,199998
9,9,i,,i,18
10,10,j,,j,20


## Selecting Multiple Columns

In [24]:
# Select columns 1, 2 and 3 by position (all rows).
df[, 1:3]

col.name.1,col.name.2,newcol
1,a,
99999,b,
3,c,
4,d,
5,e,
6,f,
7,g,
8,h,
9,i,
10,j,


In [25]:
# Select columns by name (all rows).
df[, c("col.name.1", "col1.times.2")]

col.name.1,col1.times.2
1,2
99999,199998
3,6
4,8
5,10
6,12
7,14
8,16
9,18
10,20


In [26]:
# A negative column index drops that column: everything except the first.
df[, -1L]

col.name.2,newcol,copy.of.col2,col1.times.2
a,,a,2
b,,b,199998
c,,c,6
d,,d,8
e,,e,10
f,,f,12
g,,g,14
h,,h,16
i,,i,18
j,,j,20


In [27]:
# Several negative indices drop several columns: here columns 1 and 3.
df[, c(-1, -3)]


col.name.2,copy.of.col2,col1.times.2
a,a,2
b,b,199998
c,c,6
d,d,8
e,e,10
f,f,12
g,g,14
h,h,16
i,i,18
j,j,20


## Dealing with Missing Data

In [28]:
# TRUE if at least one cell anywhere in the data frame is NA.
anyNA(df)

In [29]:
# TRUE if at least one value in this single column is NA.
anyNA(df$col.name.1)

In [30]:
# Delete the rows whose value in a chosen column is missing.
# The column must be named in full: `$` does partial matching, and an
# abbreviation such as `df$col` matches several columns here, which makes `$`
# return NULL and the filter silently drop every row.
# Substitute the column you want to check for col.name.1.
df <- df[!is.na(df$col.name.1), ]

In [31]:
# Swap every NA in the data frame for 0; is.na(df) marks the NA cells
# across all columns at once.
df <- replace(df, is.na(df), 0)

In [32]:
# Replace the NAs in one selected column only.
# Name the column in full: an abbreviation such as `df$col` is ambiguous in
# this data frame, so `$` would return NULL instead of a column.
# Substitute the column you want to fill for col.name.1.
df$col.name.1[is.na(df$col.name.1)] <- 999