# Introduction to DataFrames
**Bogumił Kamiński, 2017**

In [2]:
using DataFrames # load package

## Constructors

In [3]:
DataFrame() # construct an empty DataFrame

In [4]:
DataFrame(A=1:3, B=rand(3), C=randstring.([3,3,3])) # keyword arguments: each keyword becomes a column name

Unnamed: 0,A,B,C
1,1,0.936671,LVC
2,2,0.941741,XyM
3,3,0.914977,Ujd


In [5]:
# dictionary mapping column names to column vectors
x = Dict("A" => [1,2], "B" => [true, false], "C" => ['a', 'b'])
DataFrame(x) # from a dictionary; columns will be sorted

Unnamed: 0,A,B,C
1,1,True,'a'
2,2,False,'b'


In [6]:
DataFrame(:A => [1,2], :B => [true, false], :C => ['a', 'b']) # from column name => values pairs

Unnamed: 0,A,B,C
1,1,True,'a'
2,2,False,'b'


In [7]:
DataFrame([rand(3) for i in 1:3]) # from a vector of vectors; columns get default names x1, x2, x3

Unnamed: 0,x1,x2,x3
1,0.00322929,0.0838389,0.747746
2,0.872693,0.906353,0.979596
3,0.261985,0.47891,0.843912


In [8]:
DataFrame(rand(3)) # edge case: a vector of atoms gives a single row with one column per element

Unnamed: 0,x1,x2,x3
1,0.15738,0.132175,0.583478


In [9]:
DataFrame(rand(3), [:A, :B, :C]) # pass a second argument to set the column names

Unnamed: 0,A,B,C
1,0.569573,0.487466,0.461863


In [10]:
DataFrame(rand(3,4)) # from a matrix; columns get default names x1, ..., x4

Unnamed: 0,x1,x2,x3,x4
1,0.261217,0.484157,0.721347,0.718433
2,0.925508,0.0346365,0.679817,0.46339
3,0.0781544,0.486668,0.522358,0.774114


In [11]:
DataFrame([Int, Float64, Any], [:A, :B, :C], 1) # pass column types, names and number of rows
# column C holds missing because Any >: Missing

Unnamed: 0,A,B,C
1,17,,missing


In [12]:
DataFrame([Int, Float64, String], [:A, :B, :C], 1) # the DataFrame is created OK, but the String value is #undef, so displaying it throws

UndefRefError: access to undefined reference

In [13]:
DataFrame([Int, Float64, String], [:A, :B, :C], 0) # the columns are created, but there are no rows

Unnamed: 0,A,B,C


In [14]:
DataFrame(Int, 3, 5) # a quick way to create a homogeneous DataFrame (the values shown are arbitrary, presumably uninitialized)

Unnamed: 0,x1,x2,x3,x4,x5
1,147437264,147437264,128468752,147437264,109259368
2,116621520,116621520,171576752,116621520,109264904
3,121016992,121017024,116622736,121017088,109260056


In [15]:
DataFrame([Int, Float64], 4) # similar, but with non-homogeneous column types

Unnamed: 0,x1,x2
1,109259368,8.44305e-316
2,109264904,5.77805e-316
3,109261144,2.02364e-315
4,116735696,0.0


In [16]:
x = DataFrame(A = [1, 2], B = [1.0, missing], C = ["a", "b"], D = [1, "a"])
convert(Array, x) # convert the DataFrame to a Matrix (here a 2×4 Array{Any,2})

2×4 Array{Any,2}:
 1  1.0       "a"  1   
 2   missing  "b"   "a"

## Getting basic information about a data frame

In [17]:
# example data frame with four columns of different element types, used in the cells below
x = DataFrame(A = [1, 2], B = [1.0, missing], C = ["a", "b"], D = [1, "a"])

Unnamed: 0,A,B,C,D
1,1,1.0,a,1
2,2,missing,b,a


In [18]:
size(x), size(x, 1), size(x, 2) # (rows, columns) tuple, number of rows, number of columns

((2, 4), 2, 4)

In [19]:
nrow(x), ncol(x), length(x) # note that here length(x) equals the number of columns

(2, 4, 4)

In [20]:
describe(x) # print summary statistics for each column

A
Summary Stats:
Mean:           1.500000
Minimum:        1.000000
1st Quartile:   1.250000
Median:         1.500000
3rd Quartile:   1.750000
Maximum:        2.000000
Length:         2
Type:           Int64

B
Summary Stats:
Mean:           1.000000
Minimum:        1.000000
1st Quartile:   1.000000
Median:         1.000000
3rd Quartile:   1.000000
Maximum:        1.000000
Length:         2
Type:           Union{Float64, Missings.Missing}
Number Missing: 1
% Missing:      50.000000

C
Summary Stats:
Length:         2
Type:           String
Number Unique:  2

D
Summary Stats:
Length:         2
Type:           Any
Number Unique:  2
Number Missing: 0
% Missing:      0.000000



In [21]:
showcols(x) # list each column's name, element type, missing count and sample values

2×4 DataFrames.DataFrame
│ Col # │ Name │ Eltype                           │ Missing │ Values          │
├───────┼──────┼──────────────────────────────────┼─────────┼─────────────────┤
│ 1     │ A    │ Int64                            │ 0       │ 1  …  2         │
│ 2     │ B    │ Union{Float64, Missings.Missing} │ 1       │ 1.0  …  missing │
│ 3     │ C    │ String                           │ 0       │ a  …  b         │
│ 4     │ D    │ Any                              │ 0       │ 1  …  a         │

In [22]:
names(x) # column names as a vector of Symbols

4-element Array{Symbol,1}:
 :A
 :B
 :C
 :D

In [23]:
eltypes(x) # element type of each column

4-element Array{Type,1}:
 Int64                           
 Union{Float64, Missings.Missing}
 String                          
 Any                             

In [27]:
y = DataFrame(rand(1:10, 20, 10)) # built from a 20×10 matrix of random integers drawn from 1:10

Unnamed: 0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10
1,1,10,3,7,9,2,7,8,8,3
2,4,4,7,6,1,4,10,6,7,4
3,1,1,1,10,8,5,3,3,8,10
4,2,8,9,2,1,4,5,8,5,9
5,1,1,4,7,5,4,7,6,4,10
6,8,3,8,4,3,6,3,7,1,3
7,1,4,10,10,9,8,7,6,8,5
8,8,4,7,6,2,1,1,10,10,5
9,8,6,3,8,10,6,3,2,3,1
10,6,7,3,10,1,3,3,10,10,3


In [28]:
head(y) # first rows of y (6 rows are returned here)

Unnamed: 0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10
1,1,10,3,7,9,2,7,8,8,3
2,4,4,7,6,1,4,10,6,7,4
3,1,1,1,10,8,5,3,3,8,10
4,2,8,9,2,1,4,5,8,5,9
5,1,1,4,7,5,4,7,6,4,10
6,8,3,8,4,3,6,3,7,1,3


In [31]:
tail(y, 3) # the 3 trailing rows of y

Unnamed: 0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10
1,5,7,1,9,4,10,10,6,9,4
2,10,6,7,1,6,3,6,10,4,10
3,1,4,3,4,8,1,4,6,5,4


## Handling missing values

In [32]:
missing, typeof(missing) # singleton type

(missing, Missings.Missing)

In [34]:
x = [1, 2, missing, 3] # array literals automatically get an appropriate Union element type

4-element Array{Union{Int64, Missings.Missing},1}:
 1       
 2       
  missing
 3       

In [36]:
ismissing(1), ismissing(missing), ismissing(x), ismissing.(x) # check whether a value is missing; use the dotted form to test each element of an array

(false, true, false, Bool[false, false, true, false])

In [37]:
eltype(x), Missings.T(eltype(x)) # extract the type that is combined with Missing in the Union (useful for arrays)

(Union{Int64, Missings.Missing}, Int64)

In [43]:
missing == missing, missing != missing, missing < missing # comparisons between missing values produce missing

(missing, missing, missing)

In [50]:
1 == missing, 1 != missing, 1 < missing # the same when comparing missing with values of other types

(missing, missing, missing)

In [51]:
isequal(missing, missing), missing === missing, isequal(1, missing), isless(1, missing) # these produce a Bool result

(true, true, false, true)

In [54]:
map(x -> x(missing), [sin, cos, zero, sqrt]) # many (but not all) functions accept missing and return missing

4-element Array{Missings.Missing,1}:
 missing
 missing
 missing
 missing

In [56]:
map(x -> x(missing, 1), [+, - , *, /, div]) # part 2: two-argument arithmetic functions also return missing

5-element Array{Missings.Missing,1}:
 missing
 missing
 missing
 missing
 missing

In [68]:
map(x -> x([1,2,missing]), [minimum, maximum, extrema, mean, any, float]) # part 3: functions applied to a vector that contains missing

6-element Array{Any,1}:
 missing                                            
 missing                                            
 (missing, missing)                                 
 missing                                            
 missing                                            
 Union{Float64, Missings.Missing}[1.0, 2.0, missing]

In [63]:
collect(skipmissing([1, missing, 2, missing])) # skipmissing returns an iterator that skips missing values

2-element Array{Int64,1}:
 1
 2

In [66]:
collect(Missings.replace([1.0, missing, 2.0, missing], NaN)) # the same, but replacing missing values (here with NaN) instead of skipping them

4-element Array{Float64,1}:
   1.0
 NaN  
   2.0
 NaN  

In [67]:
unique([1, missing, 2, missing]), levels([1, missing, 2, missing]) # unique values: unique keeps missing, levels drops it

(Any[1, missing, 2], [1, 2])