# Introduction to DataFrames
**Bogumił Kamiński, 2017**

In [48]:
using DataFrames # load package

## Constructors

In [49]:
DataFrame() # empty DataFrame: no columns and no rows

In [50]:
DataFrame(A=1:3, B=rand(3), C=randstring.([3,3,3])) # keyword arguments: each keyword names a column, its value is the column

Unnamed: 0,A,B,C
1,1,0.434013,Bna
2,2,0.581531,II6
3,3,0.323388,nxA


In [51]:
x = Dict("A" => [1,2], "B" => [true, false], "C" => ['a', 'b'])
DataFrame(x) # from a dictionary: keys become column names; columns will be sorted by name

Unnamed: 0,A,B,C
1,1,True,'a'
2,2,False,'b'


In [52]:
DataFrame(:A => [1,2], :B => [true, false], :C => ['a', 'b']) # from pairs of column name => column values

Unnamed: 0,A,B,C
1,1,True,'a'
2,2,False,'b'


In [53]:
DataFrame([rand(3) for i in 1:3]) # from a vector of vectors: one column per inner vector, auto-named x1, x2, ...

Unnamed: 0,x1,x2,x3
1,0.857284,0.288135,0.776641
2,0.219949,0.780355,0.381579
3,0.631626,0.972849,0.69764


In [54]:
DataFrame(rand(3)) # edge case: a vector of scalars; each element becomes its own column, giving a single row

Unnamed: 0,x1,x2,x3
1,0.882897,0.326831,0.863442


In [55]:
DataFrame(rand(3), [:A, :B, :C]) # pass a vector of Symbols as the second argument to give column names

Unnamed: 0,A,B,C
1,0.591222,0.239545,0.161103


In [56]:
DataFrame(rand(3,4)) # from a matrix: one DataFrame column per matrix column, auto-named x1, x2, ...

Unnamed: 0,x1,x2,x3,x4
1,0.150824,0.167819,0.433765,0.970116
2,0.914093,0.448354,0.509072,0.669076
3,0.240055,0.574891,0.268122,0.699241


In [57]:
DataFrame([Int, Float64, Any], [:A, :B, :C], 1) # pass column types, names and number of rows
# the Int and Float64 cells show arbitrary values: those columns look to be allocated but not initialized
# we get missing in column C because Any >: Missing

Unnamed: 0,A,B,C
1,110625264,5.46561e-316,missing


In [58]:
DataFrame([Int, Float64, String], [:A, :B, :C], 1) # the DataFrame is created, but its String value is #undef, so displaying it throws UndefRefError

UndefRefError: [91mUndefRefError: access to undefined reference[39m

In [59]:
DataFrame([Int, Float64, String], [:A, :B, :C], 0) # columns are created, but there are no rows, so there is no #undef value to display

Unnamed: 0,A,B,C


In [60]:
DataFrame(Int, 3, 5) # a quick way to create a homogeneous DataFrame (3 rows, 5 Int columns); the values shown are arbitrary

Unnamed: 0,x1,x2,x3,x4,x5
1,110625264,110625264,178772168,439436208,111804272
2,0,0,178121912,439436272,138878000
3,110641176,110641176,179724552,439436336,110626704


In [61]:
DataFrame([Int, Float64], 4) # similar, but with non-homogeneous columns: one column per listed type, 4 rows

Unnamed: 0,x1,x2
1,439442864,6.36269e-316
2,110632080,5.46595e-316
3,439442960,5.46569e-316
4,110625264,5.46591e-316


In [62]:
x = DataFrame(A = [1, 2], B = [1.0, missing], C = ["a", "b"], D = [1, "a"])
convert(Array, x) # convert DataFrame to Matrix; with mixed column types the result is an Array{Any,2}

2×4 Array{Any,2}:
 1  1.0       "a"  1   
 2   missing  "b"   "a"

## Getting basic information about a data frame

In [63]:
# example data frame used below: Int, Float64-with-missing, String and mixed (Any) columns
x = DataFrame(A = [1, 2], B = [1.0, missing], C = ["a", "b"], D = [1, "a"])

Unnamed: 0,A,B,C,D
1,1,1.0,a,1
2,2,missing,b,a


In [64]:
size(x), size(x, 1), size(x, 2) # (rows, columns) tuple, number of rows, number of columns

((2, 4), 2, 4)

In [65]:
nrow(x), ncol(x), length(x) # number of rows, number of columns; note that length(x) is the number of columns here

(2, 4, 4)

In [66]:
describe(x) # per-column summary; which statistics are printed depends on the column's element type

A
Summary Stats:
Mean:           1.500000
Minimum:        1.000000
1st Quartile:   1.250000
Median:         1.500000
3rd Quartile:   1.750000
Maximum:        2.000000
Length:         2
Type:           Int64

B
Summary Stats:
Mean:           1.000000
Minimum:        1.000000
1st Quartile:   1.000000
Median:         1.000000
3rd Quartile:   1.000000
Maximum:        1.000000
Length:         2
Type:           Union{Float64, Missings.Missing}
Number Missing: 1
% Missing:      50.000000

C
Summary Stats:
Length:         2
Type:           String
Number Unique:  2

D
Summary Stats:
Length:         2
Type:           Any
Number Unique:  2
Number Missing: 0
% Missing:      0.000000



In [67]:
showcols(x) # one line per column: name, element type, number of missings and a sample of values

2×4 DataFrames.DataFrame
│ Col # │ Name │ Eltype                           │ Missing │ Values          │
├───────┼──────┼──────────────────────────────────┼─────────┼─────────────────┤
│ 1     │ A    │ Int64                            │ 0       │ 1  …  2         │
│ 2     │ B    │ Union{Float64, Missings.Missing} │ 1       │ 1.0  …  missing │
│ 3     │ C    │ String                           │ 0       │ a  …  b         │
│ 4     │ D    │ Any                              │ 0       │ 1  …  a         │

In [68]:
names(x) # column names as a vector of Symbols

4-element Array{Symbol,1}:
 :A
 :B
 :C
 :D

In [69]:
eltypes(x) # element type of each column

4-element Array{Type,1}:
 Int64                           
 Union{Float64, Missings.Missing}
 String                          
 Any                             

In [70]:
y = DataFrame(Int, 100, 10) # larger frame (100 rows, 10 Int columns) for the head/tail examples; the values shown are arbitrary

Unnamed: 0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10
1,110641192,466319824,110641192,110641168,110641192,466986288,110641192,110641192,110641168,110641192
2,110641192,466319824,110641192,463421072,110641192,466986288,110641192,110641192,473394416,110641192
3,110641192,466320176,110641192,140141120,110641192,466986640,110641192,110641192,148243104,110641192
4,469982416,466320352,110641192,463421104,473559808,466986816,110641192,470758224,473394448,477538640
5,110641192,466320528,110641192,463421168,110641192,466986992,110641192,110641192,473394512,110641192
6,469984176,466320704,110641192,463421200,473559888,466987168,110641192,470760144,473394544,477538720
7,110641192,466321056,110641192,140141136,110641192,466987520,110641192,110641192,148243120,110641192
8,110641192,466321232,110641192,140141152,110641192,466987696,110641192,110641192,148243136,110641192
9,110641192,466320880,110641192,463421328,110641192,466987344,110641192,110641192,473394640,110641192
10,110641192,466911424,110641192,140141168,110641192,466988048,110641192,110641192,148243152,110641192


In [71]:
head(y) # first rows of y (6 rows are returned here)

Unnamed: 0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10
1,110641192,466319824,110641192,110641168,110641192,466986288,110641192,110641192,110641168,110641192
2,110641192,466319824,110641192,463421072,110641192,466986288,110641192,110641192,473394416,110641192
3,110641192,466320176,110641192,140141120,110641192,466986640,110641192,110641192,148243104,110641192
4,469982416,466320352,110641192,463421104,473559808,466986816,110641192,470758224,473394448,477538640
5,110641192,466320528,110641192,463421168,110641192,466986992,110641192,110641192,473394512,110641192
6,469984176,466320704,110641192,463421200,473559888,466987168,110641192,470760144,473394544,477538720


In [72]:
tail(y) # last rows of y (6 rows are returned here)

Unnamed: 0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10
1,473356960,466984704,110641192,140142544,475198928,474104352,110641192,473896192,148244352,477541520
2,110641192,466985408,110641192,463587728,110641192,474104880,110641192,110641192,473397200,110641192
3,473357360,466985408,110641192,463587728,475199168,474104880,110641192,473896432,473397200,477541920
4,0,471589872,471589872,0,100,9,442280784,442280784,442280784,179790488
5,0,471589904,471589904,0,100,9,442280400,442280400,442280400,179790536
6,0,471589936,471589936,0,100,9,443180880,443180880,443180880,179790584
