# Data

In [None]:
using BenchmarkTools
using DataFrames
using DelimitedFiles
using CSV
using XLSX
using Downloads

In [None]:
# Download the programming-languages CSV into the working directory.
# `P` holds the return value of `Downloads.download` only temporarily: it is
# rebound to the parsed data when the file is read below.
P = Downloads.download("https://raw.githubusercontent.com/nassarhuda/easy_data/master/programming_languages.csv",
    "programming_languages.csv")

In [None]:
# Parse the file with DelimitedFiles using ',' as the delimiter. With
# `header=true` the result is a pair: the data cells (`P`) and the header row (`H`).
P,H = readdlm("programming_languages.csv",',';header=true);

In [None]:
H

In [None]:
# Read the same file with CSV.jl, materializing the result as a DataFrame.
C = CSV.read("programming_languages.csv", DataFrame);

In [None]:
# Print the type of the object returned by CSV.read and preview its first ten rows.
@show typeof(C)
C[1:10,:]
# A single column can be accessed with C[!,:year].

In [None]:
@show typeof(P)
P[1:10,:]

In [None]:
names(C)

In [None]:
# Column names, the two individual columns, and a per-column summary.
# A notebook cell only displays the value of its last expression (`describe(C)`).
names(C)
C.year
C.language
describe(C)

In [None]:
# Time both readers on the same file with BenchmarkTools' @btime.

@btime P,H = readdlm("programming_languages.csv",',';header=true);
@btime C = CSV.read("programming_languages.csv", DataFrame);

In [None]:
# Wrap the matrix in a DataFrame (`:auto` requests generated column names)
# and write it out as a CSV file.
CSV.write("programminglanguages_CSV.csv", DataFrame(P, :auto))

In [None]:
# Read a rectangular cell range from one sheet of an XLSX workbook.

# General form: XLSX.readdata("File Name.xlsx", "Sheet Name", "Cell Range")

T = XLSX.readdata("data/zillow_data_download_april2020.xlsx","Sale_counts_city","A1:F9")

In [None]:
# Read a whole worksheet as a table.
# NOTE(review): the following cells index the result as G[1] and G[2]; newer
# XLSX.jl releases may return a table object that is not indexable this way —
# confirm against the installed version.

G = XLSX.readtable("data/zillow_data_download_april2020.xlsx","Sale_counts_city");

In [None]:
G[1]

In [None]:
G[1][1][1:10]

In [None]:
G[2][1:10]

In [None]:
D = DataFrame(G...) # equivalent to DataFrame(G[1],G[2])

In [None]:
# Two small tables sharing the `item` column, used to demonstrate a join.
foods = ["apple", "cucumber", "tomato", "banana"]
calories = [105, 47, 22, 105]
prices = [0.85, 1.6, 0.8, 0.6]
dataframe_calories = DataFrame(:item => foods, :calories => calories)
dataframe_prices = DataFrame(:item => foods, :price => prices)

In [None]:
DF = innerjoin(dataframe_calories,dataframe_prices,on=:item)

In [None]:
DataFrame(T, :auto)

In [None]:
# Write the table read above to a new workbook.
# NOTE(review): presumably G[1] holds the column data and G[2] the column labels,
# as suggested by DataFrame(G[1],G[2]) earlier — confirm.
# Starting from a DataFrame `df` instead:
# XLSX.writetable("filename.xlsx", collect(DataFrames.eachcol(df)), DataFrames.names(df))
XLSX.writetable("writefile_using_XLSX.xlsx",G[1],G[2])

In [None]:
# Load the contents of a JLD file, then save the loaded object to a new file
# under the variable name "A".
using JLD
jld_data = JLD.load("data/mytempdata.jld")
save("mywrite.jld", "A", jld_data)

In [None]:
# Read an .npz file and write the loaded data back out to a new file.

using NPZ
npz_data = npzread("data/mytempdata.npz")
npzwrite("mywrite.npz", npz_data)

In [None]:
# Read a MATLAB .mat file and write the loaded data back out to a new file.

using MAT
Matlab_data = matread("data/mytempdata.mat")
matwrite("mywrite.mat",Matlab_data)

In [None]:
# Print the type of the object returned by each of the three loaders.
# The lone `;` at the end suppresses the cell's own output.

@show typeof(jld_data)
@show typeof(npz_data)
@show typeof(Matlab_data)
;

In [None]:
Matlab_data

In [None]:
P

In [None]:
# Q1: In which year was a given language invented?
# Matrix version: column 2 holds the language names, column 1 the years.
# Returns the year of the first row whose name equals `language`.
function year_created(P, language::String)
    row = findfirst(==(language), P[:, 2])
    return P[row, 1]
end
year_created(P,"Julia")

In [None]:
year_created(P,"W") 

In [None]:
# Same lookup as year_created, but an unknown language produces an explicit
# error message instead of failing on the index operation.
function year_created_handle_error(P, language::String)
    row = findfirst(==(language), P[:, 2])
    if isnothing(row)
        error("Error: Language not found.")
    end
    return P[row, 1]
end
year_created_handle_error(P,"W")

In [None]:
# Q2: How many languages were created in a given year?
# Matrix version: counts the entries of column 1 that equal `year`.
function how_many_per_year(P, year::Int64)
    return count(==(year), P[:, 1])
end
how_many_per_year(P,2011)

In [None]:
# Reuse the DataFrame produced by CSV.read. It could also be built from the matrix:
#   DataFrame(year = P[:,1], language = P[:,2])   # or DataFrame(P, :auto)
P_df = C

In [None]:
# Q1: In which year was a given language invented?
# DataFrame version: columns are addressed by name, so there is no need to
# remember their positions.
function year_created(P_df, language::String)
    row = findfirst(==(language), P_df.language)
    return P_df.year[row]
end
year_created(P_df,"Julia")

In [None]:
year_created(P_df,"W")

In [None]:
# DataFrame version of the lookup with an explicit error for unknown languages.
function year_created_handle_error(P_df, language::String)
    row = findfirst(==(language), P_df.language)
    if isnothing(row)
        error("Error: Language not found.")
    end
    return P_df.year[row]
end
year_created_handle_error(P_df,"W")

In [None]:
# Q2: How many languages were created in a given year?
# DataFrame version: counts the entries of the `year` column that equal `year`.
function how_many_per_year(P_df, year::Int64)
    return count(==(year), P_df.year)
end
how_many_per_year(P_df,2011)

In [None]:
# A quick example to show how to build a dictionary
Dict([("A", 1), ("B", 2),(1,[1,2])])

In [None]:
# Empty dictionary with Integer keys and Vector{String} values; it is filled
# below with year => language names.
P_dictionary = Dict{Integer,Vector{String}}()

In [None]:
#P_dictionary[67] = ["julia","programming"]

In [None]:
# This would not work: P_dictionary only accepts Integer keys and Vector{String} values.
#P_dictionary["julia"] = 7

In [None]:
# Group the language names by year, one row of P at a time
# (column 1 is the year, column 2 the language name).
dict = Dict{Integer,Vector{String}}()
for (year, lang) in eachrow(P)
    # get! returns the vector stored under `year`, inserting an empty one the
    # first time the year is seen, so no separate key-existence check is needed.
    # Note that push! is not our favorite thing to do in Julia,
    # but we're focusing on correctness rather than speed here.
    push!(get!(Vector{String}, dict, year), lang)
end

In [None]:

# A smarter way when rows with the same year sit next to each other:
# walk the rows once and only start a new entry when the year changes.
curyear = P_df.year[1]
P_dictionary[curyear] = [P_df.language[1]]
for (nextyear, lang) in Iterators.drop(zip(P_df.year, P_df.language), 1)
    if nextyear != curyear
        # a new year begins: open a fresh entry for it
        curyear = nextyear
        P_dictionary[curyear] = [lang]
    else
        # same key as the previous row
        # note that push! is not our favorite thing to do in Julia,
        # but we're focusing on correctness rather than speed here
        push!(P_dictionary[curyear], lang)
    end
end

In [None]:
length(keys(P_dictionary))

In [None]:
length(unique(P[:,1]))

In [None]:

# Q1: In which year was a given language invented?
# Dictionary version: instead of looking in one long vector, look in the many
# small per-year vectors and return the key of the first one containing `language`.
# Throws an ArgumentError when no entry contains `language`.
function year_created(P_dictionary, language::String)
    for (year, languages) in P_dictionary
        # stop at the first hit instead of scanning every remaining year
        language in languages && return year
    end
    throw(ArgumentError("language not found: $language"))
end
year_created(P_dictionary,"Julia")

In [None]:
# Q2: How many languages were created in a given year?
# Dictionary version: the answer is the length of the vector stored under `year`.
# A year with no entry counts as zero languages (the matrix and DataFrame
# versions also return 0 in that case) rather than raising a KeyError.
how_many_per_year(P_dictionary,year::Int64) = length(get(P_dictionary, year, ()))
how_many_per_year(P_dictionary,2011)

In [None]:
# Simulate missing data: overwrite the first year with `missing`, then rebuild
# the DataFrame from the two matrix columns.
P[1,1] = missing
P_df = DataFrame(year = P[:,1], language = P[:,2])

In [None]:
dropmissing(P_df)

# Finally...
### After finishing this notebook, you should be able to:

##### [ ] download a data file from the web given a url
##### [ ] load data from a text file via DelimitedFiles or CSV
##### [ ] write your data to a text file or csv file
##### [ ] load data from file types xlsx, jld, npz, mat, rda
##### [ ] write your data to an xlsx file, jld, npz, mat, rda
##### [ ] store data in a 2D array (Matrix), or DataFrame or Dict
##### [ ] write functions to perform basic lookups on Matrix, DataFrame, and Dict types
##### [ ] use some of the basic functions on DataFrames such as: dropmissing, describe, by, and join

# Lineer Cebir

A lot of the Data Science methods we will see in this tutorial require some understanding of linear algebra, and in this notebook we will focus on how Julia handles matrices, the types that exist, and how to call basic linear algebra tasks.

/////////////////////////////

Bu öğreticide göreceğimiz Veri Bilimi yöntemlerinin çoğu, lineer cebir hakkında biraz bilgi sahibi olmayı gerektirir ve bu not defterinde Julia'nın matrisleri nasıl ele aldığına, var olan türlere ve temel lineer cebir görevlerinin nasıl çağrılacağına odaklanacağız.

In [None]:
using LinearAlgebra
using SparseArrays
using Images
using MAT

Create a random matrix and multiply it by its transpose, which makes the result symmetric.

In [None]:
# Start from a random 10×10 matrix and replace it with the product A*A',
# which is symmetric. `Atranspose` keeps referring to the original random matrix.
A = rand(10,10);
Atranspose = A'
A = A*Atranspose;

In [None]:
# Linear indexing walks down the columns first, so in this 10×10 matrix
# linear index 11 is the element in row 1, column 2.
@show A[11] == A[1,2];

In [None]:
# Solve the linear system A*x = b with the backslash operator and print the
# norm of the residual. The lone `;` suppresses the cell's own output.
b = rand(10);
x = A\b;
@show norm(A*x-b)
;

In [None]:
@show typeof(A)
@show typeof(b)
@show typeof(rand(1,10))
@show typeof(Atranspose)
;

In [None]:
Matrix{Float64} == Array{Float64,2}

In [None]:
Vector{Float64} == Array{Float64,1}

In [None]:
Atranspose

In [None]:
#?adjoint

In [None]:
luA = lu(A)

In [None]:
qrA = qr(A)

In [None]:
norm(qrA.Q*qrA.R - A)

In [None]:
isposdef(A)

In [None]:
cholA = cholesky(A)

In [None]:
norm(cholA.L*cholA.U - A)

In [None]:
cholA.L

In [None]:
cholA.U

In [None]:
factorize(A)

In [None]:
#?factorize

In [None]:
Diagonal([1,2,3])

In [None]:
I(3)

In [None]:
# SparseArrays is already loaded at the top of this section; the repeated
# `using` is harmless.
# Random 5×5 sparse matrix; the third argument is the probability that any
# given entry is nonzero.
using SparseArrays
S = sprand(5,5,2/5)

In [None]:
S.rowval

In [None]:
Matrix(S)

In [None]:
S.colptr

In [None]:
S.m

In [None]:
X1 = load("data/khiam-small.jpg")

In [None]:
@show typeof(X1)
X1[1,1] # this is pixel [1,1]

In [None]:
Xgray = Gray.(X1)

In [None]:
# Split the image into its red, green and blue layers as Float64 matrices.
# Mapping over the image itself keeps its shape, so no index loop over
# 1:length(X1) and no reshape is needed.
R = map(px -> Float64(px.r), X1)

G = map(px -> Float64(px.g), X1)

B = map(px -> Float64(px.b), X1)
;

In [None]:
# All-zero matrix with the same size as the image; using it for the red and
# blue channels leaves only the green layer visible.
Z = zeros(size(R))
RGB.(Z, G, Z)

In [None]:
Xgrayvalues = Float64.(Xgray)

In [None]:
SVD_V = svd(Xgrayvalues)

In [None]:
norm(SVD_V.U*diagm(SVD_V.S)*SVD_V.V' - Xgrayvalues)

In [None]:
# Use the top 4 singular vectors/values to form a new image: the sum, in order,
# of the first four rank-one terms S[i] * U[:,i] * V[:,i]'.
img1 = sum(SVD_V.S[i]*SVD_V.U[:,i]*SVD_V.V[:,i]' for i in 1:4)

In [None]:
Gray.(img1)

In [None]:
# Rank-100 approximation: keep the first 100 columns of U and V and the first
# 100 singular values, which spdiagm places on the main diagonal of a sparse matrix.
i = 1:100
u1 = SVD_V.U[:,i]
v1 = SVD_V.V[:,i]
img1 = u1*spdiagm(0=>SVD_V.S[i])*v1'
Gray.(img1)

In [None]:
norm(Xgrayvalues-img1)

In [None]:
M = matread("data/face_recog_qr.mat")

In [None]:
q = reshape(M["V2"][:,1],192,168)
Gray.(q)

In [None]:
b = q[:]

In [None]:
# Use every column of V2 except the first (the first column was reshaped into
# the query image `q` above) as the system matrix and solve A*x = b.
# NOTE(review): presumably A has more rows than columns, in which case `\`
# returns a least-squares solution — confirm against the data file.
A = M["V2"][:,2:end]
x = A\b #Ax=b
Gray.(reshape(A*x,192,168))

In [None]:
norm(A*x-b)

In [None]:
# Add uniform random noise (scaled by 0.5) to the query image, then rescale
# so that the largest value becomes 1.
qv = q + 0.5 * rand(size(q)...)
qv = qv ./ maximum(qv)
Gray.(qv)

In [None]:
b = qv[:];

In [None]:
x = A\b
norm(A*x-b)

In [None]:
Gray.(reshape(A*x,192,168))

# Finally...
### After finishing this notebook, you should be able to:

##### [ ] reshape and vectorize a matrix
##### [ ] apply basic linear algebra operations such as transpose, matrix-matrix product, and solve a linear system
##### [ ] call a linear algebra factorization on your matrix
##### [ ] use SVD to create a compressed version of an image
##### [ ] solve the face recognition problem via a least-squares approach
##### [ ] create a sparse matrix, and call the components of the Compressed Sparse Column storage
##### [ ] list a few types of matrices Julia uses (diagonal, upper triangular,...)
##### [ ] (unrelated to linear algebra): load an image, convert it to grayscale, and extract the RGB layers