# Handling data with python

## 1. Simple Variables

Basic data types:

* Integers
* Floats
* Strings
* Booleans

### 1.1 Numbers

In [None]:
# An integer literal is written without a decimal point
one = 1

print(one)
print(f'This variable is a {type(one)}')

In [None]:
# A decimal point in the literal makes the value a float
one = 1.0

print(one)
print(f'This variable is a {type(one)}')

### 1.2 Strings

In [None]:
# Strings are written between quotes (single or double)
lapis = "LAPIS"

print(lapis)

In [None]:
# Indexing strings
# Indices start at 0, so [1] selects the second character

lapis[1]

In [None]:
# A single index picks one character; a slice [start:stop] leaves out the
# character at `stop`; negative indices count from the end
print('[0] => ' + 'LAPIS'[0])
print('[0:1] => ' + 'LAPIS'[0:1])
print('[0:2] => ' + 'LAPIS'[0:2])
print('[-1] => ' + 'LAPIS'[-1])

In [None]:
# You can use the % operator to format strings
# %0.3f renders the number as a float with three digits after the decimal point

print('This is a float with 3 decimals: %0.3f' % 13.7681209)

In [None]:
# You can use unicode characters:
# '\u2665' is the escape sequence for the heart-suit character; %s inserts it as a string

print('I %s LAPIS' % '\u2665')

In [None]:
# You can see the size of the string: len() returns its number of characters
len(lapis)

### 1.3 Booleans

In [None]:
# Booleans are the results of logical expressions

# isinstance() is the idiomatic way to test the type of a value
# (it also returns True for subclasses of the given type)
print(isinstance(lapis, str))
print(isinstance(lapis, float))

In [None]:
# You can also create a boolean variable

a_boolean = True

# `is` checks identity against the True / False singletons; one value per line
print(a_boolean, a_boolean is True, a_boolean is False, sep='\n')

### 1.4 Converting between variable types

In [None]:
# You can convert between types by calling the target type on a value

print(float(a_boolean))  # boolean -> float
print(int(13.67))  # float -> integer: int() drops the fractional part
print(str(a_boolean))  # boolean -> string

### 1.5 Operations

In [None]:
# Basic math

two = one + one  # addition
zero = one - one  # subtraction
two = 2 * one  # multiplication (same value as the sum above)
four = 2 ** 2  # exponentiation
half = 1 / 2  # division

print(f'{two} is {one} + {one}')
print(f'{zero} is {one} - {one}')
print(f'{two} is 2 * {one}')
print(f'{four} is 2 ** 2')
print(f'{half} is {one} / 2')

In [None]:
# Boolean operations are amazing

# Is the variable one equal to 1?
print(one == 1)

# The expression reads: one is larger or equal to 1, or (one/two > 0 and a_boolean is true)
# `or` / `and` are the logical operators: they short-circuit, whereas `|` / `&`
# are bitwise operators that always evaluate both sides and bind more tightly
# than comparisons.
print((one >= 1) or ((one / two > 0) and a_boolean))

In [None]:
# Things are different for strings: + joins (concatenates) them instead of adding

'A String' + ' Another String'

## 2. Less simple variables

Python allows you to organize variables in lots of ways. Out of the box, the possibilities are:

* Lists
* Tuples
* Dictionaries

### 2.1 Lists and tuples

In [None]:
# Tuples and lists are the most basic containers

a_tuple = 'a', 1, 1.0  # the parentheses around a tuple are optional
a_list = list(a_tuple)  # a list holding the same three elements

print(a_tuple)
print(a_list)

In [None]:
# Tuples and lists can be nested to build multidimensional containers,
# in any combination: lists of tuples, tuples of lists, ...

a_list_of_lists = [
    [1, 0, 0],
    [0, 1, 0],
    [0, 0, 1],
]
a_list_of_tuples = [
    (1, 0, 0),
    (0, 1, 0),
    (0, 0, 1),
]
a_tuple_of_lists = (
    [1, 0, 0],
    [0, 1, 0],
    [0, 0, 1],
)
a_tuple_of_tuples = (
    (1, 0, 0),
    (0, 1, 0),
    (0, 0, 1),
)

print(a_list_of_lists)
print(a_list_of_tuples)
print(a_tuple_of_lists)
print(a_tuple_of_tuples)

In [None]:
# The same operators work on lists and on tuples:
# * repeats the sequence and + concatenates two sequences of the same kind

print('Doubles the tuple:', a_tuple * 2)
print('Doubles the list:', a_list * 2)
print('Concatenates tuples:', a_tuple + (2, 1))
print('Concatenates lists:', a_list + [2, 1])

In [None]:
# Indexing lists and tuples is like indexing strings

for sequence in (a_list, a_tuple):
    print(sequence[0])  # first element
    print(sequence[0:2])  # slice with the first two elements
    print(sequence[-1])  # last element

In [None]:
# But what is the difference?
# Lists are mutable (they can be changed in place) and tuples are immutable

# This will work: item assignment replaces the first element of the list
a_list[0] = 'a new element'
print(a_list)

In [None]:
# This will NOT work: tuples do not support item assignment, so Python
# raises a TypeError. Catching it shows the message without stopping the notebook.
try:
    a_tuple[0] = 'a new element'
except TypeError as error:
    print('TypeError:', error)

# The tuple is left unchanged
print(a_tuple)

In [None]:
# A list can start out empty and grow later as elements are appended to it:

empty_list = list()
print(f'This is an empty list: {empty_list}')

In [None]:
# append() adds one element to the end of the list, in place
empty_list.append('an element')
empty_list.append('another element')

# Now that the list is not empty anymore we should change its name
# (this binds a second name to the same list object; it does not make a copy)
not_empty = empty_list

print('Not empty anymore:', not_empty)

In [None]:
# You can see the size of the list or tuple just like with strings:
# len() returns the number of elements

len(not_empty)

### 2.2 Dictionaries

In [None]:
# Dictionaries store values under named keys, so each piece of data
# can be retrieved by a meaningful name

a_dictionary = {
    'one': 1.0,
    'two': 2.0,
    'heart': '\u2665',
    'a_list': ['this', 'is', 'a', 'list'],
}

# To see what is stored in the dictionary:
print(a_dictionary.keys())

# To get access to the data, index the dictionary with a key:
for field in ('heart', 'a_list'):
    print(a_dictionary[field])

### 2.3 Arrays

Arrays are data structures provided by the numpy package. They make computations much faster!

In [None]:
import numpy as np

In [None]:
# Arrays can be filled with random samples drawn from a probability
# distribution, which is handy for examples. Here: 100 samples from a
# gaussian with mean 0 and standard deviation 1 (the defaults, spelled out)

fake_data = np.random.normal(loc=0.0, scale=1.0, size=100)

print(fake_data)

In [None]:
# NumPy arrays (np.ndarray objects) come with a series of methods
# for common statistics

print('Mean:', fake_data.mean())
print('Standard deviation:', fake_data.std())
print('Sum', fake_data.sum())
print('Maximum:', fake_data.max())
# argmax / argmin return the position of the extreme value, not the value itself
print('Index of the maximum value:', fake_data.argmax())
print('Minimum:', fake_data.min())
print('Index of the minimum value:', fake_data.argmin())

In [None]:
# Multidimensional data

# A matrix full of zeros with 10 rows and 12 columns
more_fake_data = np.zeros(shape=(10, 12))
more_fake_data

In [None]:
# Setting the last column to a vector of twos
# ([:, -1] means "every row, last column")
more_fake_data[:, -1] = np.ones(10) * 2
more_fake_data

In [None]:
# [row, column] indexing addresses a single element
more_fake_data[0, 0] = np.pi # Setting first element to pi
more_fake_data

In [None]:
# Lots of nice tricks

# np.trapz is deprecated since NumPy 2.0 in favour of np.trapezoid;
# pick whichever this NumPy version provides (np.trapz is only touched
# when np.trapezoid does not exist)
trapezoid = getattr(np, 'trapezoid', None) or np.trapz

print('The data has shape', more_fake_data.shape)
print('Sum:', more_fake_data.sum())
# axis=1 adds up the values along each row, axis=0 along each column
print('Sum over x direction:', more_fake_data.sum(axis=1))
print('Sum over y direction:', more_fake_data.sum(axis=0))
print('Square roots: \n', np.sqrt(more_fake_data))
# Trapezoidal-rule integral along the last axis, with unit spacing between samples
print('Integrals:', trapezoid(more_fake_data, dx=1))

In [None]:
# More useful things from numpy
# np.array converts a (nested) list into an array; a list of lists becomes a 2-D array
array_from_a_list = np.array(a_list_of_lists)
print('Array from a list:\n', array_from_a_list)

In [None]:
# Statistics that are numpy functions rather than array methods:
print('Median:', np.median(fake_data))
# q is given in percent: q=25 is the value below which 25% of the data fall
print('Percentile:', np.percentile(fake_data, q=25))

In [None]:
# arange lets you create a series of numbers spaced by a fixed step;
# the start is included and the stop is not
aranged_data = np.arange(0, 100, step=10)
print(aranged_data)

In [None]:
# linspace and logspace let you create series of linearly and
# logarithmically spaced data (50 points each by default)
linear = np.linspace(0, 100)

# logspace takes the *exponents* of the end points (base 10 by default):
# 10**0 = 1 up to 10**2 = 100. Passing 100 here would run up to 10**100.
log = np.logspace(0, 2)

print(linear)
print(log)

In [None]:
# Tip: you can look up the documentation of any python function or object with a "?"
# (the trailing "?" is IPython/Jupyter syntax; in plain Python use help(np.sum))
np.sum?

## 3. Some quick coding lessons

### 3.1 For loops

In [None]:
# Python lets you do for loops over anything:

# Keys in a dictionary (iterating over a dictionary yields its keys):
for key in a_dictionary:
    print(key)

In [None]:
# Files in a directory:

import os

# os.listdir returns the names of the entries in the given directory;
# './' is the current working directory
for file in os.listdir('./'):
    print(file)

In [None]:
# And numbers in a range (range(10) yields 0, 1, ..., 9):

for i in range(10):
    i = i * i + 1
    print(i)

In [None]:
# List comprehension is the best thing ever:
# it builds the whole list from a loop written in a single expression

a_cleverly_defined_list = [n**2 + 1 for n in range(10)]
print(a_cleverly_defined_list)

## Question time!

### See if you can get an array where each element is the number of characters in the name of a file in this directory!

### Extra: Do you think you can make a dictionary where the keys are the names of files and the contents are the numbers of characters in the name of each file?

### 3.2 Defining functions

In [None]:
# Functions are defined with `def`; `return` hands the result back to the caller

def a_function(a_number):
    """Return the cube of ``a_number`` plus its square."""
    cube = a_number ** 3
    square = a_number ** 2
    return cube + square

print(a_function(2))

### 3.3 Plotting

In [None]:
import matplotlib.pyplot as plt

# Create random data: 1000 samples from a gaussian distribution
x = np.random.normal(size=1000)

# Plot histogram of the samples
plt.hist(x)

In [None]:
# Create a fake correlation: the line y = 12x + 2 with gaussian noise added to x
noise = np.random.normal(size=1000)
y = 12 * (x + noise) + 2

# Plotting: the noisy points first, then the underlying line on top
plt.plot(x, y, '.', label='Data')
plt.plot(x, 12 * x + 2, label=r'$y=12x+2$')

plt.xlabel('x')
plt.ylabel('y')

plt.legend()

In [None]:
# Sometimes you are dealing with so much data that you can't see any
# correlation when plotting points

# 10000 gaussian samples (mean 20, standard deviation 3) ...
x = np.random.normal(20, 3, size=10000)
# ... and the square of x plus gaussian noise with standard deviation 3
y = (x + np.random.normal(scale=3, size=10000)) ** 2

plt.plot(x, y, '.')

In [None]:
# You can make points transparent or smaller
# (alpha sets the opacity, markersize the size of each point)

plt.plot(x, y, '.', alpha=0.5, markersize=1)

In [None]:
# But there are better ways to do this, like the hexbin function, which
# counts the points falling in each hexagonal cell
# (gridsize sets the number of hexagons along x; mincnt=1 hides empty cells)

plt.hexbin(x, y, gridsize=50, mincnt=1)

In [None]:
# Or the kdeplot function of the seaborn package, which draws contours of
# a kernel density estimate

import seaborn as sns

# The coordinates must be passed as keywords: from seaborn 0.12 on, only the
# first argument (`data`) is positional and kdeplot(x, y) raises a TypeError
sns.kdeplot(x=x, y=y)

## 4. Taking a look at S-PLUS data

In [None]:
# There are several ways to organize tables in python, I tend to use astropy tables
from astropy.table import Table

# Read the plain-text catalog file; format='ascii' selects astropy's ASCII table reader
catalog = Table.read('splus_laplata.txt', format='ascii')

catalog

In [None]:
# A more popular option is Pandas

import pandas as pd

# sep=r'\s+' splits the columns on any run of whitespace. It replaces the
# equivalent delim_whitespace=True flag, which is deprecated since pandas 2.2.
catalog = pd.read_table('splus_laplata.txt', sep=r'\s+')

catalog

It is nice to have at least some proficiency with both Astropy tables and Pandas dataframes. Only learn PyTables if these two don't do the trick.

|                | Intuitive | Memory management | Integration with astronomy tools | General purpose |
|----------------|-----------|-------------------|----------------------------------|-----------------|
| Astropy Tables | 10        | 3                 | 10                               | 0               |
| Pandas         | 9         | 7                 | 0                                | 10              |
| PyTables       | 0         | 10                | 0                                | 10              |

### Let's make a color-color diagram

In [None]:
# Colour-colour diagram: (uJAVA - r) on the x axis against (g - i) on the y axis
plt.plot(
    catalog['uJAVA_auto'] - catalog['r_auto'],
    catalog['g_auto'] - catalog['i_auto'],
    '.',
)

What just happened?

The S-PLUS magnitudes are reported as 99 or -99 when the object is not detected or not observed in a given band.

In [None]:
# Boolean mask selecting the rows where nDet_auto equals 12
# NOTE(review): presumably nDet_auto counts the bands with a detection and
# 12 is the full filter set — confirm against the catalog documentation
no_missing_bands = catalog['nDet_auto'] == 12

# Fraction of the catalog that passes the selection
no_missing_bands.sum()/len(catalog)

In [None]:
# Keep only the rows selected by the mask, then redo the colour-colour diagram
catalog = catalog[no_missing_bands]

plt.plot(
    catalog['uJAVA_auto'] - catalog['r_auto'],
    catalog['g_auto'] - catalog['i_auto'],
    '.',
    ms=0.5,
)

plt.xlabel('u-r')
plt.ylabel('g-i')

In [None]:
# Let's plot galaxies, stars and quasars separately:

# One boolean mask per object class
stars = catalog['class_2'] == 'STAR'
galaxies = catalog['class_2'] == 'GALAXY'
quasars = catalog['class_2'] == 'QSO'

# Same colour-colour plot for each class, with its own colour and legend entry
for mask, color, label in ((stars, 'green', 'Stars'),
                           (galaxies, 'red', 'Galaxies'),
                           (quasars, 'blue', 'Quasars')):
    plt.plot(catalog['uJAVA_auto'][mask] - catalog['r_auto'][mask],
             catalog['g_auto'][mask] - catalog['i_auto'][mask],
             '.', ms=2, color=color, label=label)

plt.xlabel('u-r')
plt.ylabel('g-i')

plt.legend()

In [None]:
# What a mess! Let's plot contours!

# One density-contour plot per object class, each with its own colour map.
# The coordinates are passed as keywords: from seaborn 0.12 on, kdeplot only
# accepts `data` positionally and kdeplot(x, y) raises a TypeError.
for mask, cmap, label in ((stars, 'Greens', 'Stars'),
                          (galaxies, 'Reds', 'Galaxies'),
                          (quasars, 'Blues', 'Quasars')):
    sns.kdeplot(x=catalog['uJAVA_auto'][mask] - catalog['r_auto'][mask],
                y=catalog['g_auto'][mask] - catalog['i_auto'][mask],
                cmap=cmap, levels=5, label=label)

plt.legend()

# Zoom in on the region shown by the axis limits below
plt.xlim(-0.2, 4.3)
plt.ylim(-0.2, 2)

plt.xlabel('u-r')
plt.ylabel('g-i')