# Pandas (basics and advanced)

Built on top of numpy.

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [3]:
# Build a synthetic demographic table of 250 people from random draws.
# The draws are made in the order weights, heights, gender, age.
weights = np.random.normal(130, 50, 250)  # mean 130, standard deviation 50
heights = np.random.normal(5.5, 1, 250)   # mean 5.5, standard deviation 1

# One fair coin flip per person, translated into a label.
gender_dict = {1: 'female', 0: 'male'}
gender = [gender_dict[flip] for flip in np.random.binomial(1, 0.5, 250)]

# Binomial(2, 0.5) yields 0/1/2 with probabilities 1/4, 1/2, 1/4, so the
# middle group ('young') is about twice as common as either of the others.
age_dict = {0: 'todler', 1: 'young', 2: 'senior'}
age = [age_dict[code] for code in np.random.binomial(2, 0.5, 250)]

# Assemble the columns into one DataFrame.
data_frame_dict = {
    'weigh': weights,
    'height': heights,
    'gender': gender,
    'age_group': age,
}
demographic = pd.DataFrame(data_frame_dict)

In [4]:
# Preview the first five rows of the synthetic demographic table.
demographic.head()

Unnamed: 0,age_group,gender,height,weigh
0,senior,female,3.787129,100.409938
1,senior,male,5.364941,42.671985
2,young,female,4.556363,28.250941
3,young,male,5.478598,184.828446
4,young,female,3.793677,76.052117


# importing data into dataframes
[This](https://www.datacamp.com/community/tutorials/pandas-read-csv) article is a good explanation of some of pandas.read_csv() arguments.

``skiprows``: when the CSV file has a few rows above the data that are not part of the data, you can choose how many rows to skip, e.g. ``skiprows = 4``.

``na_values``: when your CSV file uses specific labels for missing values, such as 'missing' or 'na', you can put these labels into a list and pass it as an argument, e.g. ``na_values = ['missing', 'na', '.', 'not available']``.

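``sep``: you can use read_csv to import TSV or pipe-separated files; just set ``sep='\t'`` or ``sep='|'``. The default separator is a comma; pandas only tries to detect the delimiter automatically when you pass ``sep=None`` (which uses the slower Python engine).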

``header``: row number to use as the header labels of the data. It is 0-indexed. You can use skiprows and header to reach the same goal. If you choose ``header=None``, the columns get no labels from the file (they are simply numbered 0, 1, 2, ...), and you need to set the labels using ``names``.

``names``: a list of names to be used as the header of the dataframe. The list must not contain duplicate values.

``index_col``: which column is used as the index. If a list is given, a MultiIndex is used.

``usecols``: list of columns to keep when importing. You can also use the integer positions of the columns. The same selection can be done after the data is imported, but if the data is too big, this parameter avoids loading the unwanted columns in the first place.

``prefix``: if there is no header (``header=None``), the columns are labeled 0, 1, 2, etc. If we want a text prefix in front of those numbers, we can set it using prefix. For example, if we want col0, col1, col2, ... as our labels, we will use ``prefix = 'col'``. Note that this parameter is deprecated in recent pandas versions and was removed in pandas 2.0.

``engine``: you can use either 'c' or 'python'. The C engine is faster but supports fewer features. Use it if your data is very large.

``nrows``: number of rows of the file to read. Useful for reading pieces of large files.

**``parse_dates``**: you can set the datetime columns of your dataframe using this parameter. It parses each such column and returns datetime values from it. This parameter is very flexible: you can give it a single column, a few columns that are each timestamps, or a combination of several columns to be concatenated into one timestamp.

``infer_datetime_format``: if ``parse_dates`` is enabled and you set ``infer_datetime_format = True``, pandas tries to infer the datetime format from the data, which can make the parsing faster.

``chunksize``: the number of rows to read per chunk. With it, read_csv returns an iterator of dataframes instead of a single dataframe, so you can loop over the chunks and perform your data analysis in a streaming manner. Note that, since this is an iterator, once you have iterated over it, it has yielded all of its chunks and is exhausted.

In [24]:
# Load only a subset of the columns and stream the CSV in 1000-row chunks:
# with chunksize set, `housing` is a one-shot iterator of DataFrames rather
# than a single DataFrame.
cols2keep = [
    'housing_median_age', 'total_rooms', 'total_bedrooms', 'population',
    'households', 'median_income', 'median_house_value',
]
housing = pd.read_csv(
    'data/housing.csv',
    chunksize=1000,
    usecols=cols2keep,
)

In [25]:
# Walk the chunk iterator once and report each chunk's median income.
# Chunks are numbered from 1; the iterator is exhausted when the loop ends.
for chunk_number, chunk in enumerate(housing, start=1):
    chunk_median = np.median(chunk.median_income)
    print(chunk_number, '-th chunk median income:\t$k', round(chunk_median, 2))

1 -th chunk median income:	$k 3.51
2 -th chunk median income:	$k 3.5
3 -th chunk median income:	$k 2.49
4 -th chunk median income:	$k 3.46
5 -th chunk median income:	$k 2.46
6 -th chunk median income:	$k 3.41
7 -th chunk median income:	$k 3.72
8 -th chunk median income:	$k 3.13
9 -th chunk median income:	$k 4.15
10 -th chunk median income:	$k 3.53
11 -th chunk median income:	$k 4.68
12 -th chunk median income:	$k 4.1
13 -th chunk median income:	$k 3.07
14 -th chunk median income:	$k 3.3
15 -th chunk median income:	$k 3.26
16 -th chunk median income:	$k 3.8
17 -th chunk median income:	$k 3.63
18 -th chunk median income:	$k 4.38
19 -th chunk median income:	$k 4.51
20 -th chunk median income:	$k 3.06
21 -th chunk median income:	$k 3.56


In [31]:
# Read the whole CSV into a single DataFrame this time. The file's first row
# is skipped (skiprows=1) and the shorter labels below are used as column names.
col_names = [
    'long', 'lat', 'median_age', 'num_room', 'num_bedroom',
    'population', 'households', 'median_income', 'median_price',
    'ocean_proximit',
]
housing = pd.read_csv(
    'data/housing.csv',
    skiprows=1,
    engine='c',
    names=col_names,
)
housing.head()

Unnamed: 0,long,lat,median_age,num_room,num_bedroom,population,households,median_income,median_price,ocean_proximit
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


#### Slicing data frames

##### slicing based on one column as a series
Using single square brackets.

In [65]:
# Single brackets with one column label return that column as a Series.
median_age_series = housing['median_age']
type(median_age_series)

pandas.core.series.Series

##### slicing based on one or a few columns as a dataframe
Using double square brackets.

In [62]:
# Double brackets (a list of labels) keep the result a DataFrame, even when
# the list holds a single column.
median_age_df = housing[['median_age']]
median_age_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 1 columns):
median_age    20640 non-null float64
dtypes: float64(1)
memory usage: 161.3 KB


In [59]:
# Several labels in the list select several columns, in the order listed.
population_income = housing[['median_income', 'population']]
population_income.head()

Unnamed: 0,median_income,population
0,8.3252,322.0
1,8.3014,2401.0
2,7.2574,496.0
3,5.6431,558.0
4,3.8462,565.0


##### slicing based on row index
Use ``iloc``.
``iloc`` selects by integer position, i.e. the intrinsic ordering of the dataframe. There is also ``loc``, which selects by the labels of the index that has been set.

In [66]:
# iloc slices by integer position: this keeps the rows at positions 0 through 4.
first_fives = housing.iloc[:5]
first_fives

Unnamed: 0,long,lat,median_age,num_room,num_bedroom,population,households,median_income,median_price,ocean_proximit
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


There is also a very non-standard way of doing this which is as follows: 

In [67]:
# A plain slice on the DataFrame selects rows, like housing.iloc[:5];
# only the first of those rows is displayed here.
first_five_nstandard = housing[:5]
first_five_nstandard.head(1)

Unnamed: 0,long,lat,median_age,num_room,num_bedroom,population,households,median_income,median_price,ocean_proximit
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY


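##### slicing based on a condition (filtering)
Use ``loc`` or plain square brackets with a boolean mask. (``iloc`` does not accept a boolean Series directly; it needs a plain boolean array such as ``boolean_mask.values``.)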

In [68]:
# Element-wise AND of two column comparisons. The parentheses are required
# because & binds more tightly than >.
boolean_mask = (housing.num_room > 2500) & (housing.median_age > 45)
# Indexing with the boolean Series keeps only the rows where the mask is True.
crowded_old = housing[boolean_mask]
crowded_old.describe()

Unnamed: 0,long,lat,median_age,num_room,num_bedroom,population,households,median_income,median_price
count,458.0,458.0,458.0,458.0,454.0,458.0,458.0,458.0,458.0
mean,-120.91714,36.594607,50.59607,3162.244541,683.682819,1494.803493,638.604803,4.298525,317473.120087
std,1.963826,1.873753,2.130467,700.701734,279.877442,599.644147,251.145661,2.24217,131970.699737
min,-124.23,32.69,46.0,2501.0,317.0,500.0,265.0,0.6775,26900.0
25%,-122.43,34.12,49.0,2681.0,516.25,1153.75,495.0,3.01415,224250.0
50%,-122.24,37.76,52.0,2984.0,629.5,1370.5,590.0,3.81695,320950.0
75%,-118.34,37.8,52.0,3450.5,772.75,1650.75,724.0,5.0092,426350.0
max,-115.56,41.32,52.0,10088.0,2747.0,7443.0,2538.0,15.0001,500001.0


In [54]:
# Same filter expressed through .loc; the summary matches the plain-bracket version.
crowded_old = housing.loc[boolean_mask]
crowded_old.describe()

Unnamed: 0,long,lat,median_age,num_room,num_bedroom,population,households,median_income,median_price
count,458.0,458.0,458.0,458.0,454.0,458.0,458.0,458.0,458.0
mean,-120.91714,36.594607,50.59607,3162.244541,683.682819,1494.803493,638.604803,4.298525,317473.120087
std,1.963826,1.873753,2.130467,700.701734,279.877442,599.644147,251.145661,2.24217,131970.699737
min,-124.23,32.69,46.0,2501.0,317.0,500.0,265.0,0.6775,26900.0
25%,-122.43,34.12,49.0,2681.0,516.25,1153.75,495.0,3.01415,224250.0
50%,-122.24,37.76,52.0,2984.0,629.5,1370.5,590.0,3.81695,320950.0
75%,-118.34,37.8,52.0,3450.5,772.75,1650.75,724.0,5.0092,426350.0
max,-115.56,41.32,52.0,10088.0,2747.0,7443.0,2538.0,15.0001,500001.0


##### slicing based on a condition for a column
Use loc again.

In [52]:
# .loc[rows, column]: filter the rows with the mask and pick a single column,
# which gives a Series.
# NOTE(review): the saved output below reports count = 814, while the same mask
# selects 458 rows in the two preceding cells — the output looks stale
# (out-of-order execution); re-run the notebook top to bottom to refresh it.
crowded_old_income = housing.loc[boolean_mask, 'median_income'] # first element is the mask, second is the column
crowded_old_income.describe()

count    814.000000
mean       4.197048
std        2.199041
min        0.677500
25%        2.906625
50%        3.704350
75%        4.774275
max       15.000100
Name: median_income, dtype: float64

### Iterating over dataframe columns, and content

In [69]:
# Iterating over columns
# Looping over a DataFrame directly yields its column labels, not its rows.
for col in housing: # you just iterate over the dataframe
    print(col)

long
lat
median_age
num_room
num_bedroom
population
households
median_income
median_price
ocean_proximit


In [70]:
# Iterating over the content, row by row.
# iterrows() yields (index label, row-as-Series) pairs; only ten rows are
# used so the printed output stays short.
housing_10 = housing.head(10)  # first ten rows
for row_label, record in housing_10.iterrows():
    print("Index: {}\t Population: {}".format(row_label, record['population']))

Index: 0	 Population: 322.0
Index: 1	 Population: 2401.0
Index: 2	 Population: 496.0
Index: 3	 Population: 558.0
Index: 4	 Population: 565.0
Index: 5	 Population: 413.0
Index: 6	 Population: 1094.0
Index: 7	 Population: 1157.0
Index: 8	 Population: 1206.0
Index: 9	 Population: 1551.0


### Applying a function to all the elements

In [None]:
# Number of rooms that are not bedrooms: room count minus bedroom count,
# computed column-wise (vectorised) for every row at once.
# Rows where num_bedroom is missing come out as NaN.
# NOTE(review): the right-hand side used to be the whole DataFrame, which
# cannot be stored in a single column; the subtraction is inferred from the
# column name — confirm this is the intended formula.
housing['non_bedroom_rooms'] = housing['num_room'] - housing['num_bedroom']