# Using Pandas

## Pinpoint specific rows and columns in a DataFrame 

In [None]:
import glob
import os

import pandas as pd

# Collect one DataFrame per CSV file, then stack them into a single DataFrame.
dfs = []

# glob.glob() returns matches in arbitrary order, so sort the paths to make
# the load order (and therefore the row order of df) repeatable.
for csv_path in sorted(glob.glob('data/*.csv')):
    # Take the year from the first four characters of the file name itself
    # rather than from fixed offsets into the whole path, so this keeps
    # working if the directory in the glob pattern above changes.
    # NOTE(review): assumes every file name starts with a four-character
    # year -- confirm against the contents of the data folder.
    year = os.path.basename(csv_path)[:4]
    data = pd.read_csv(csv_path)
    # The year is stored as a string; every row from this file gets the same value.
    data['year'] = year
    dfs.append(data)

# ignore_index=True gives the combined frame a fresh 0..n-1 index instead of
# repeating each file's own row numbers.
df = pd.concat(dfs, ignore_index=True)

# Preview the first 3 rows.
df.head(3)

### Use tail() to look at the end of the DataFrame

In [None]:
# Show the last 3 rows of the combined DataFrame.
df.tail(3)

### Slicing a DataFrame

In [None]:
# Slice by row position: rows 50 through 59 (the slice end, 60, is excluded).
df[50:60]

### Look at specific columns

In [None]:
# Select a single column by putting its name in square brackets.
df['year']

In [None]:
# Use double quotes around an f-string when the expression inside it contains single quotes.
print(f"first row: {df['year'][0]}")
# A separate print() call puts this label on its own line of output.
print('rows 100 to 102:')
# The slice end is exclusive, so 100:103 returns rows 100, 101 and 102.
print(df['year'][100:103])

In [None]:
# Check what type of object selecting a single column returns.
type(df['year'])

## Summary statistics on columns

In [None]:
# Pull the column out once, then report its largest and smallest values.
# The years were stored as strings when the files were loaded, so max()
# and min() compare them as text.
year_col = df['year']
print(f"max year: {year_col.max()}")
print(f"min year: {year_col.min()}")

### Summarize columns that hold string objects

In [None]:
# Display the 'branch' column.
df['branch']

In [None]:
# Pull the column out once, then summarize its distinct values:
# nunique() counts them and unique() lists them.
branches = df['branch']
print(f"Number of unique branches: {branches.nunique()}")
print(branches.unique())

## Use `.groupby()` to analyze subsets of data

In [None]:
# Group the rows by branch, then add up the 'ytd' values within each group.
df.groupby('branch')['ytd'].sum()

### Sort a pandas Series using `.sort_values()`

In [None]:
# Step 1: total the 'ytd' values for each branch.
rows_by_branch = df.groupby('branch')
circ_by_branch = rows_by_branch['ytd'].sum()

# Step 2: order the totals from largest to smallest and show the top 10.
branches_high_to_low = circ_by_branch.sort_values(ascending=False)
branches_high_to_low.head(10)

In [None]:
# Passing a list to groupby() groups by each (year, branch) combination.
ytd_by_year_branch = df.groupby(['year', 'branch'])['ytd'].sum()

# Order the totals from largest to smallest, then show the 5 biggest.
circ_by_year_branch = ytd_by_year_branch.sort_values(ascending=False)
circ_by_year_branch.head(5)

## Use `.iloc[]` and `.loc[]` to select DataFrame locations

In [None]:
# .iloc[] selects by integer position (row, column). Look up two cells of the
# first row: the first column and the second-to-last column.
first_col_value = df.iloc[0, 0]
second_to_last_col_value = df.iloc[0, -2]
# '\n' inside the string starts a new line in the printed output.
print(f"Branch: {first_col_value} \nYTD circ: {second_to_last_col_value}")

In [None]:
# .loc[] selects by label: row label 0, then the column names 'branch' and 'ytd'.
branch_label_value = df.loc[0, 'branch']
ytd_label_value = df.loc[0, 'ytd']
# '\n' inside the string starts a new line in the printed output.
print(f"Branch: {branch_label_value} \nYTD circ: {ytd_label_value}")

## Save DataFrames

In [None]:
# to_frame() turns the sorted Series of totals into a one-column DataFrame;
# the (year, branch) labels stay on as the index.
circ_df = circ_by_year_branch.to_frame()
# Preview the first 5 rows.
circ_df.head(5)

### Save to CSV

In [None]:
# Write circ_df to high_usage.csv (a relative path, so it lands in the current
# working directory). The file holds every (year, branch) total, sorted from
# highest to lowest -- not only the high-usage rows. to_csv() writes the index
# by default, so the year and branch labels are saved as the leading columns.
circ_df.to_csv('high_usage.csv')