# Pandas - Some Sample Datasets
Most of these came from: https://opendata.socrata.com/browse

Sample code that reads CSV files into pandas DataFrames and "explores" them.


## Preliminary setup

In [None]:
# getting things ready
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# show - prints the index and shape of a Pandas object, optionally its values
def show(data, show_data=0):
    """Print a short structural summary of a Pandas object.

    Parameters
    ----------
    data : object
        Anything exposing ``.index``, ``.shape`` and ``.values``
        (the attributes read below).
    show_data : int or bool, optional
        When truthy, ``data.values`` is printed as well. Defaults to 0 (off).

    Returns
    -------
    None
        All output goes to stdout.
    """
    index_line = " Index: {:}".format(data.index)
    print(index_line)
    shape_line = " Shape: {:}".format(data.shape)
    print(shape_line)
    # Guard clause: the raw values are only dumped on request.
    if not show_data:
        return
    print(data.values)

# simple descriptive statistics for the NumPy array object
def simple_ds(data):
    """Print count, mean, median, standard deviation and range of an array.

    Parameters
    ----------
    data : numpy.ndarray
        Numeric values to summarise. The statistics are computed with
        ``data.mean()``, ``np.median(data)``, ``data.std()``, ``data.min()``
        and ``data.max()`` using their default arguments.

    Returns
    -------
    None
        All output goes to stdout.

    Notes
    -----
    An empty input prints the count (0) and a short notice instead of the
    statistics, because ``min()``/``max()`` raise ``ValueError`` on a
    zero-size array and the mean/std are undefined.
    """
    count = len(data)
    print ("Number of values: {:,d}".format(count))
    if count == 0:
        # Nothing to summarise; bail out before the reductions below fail.
        print ("  (no values - statistics are undefined)")
        return
    print ("           Mean : {:,.3f}".format(data.mean()))
    print ("         Median : {:,.3f}".format(np.median(data)))
    print ("        Std Dev : {:,.3f}".format(data.std()))
    print ("          Range : ({:,.3f}, {:,.3f})".format(data.min(), data.max()))

In [None]:
# Sample datasets - all taken from the source listed above.
# Lookup table: short key -> actual CSV file name.
fnames = {
    "cities": "01_U.S._Top_25_Largest_Cities.csv",
    "airports": "02_Airport_Codes_mapped_to_Latitude_Longitude_in_the_United_States.csv",
    "starbucks": "03_All_Starbucks_Locations_in_the_US_-_Map.csv",
    "scpay": "04_South_Carolina_State_Employee_Salary_Database.csv",
    "songs": "05_Top_1_000_Songs_To_Hear_Before_You_Die.csv",
    "bestcos": "06_Top_5_000_Companies_from_INC.com.csv",
    "accounts": "07_Unclaimed_bank_accounts.csv",
    "wh2012": "08_2012_Annual_Report_to_Congress_on_White_House_Staff.csv",
    "washconst": "09_Completed_Construction_Projects_2005_in_Washington_DC.csv",
    "usecon": "10_us_economic_data.csv",
    "matches": "11_matches.csv",
}

## Sample Analysis

In [None]:
# Choose a dataset by key, load its CSV from ../data/, then print the
# index/shape summary and display the first rows.
fkey = "usecon"
csv_path = "../data/" + fnames[fkey]
df = pd.read_csv(csv_path)
show(df)
df.head()

In [None]:
# Extract a numerical column as a NumPy array.
# Series.to_numpy() is the accessor the pandas documentation recommends over
# .values: it always hands back a numpy.ndarray.
some_nums = df['JobsAdded'].to_numpy()
type(some_nums)

In [None]:
# Print the summary statistics, then draw a histogram of the same values
# on an explicitly created 10x8 figure.
simple_ds(some_nums)
fig, ax = plt.subplots(figsize=(10, 8))
ax.hist(some_nums)
plt.show()

In [None]:
# Display the rows ordered by JobsAdded, largest first.
# The result is not assigned, so df itself keeps its original order.
df.sort_values("JobsAdded", ascending=False)

In [None]:
# Line chart of the JobsAdded column, drawn against the DataFrame's index
df.plot.line(y="JobsAdded")

In [None]:
# Line chart of GDP after discarding its missing entries
df['GDP'].dropna().plot.line()

In [None]:
# Line chart of the UnemploymentRate column, drawn against the DataFrame's index
df.plot.line(y="UnemploymentRate")