In [23]:
## Familiarizing ourselves with DataFrames and Series
import pandas as pd

# A DataFrame is essentially a mapping from column names to lists of values:
data = {
    'column1': [1, 2, 3, 4, 5],
    'column2': ['a', 'b', 'c', 'd', 'e'],
}
df = pd.DataFrame(data)

## pandas attached a default index to the frame (row numbers counting up from 0).
## That index can be replaced with explicit labels of our choosing:
df.index = list(range(11, 16))

# A Series is a single "column" of data. One way to get one is to build it directly:
series = pd.Series(data=[1, 2, 3, 4, 5])

# Another way is to pull a column out of a DataFrame (this rebinds `series`):
series = df['column1']
## The extracted Series carries the same index labels (11..15) as the DataFrame it came from!

## Try making a data frame yourself, and extracting a column from it. 

In [None]:
## How to slice and dice data (accessing)
# NOTE: this cell reads `loansData`, which is created by the pd.read_csv(...)
# call in the "How to summarize data" cell -- run that cell first.

# Accessing a row by its index label (use .loc):
index = 81174
print(loansData.loc[index])

# Accessing a column by name. A bare expression in the middle of a cell is
# discarded, so print it (first rows only) to actually see the result:
print(loansData['Loan.Purpose'].head())

# Accessing by integer position (use .iloc):
row_number1 = 0
column_number1 = 0
row_number2 = 5
column_number2 = 2

# A single cell: (row position, column position)
print(loansData.iloc[row_number1, column_number1])
# Ranges by position; as with normal Python slices, the end position is excluded:
print(loansData.iloc[row_number1:row_number2, column_number1:column_number2])


In [24]:
## How to summarize data
# Download the loans dataset into a DataFrame (requires network access).
loansData = pd.read_csv('https://spark-public.s3.amazonaws.com/dataanalysis/loansData.csv')

# print(...) with a single argument behaves the same on Python 2 and Python 3.
# Mean of a numeric column:
print(loansData['Amount.Requested'].mean())
# Number of rows for each distinct value of a column:
print(loansData['Loan.Length'].value_counts())
# Contingency table: counts for every (Loan.Purpose, Loan.Length) pair:
print(pd.crosstab(loansData['Loan.Purpose'], loansData['Loan.Length']))

## It is very common that we will need to perform some aggregation on a subgroup of data.
## Pandas supplies the groupby() method to make this super easy.
## For instance, we can take the mean Amount.Requested by Loan.Purpose:
loansData['Amount.Requested'].groupby(loansData['Loan.Purpose']).mean()

# Think about this in steps: first we slice out Amount.Requested, then we group it
# by Loan.Purpose, then we take the mean of each of those groups.

12406.5
36 months    1952
60 months     548
dtype: int64
Loan.Length         36 months  60 months
Loan.Purpose                            
car                        33         17
credit_card               382         62
debt_consolidation        988        319
educational                14          1
home_improvement          106         46
house                      14          6
major_purchase             83         18
medical                    26          4
moving                     24          5
other                     165         36
renewable_energy            4          0
small_business             62         25
vacation                   17          4
wedding                    34          5


Loan.Purpose
car                    8167.500000
credit_card           12251.520270
debt_consolidation    13832.096404
educational            5305.000000
home_improvement      13029.605263
house                 12888.750000
major_purchase         8187.128713
medical                8710.000000
moving                 7672.413793
other                  8540.298507
renewable_energy       4525.000000
small_business        13137.931034
vacation               7314.285714
wedding               11020.512821
Name: Amount.Requested, dtype: float64

In [31]:
## How to modify data in a pandas data object

# Creating a new object: a plain Python list of the rates with the trailing
# '%' removed. The DataFrame itself is not touched by this line.
interest_rate = [x.rstrip('%') for x in loansData['Interest.Rate']]

# Modifying data in place: overwrite the column on the DataFrame.
# rstrip('%') only removes a trailing '%', so re-running this cell is harmless;
# slicing with x[:-1] would chop off one more character on every re-run.
loansData['Interest.Rate'] = loansData['Interest.Rate'].str.rstrip('%')

loansData.head()

Unnamed: 0,Amount.Requested,Amount.Funded.By.Investors,Interest.Rate,Loan.Length,Loan.Purpose,Debt.To.Income.Ratio,State,Home.Ownership,Monthly.Income,FICO.Range,Open.CREDIT.Lines,Revolving.CREDIT.Balance,Inquiries.in.the.Last.6.Months,Employment.Length
81174,20000,20000,8.9,36 months,debt_consolidation,14.90%,SC,MORTGAGE,6541.67,735-739,14,14272,2,< 1 year
99592,19200,19200,12.12,36 months,debt_consolidation,28.36%,TX,MORTGAGE,4583.33,715-719,12,11140,1,2 years
80059,35000,35000,21.98,60 months,debt_consolidation,23.81%,CA,MORTGAGE,11500.0,690-694,14,21977,1,2 years
15825,10000,9975,9.99,36 months,debt_consolidation,14.30%,KS,MORTGAGE,3833.33,695-699,10,9346,0,5 years
33182,12000,12000,11.71,36 months,credit_card,18.78%,NJ,RENT,3195.0,695-699,11,14469,0,9 years


In [None]:
### EXERCISE:

# What is the average Interest Rate by State?

# First strip the % sign (better to do it inplace)
# Then make it a number
# Then use the group by logic