# Pandas Data Structures

In [6]:
import pandas as pd
import numpy as np

## Pandas Series

### Creating a Simple Series

In [5]:
# Build a labelled Series: values and their index labels are supplied separately
# and paired up by position ('a' -> 3, 'b' -> -5, ...).
s = pd.Series(data=[3, -5, 7, 4], index=list('abcd'))
s

a    3
b   -5
c    7
d    4
dtype: int64

### Creating a Series Using NumPy Array

In [8]:
# Draw eight random integers from -10 to 10 inclusive (the upper bound, 11, is exclusive).
# No seed is set in this cell, so the values change from run to run.
l01 = np.random.randint(low=-10, high=11, size=8)
l01

array([-1,  5,  6, -2,  1,  0, -1, -6])

In [9]:
# Wrap the NumPy array in a Series, labelling its eight values with the
# zero-padded string keys '01' .. '08'.
s01 = pd.Series(l01, index=[f'{n:02d}' for n in range(1, 9)])
s01

01   -1
02    5
03    6
04   -2
05    1
06    0
07   -1
08   -6
dtype: int32

### Creating a Series Using a Dictionary

In [23]:
# When a Series is built from a dictionary, the keys become the index labels
# and the values become the data (insertion order is kept).
s02 = pd.Series(dict(Mon=True, Tue=False, Wed=False, Thu=True, Fri=True))
s02

Mon     True
Tue    False
Wed    False
Thu     True
Fri     True
dtype: bool

### Creating a Series Using a Scalar

In [25]:
# A scalar is broadcast to every label in the index: ten entries, labelled 1..10,
# all holding the value 120.
s03 = pd.Series(data=120, index=np.arange(1, 11))
s03

1     120
2     120
3     120
4     120
5     120
6     120
7     120
8     120
9     120
10    120
dtype: int64

## Pandas DataFrame

In [26]:
# Column-oriented input: each key is a column name and each list holds that
# column's values, one entry per row.
data = {
    'Country': ['Belgium', 'India', 'Brazil'],
    'Capital': ['Brussels', 'New Delhi', 'Brasília'],
    'Population': [11190846, 1303171035, 207847528],
}

# `columns` states the column order explicitly.
df = pd.DataFrame(
    data=data,
    columns=['Country', 'Capital', 'Population'],
)

df

Unnamed: 0,Country,Capital,Population
0,Belgium,Brussels,11190846
1,India,New Delhi,1303171035
2,Brazil,Brasília,207847528


# Pandas File I/O

## Reading a CSV File Using Pandas

In [28]:
# Load the car-financing table from a CSV file. The path is relative, so it is
# resolved against the current working directory.
df_csv = pd.read_csv('car_financing.csv')
df_csv

Unnamed: 0,Month,Starting Balance,Repayment,Interest Paid,Principal Paid,New Balance,term,interest_rate,car_type
0,1,34689.96,687.23,202.93,484.30,34205.66,60,0.0702,Toyota Sienna
1,2,34205.66,687.23,200.10,487.13,33718.53,60,0.0702,Toyota Sienna
2,3,33718.53,687.23,197.25,489.98,33228.55,60,0.0702,Toyota Sienna
3,4,33228.55,687.23,194.38,492.85,32735.70,60,0.0702,Toyota Sienna
4,5,32735.70,687.23,191.50,495.73,32239.97,60,0.0702,Toyota Sienna
...,...,...,...,...,...,...,...,...,...
403,56,3951.11,796.01,9.54,786.47,3164.64,60,0.0290,VW Golf R
404,57,3164.64,796.01,7.64,788.37,2376.27,60,0.0290,VW Golf R
405,58,2376.27,796.01,5.74,790.27,1586.00,60,0.0290,VW Golf R
406,59,1586.00,796.01,3.83,792.18,793.82,60,0.0290,VW Golf R


## Reading an Excel File Using Pandas

In [30]:
# Load the workbook version of the same data. With no sheet_name given, pandas
# reads the first sheet. Reading .xlsx needs an Excel engine (e.g. openpyxl) installed.
df_excel = pd.read_excel('car_financing.xlsx')
df_excel

Unnamed: 0,Month,Starting Balance,Repayment,Interest Paid,Principal Paid,New Balance,term,interest_rate,car_type
0,1,34689.96,687.23,202.93,484.30,34205.66,60,0.0702,Toyota Sienna
1,2,34205.66,687.23,200.10,487.13,33718.53,60,0.0702,Toyota Sienna
2,3,33718.53,687.23,197.25,489.98,33228.55,60,0.0702,Toyota Sienna
3,4,33228.55,687.23,194.38,492.85,32735.70,60,0.0702,Toyota Sienna
4,5,32735.70,687.23,191.50,495.73,32239.97,60,0.0702,Toyota Sienna
...,...,...,...,...,...,...,...,...,...
403,56,3951.11,796.01,9.54,786.47,3164.64,60,0.0290,VW Golf R
404,57,3164.64,796.01,7.64,788.37,2376.27,60,0.0290,VW Golf R
405,58,2376.27,796.01,5.74,790.27,1586.00,60,0.0290,VW Golf R
406,59,1586.00,796.01,3.83,792.18,793.82,60,0.0290,VW Golf R


# Pandas Information

In [31]:
# Dimensions of the frame as a (rows, columns) tuple.
df_csv.shape

(408, 9)

In [32]:
# Row labels. read_csv was called without index_col, so this is a default RangeIndex.
df_csv.index

RangeIndex(start=0, stop=408, step=1)

In [33]:
# Print a per-column summary: non-null counts, dtypes and approximate memory usage.
df_csv.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 408 entries, 0 to 407
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Month             408 non-null    int64  
 1   Starting Balance  408 non-null    float64
 2   Repayment         408 non-null    float64
 3   Interest Paid     408 non-null    float64
 4   Principal Paid    408 non-null    float64
 5   New Balance       408 non-null    float64
 6   term              408 non-null    int64  
 7   interest_rate     408 non-null    float64
 8   car_type          408 non-null    object 
dtypes: float64(6), int64(2), object(1)
memory usage: 28.8+ KB


In [35]:
# Preview the first five rows (head's default row count).
df_csv.head()

Unnamed: 0,Month,Starting Balance,Repayment,Interest Paid,Principal Paid,New Balance,term,interest_rate,car_type
0,1,34689.96,687.23,202.93,484.3,34205.66,60,0.0702,Toyota Sienna
1,2,34205.66,687.23,200.1,487.13,33718.53,60,0.0702,Toyota Sienna
2,3,33718.53,687.23,197.25,489.98,33228.55,60,0.0702,Toyota Sienna
3,4,33228.55,687.23,194.38,492.85,32735.7,60,0.0702,Toyota Sienna
4,5,32735.7,687.23,191.5,495.73,32239.97,60,0.0702,Toyota Sienna


In [36]:
# Column labels, returned as a pandas Index.
df_csv.columns

Index(['Month', 'Starting Balance', 'Repayment', 'Interest Paid',
       'Principal Paid', 'New Balance', 'term', 'interest_rate', 'car_type'],
      dtype='object')

In [37]:
# Number of non-null values in each column.
df_csv.count()

Month               408
Starting Balance    408
Repayment           408
Interest Paid       408
Principal Paid      408
New Balance         408
term                408
interest_rate       408
car_type            408
dtype: int64

# Subsetting, Slicing, and Boolean Indexing

In [38]:
# Re-display the Series created earlier before indexing into it.
s

a    3
b   -5
c    7
d    4
dtype: int64

In [39]:
# Re-display the DataFrame created earlier before indexing into it.
df

Unnamed: 0,Country,Capital,Population
0,Belgium,Brussels,11190846
1,India,New Delhi,1303171035
2,Brazil,Brasília,207847528


## Pandas Subsetting & Slicing

In [40]:
# Look up a single value by its index label.
s['b']

-5

In [42]:
# Assign to an existing label. This modifies s in place, so earlier displays of s
# (where 'a' was 3) no longer reflect its current contents.
s['a'] = 6
s

a    6
b   -5
c    7
d    4
dtype: int64

In [43]:
# A slice inside [] selects rows: everything from the second row onward.
df[1:]

Unnamed: 0,Country,Capital,Population
1,India,New Delhi,1303171035
2,Brazil,Brasília,207847528


In [44]:
# Label-based selection. Passing lists for both rows and columns keeps the
# result a DataFrame instead of reducing it to a scalar.
df.loc[[0], ['Country']]

Unnamed: 0,Country
0,Belgium


In [46]:
# Position-based selection of the same cell: first row, first column,
# again returned as a one-cell DataFrame.
df.iloc[[0],[0]]

Unnamed: 0,Country
0,Belgium


## Boolean Indexing

In [49]:
# Boolean indexing: the comparison yields a True/False value per row, and only
# the rows marked True (population above 120 million) are kept.
df.loc[df['Population'] > 120_000_000]

Unnamed: 0,Country,Capital,Population
1,India,New Delhi,1303171035
2,Brazil,Brasília,207847528


# Pandas Statistics (Summary)

In [70]:
# Summary statistics. Only the numeric Population column is described here;
# the Country and Capital text columns are left out of the result.
df.describe()

Unnamed: 0,Population
count,3.0
mean,507403100.0
std,696134600.0
min,11190850.0
25%,109519200.0
50%,207847500.0
75%,755509300.0
max,1303171000.0


In [52]:
# Show the full frame again for comparison with the statistics.
df

Unnamed: 0,Country,Capital,Population
0,Belgium,Brussels,11190846
1,India,New Delhi,1303171035
2,Brazil,Brasília,207847528


In [61]:
# Keep only the numeric column. The double brackets return a one-column
# DataFrame rather than a Series.
df_temp = df[['Population']]
df_temp

Unnamed: 0,Population
0,11190846
1,1303171035
2,207847528


In [62]:
# Smallest value in each column (here only Population).
df_temp.min()

Population    11190846
dtype: int64

In [64]:
# Largest value in each column.
df_temp.max()

Population    1303171035
dtype: int64

In [66]:
# Column total.
df_temp.sum()

Population    1522209409
dtype: int64

In [68]:
# Arithmetic mean. The result is float64 even though the column holds integers.
df_temp.mean()

Population    5.074031e+08
dtype: float64

In [71]:
# Median (middle value) of each column, returned as float64.
df_temp.median()

Population    207847528.0
dtype: float64

In [72]:
# Number of non-null entries in each column.
df_temp.count()

Population    3
dtype: int64