<a href="https://colab.research.google.com/github/aekanun2020/Data-Analytics-using-Python/blob/main/Python_Pandas_Dataframe_Basic_Properties_and_Methods.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Basic Properties**

In [1]:
# Build the demo DataFrame used throughout the notebook. The data deliberately
# contains gaps: None in 'Courses', NaN in 'Fee' and an empty string in
# 'Duration', so that later cells can show how missing values behave.
import numpy as np
import pandas as pd

technologies = {
    'Courses': ['Spark', 'PySpark', 'Hadoop', 'Python', 'Pandas', None, 'Spark', 'Python'],
    'Fee': [22000, 25000, 23000, 24000, np.nan, 25000, 25000, 22000],
    'Duration': ['30day', '50days', '55days', '40days', '60days', '35day', '', '50days'],
    'Discount': [1000, 2300, 1000, 1200, 2500, 1300, 1400, 1600],
}

# Custom string row labels 'r0' .. 'r7' instead of the default 0..7 integers.
row_labels = [f'r{i}' for i in range(8)]

df = pd.DataFrame(data=technologies, index=row_labels)
print(df)

    Courses      Fee Duration  Discount
r0    Spark  22000.0    30day      1000
r1  PySpark  25000.0   50days      2300
r2   Hadoop  23000.0   55days      1000
r3   Python  24000.0   40days      1200
r4   Pandas      NaN   60days      2500
r5     None  25000.0    35day      1300
r6    Spark  25000.0               1400
r7   Python  22000.0   50days      1600


In [2]:
# Dimensions of the frame as a (rows, columns) tuple: 8 rows by 4 columns here.
df.shape

(8, 4)

In [3]:
# Total number of cells (rows x columns = 8 * 4); cells holding NaN/None are counted too.
df.size

32

In [4]:
# Whether the frame has no elements at all; this frame holds data, so the result is False.
df.empty

False

In [5]:
# Column labels, returned as a pandas Index object.
df.columns

Index(['Courses', 'Fee', 'Duration', 'Discount'], dtype='object')

In [6]:
# The same column labels as a plain NumPy array (object dtype) instead of an Index.
df.columns.values

array(['Courses', 'Fee', 'Duration', 'Discount'], dtype=object)

In [7]:
# Row labels: the custom 'r0'..'r7' labels passed via index= when df was built.
df.index

Index(['r0', 'r1', 'r2', 'r3', 'r4', 'r5', 'r6', 'r7'], dtype='object')

In [8]:
# The same row labels as a plain NumPy array (object dtype) instead of an Index.
df.index.values

array(['r0', 'r1', 'r2', 'r3', 'r4', 'r5', 'r6', 'r7'], dtype=object)

In [9]:
# Data type of every column. 'Fee' is float64 (its values include NaN) while
# 'Discount', which has no missing values, is int64; text columns are object.
df.dtypes

Courses      object
Fee         float64
Duration     object
Discount      int64
dtype: object

**Basic Methods of Column Filtering**

In [10]:
df['Fee']  # a single column label inside [] returns a Series

r0    22000.0
r1    25000.0
r2    23000.0
r3    24000.0
r4        NaN
r5    25000.0
r6    25000.0
r7    22000.0
Name: Fee, dtype: float64

In [11]:
df[['Fee']]  # a list of labels inside [] returns a DataFrame, even for one column

Unnamed: 0,Fee
r0,22000.0
r1,25000.0
r2,23000.0
r3,24000.0
r4,
r5,25000.0
r6,25000.0
r7,22000.0


In [12]:
# Boolean-mask filter: keep only the rows whose Fee is exactly 22000.
# The NaN fee (r4) compares as False and is therefore dropped.
df[df['Fee'].eq(22000)]

Unnamed: 0,Courses,Fee,Duration,Discount
r0,Spark,22000.0,30day,1000
r7,Python,22000.0,50days,1600


In [13]:
# Subtract a flat 500 from every value of the one-column frame.
# The operation is element-wise, so the missing fee (r4) stays NaN.
df[['Fee']].sub(500)

Unnamed: 0,Fee
r0,21500.0
r1,24500.0
r2,22500.0
r3,23500.0
r4,
r5,24500.0
r6,24500.0
r7,21500.0


In [14]:
# Keep only the rows with a positive Fee. Index the frame directly with the
# boolean mask: routing the filter through DataFrame.apply rebuilds the same
# mask once per column and hides the intent, while the result is identical.
# NaN > 0 is False, so the row with the missing fee (r4) is excluded.
df[df['Fee'] > 0]

Unnamed: 0,Courses,Fee,Duration,Discount
r0,Spark,22000.0,30day,1000
r1,PySpark,25000.0,50days,2300
r2,Hadoop,23000.0,55days,1000
r3,Python,24000.0,40days,1200
r5,,25000.0,35day,1300
r6,Spark,25000.0,,1400
r7,Python,22000.0,50days,1600


In [15]:
# Element-wise clean-up of Fee: keep values that are >= 0, otherwise use 0.
# NaN >= 0 evaluates to False, so the missing fee in r4 is replaced by 0 too.
df['Fee'].apply(lambda fee: fee if fee >= 0 else 0)

r0    22000.0
r1    25000.0
r2    23000.0
r3    24000.0
r4        0.0
r5    25000.0
r6    25000.0
r7    22000.0
Name: Fee, dtype: float64

**Basic Methods of Row Filtering**

In [16]:
# An integer slice inside [] selects rows by position: position 6 to the end (r6, r7).
df[6:]

Unnamed: 0,Courses,Fee,Duration,Discount
r6,Spark,25000.0,,1400
r7,Python,22000.0,50days,1600


In [17]:
# Rows before position 6 (r0 through r5); the stop position itself is excluded.
df[:6]

Unnamed: 0,Courses,Fee,Duration,Discount
r0,Spark,22000.0,30day,1000
r1,PySpark,25000.0,50days,2300
r2,Hadoop,23000.0,55days,1000
r3,Python,24000.0,40days,1200
r4,Pandas,,60days,2500
r5,,25000.0,35day,1300


In [18]:
# Start equals stop, so no rows are selected: only the column headers remain.
df[6:6]

Unnamed: 0,Courses,Fee,Duration,Discount


In [19]:
# Exactly one row: the row at position 6 (r6).
df[6:7]

Unnamed: 0,Courses,Fee,Duration,Discount
r6,Spark,25000.0,,1400


In [20]:
# The first three rows (positions 0, 1 and 2).
df[:3]

Unnamed: 0,Courses,Fee,Duration,Discount
r0,Spark,22000.0,30day,1000
r1,PySpark,25000.0,50days,2300
r2,Hadoop,23000.0,55days,1000


In [21]:
# A negative stop counts from the end: every row except the last three.
df[:-3]

Unnamed: 0,Courses,Fee,Duration,Discount
r0,Spark,22000.0,30day,1000
r1,PySpark,25000.0,50days,2300
r2,Hadoop,23000.0,55days,1000
r3,Python,24000.0,40days,1200
r4,Pandas,,60days,2500


In [22]:
# Skip the first three rows and the last three rows, leaving r3 and r4.
df[3:-3]

Unnamed: 0,Courses,Fee,Duration,Discount
r3,Python,24000.0,40days,1200
r4,Pandas,,60days,2500


In [23]:
# Full row slice first, then a single column label.
df[:]['Discount']  # single label -> returns a Series

r0    1000
r1    2300
r2    1000
r3    1200
r4    2500
r5    1300
r6    1400
r7    1600
Name: Discount, dtype: int64

In [24]:
# Full row slice first, then a list containing one column label.
df[:][['Discount']]  # list of labels -> returns a DataFrame

Unnamed: 0,Discount
r0,1000
r1,2300
r2,1000
r3,1200
r4,2500
r5,1300
r6,1400
r7,1600


In [25]:
# Row slice (position 6 only) followed by a column list: one row, one column.
df[6:7][['Discount']]

Unnamed: 0,Discount
r6,1400


In [26]:
# The column list also fixes the output order: 'Discount' is shown before 'Courses'.
df[6:7][['Discount','Courses']]

Unnamed: 0,Discount,Courses
r6,1400,Spark


**Basic Methods for Creating a New Column**

In [27]:
# Derive a discounted price column: Fee minus a flat 500.
# Assign a Series to a single column label, the conventional way to add one
# column, rather than assigning a one-column DataFrame to a list key (where
# the right-hand side's own column label 'Fee' is not used).
# Rows whose Fee is NaN (r4) get NaN in Special_Price as well.
df['Special_Price'] = df['Fee'] - 500
df

Unnamed: 0,Courses,Fee,Duration,Discount,Special_Price
r0,Spark,22000.0,30day,1000,21500.0
r1,PySpark,25000.0,50days,2300,24500.0
r2,Hadoop,23000.0,55days,1000,22500.0
r3,Python,24000.0,40days,1200,23500.0
r4,Pandas,,60days,2500,
r5,,25000.0,35day,1300,24500.0
r6,Spark,25000.0,,1400,24500.0
r7,Python,22000.0,50days,1600,21500.0


**TEST**

In [28]:
# Show the original and the derived column side by side; the NaN in Fee (r4)
# is also NaN in Special_Price.
df[['Fee','Special_Price']]

Unnamed: 0,Fee,Special_Price
r0,22000.0,21500.0
r1,25000.0,24500.0
r2,23000.0,22500.0
r3,24000.0,23500.0
r4,,
r5,25000.0,24500.0
r6,25000.0,24500.0
r7,22000.0,21500.0


In [29]:
# A string slice inside [] selects rows by label, and the end label is
# included: r3, r4 and r5 are all returned.
df['r3':'r5']

Unnamed: 0,Courses,Fee,Duration,Discount,Special_Price
r3,Python,24000.0,40days,1200,23500.0
r4,Pandas,,60days,2500,
r5,,25000.0,35day,1300,24500.0


In [30]:
# Open-ended label slice: from the row labelled 'r3' through the last row.
df['r3':]

Unnamed: 0,Courses,Fee,Duration,Discount,Special_Price
r3,Python,24000.0,40days,1200,23500.0
r4,Pandas,,60days,2500,
r5,,25000.0,35day,1300,24500.0
r6,Spark,25000.0,,1400,24500.0
r7,Python,22000.0,50days,1600,21500.0


In [31]:
# Small frame whose index labels are the *strings* '1', '2', '3' (not
# integers), used to contrast positional and label-based slicing with [].
test_pd = pd.DataFrame(
    data={
        'Province': ['กาญจนบุรี', 'เพชรบุรี', 'ราชบุรี'],
        'Lastname': [80, 40, 0],
    },
    index=list('123'),
)

In [32]:
# Display the frame; its index labels are the strings '1', '2', '3'.
test_pd

Unnamed: 0,Province,Lastname
1,กาญจนบุรี,80
2,เพชรบุรี,40
3,ราชบุรี,0


In [33]:
# Integer slice -> positional: skips the row at position 0 (whose label is '1'),
# so the rows labelled '2' and '3' are returned.
test_pd[1:]

Unnamed: 0,Province,Lastname
2,เพชรบุรี,40
3,ราชบุรี,0


In [34]:
# String slice -> label-based: starts at the row labelled '1', which is the
# first row, so every row is returned.
test_pd['1':]

Unnamed: 0,Province,Lastname
1,กาญจนบุรี,80
2,เพชรบุรี,40
3,ราชบุรี,0
