#Pandas Tutorial to Manipulate DataFrame

In [0]:
# Build the sample DataFrame used by every example below.
# It deliberately contains a None course, a NaN fee and an empty duration string.
import pandas as pd
import numpy as np

technologies = {
    'Courses': ["Spark", "PySpark", "Hadoop", "Python", "Pandas", None, "Spark", "Python"],
    'Fee': [22000, 25000, 23000, 24000, np.nan, 25000, 25000, 22000],
    'Duration': ['30day', '50days', '55days', '40days', '60days', '35day', '', '50days'],
    'Discount': [1000, 2300, 1000, 1200, 2500, 1300, 1400, 1600],
}
# Row labels r0..r7, one per entry of the lists above
row_labels = [f'r{i}' for i in range(8)]
df = pd.DataFrame(data=technologies, index=row_labels)
print(df)

    Courses      Fee Duration  Discount
r0    Spark  22000.0    30day      1000
r1  PySpark  25000.0   50days      2300
r2   Hadoop  23000.0   55days      1000
r3   Python  24000.0   40days      1200
r4   Pandas      NaN   60days      2500
r5     None  25000.0    35day      1300
r6    Spark  25000.0               1400
r7   Python  22000.0   50days      1600


##1 Describe DataFrame

---

- **describe() – calculates the count, mean, std, min, max, and percentiles (25%, 50%, 75%) of each numeric column of a pandas DataFrame.**

In [0]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns

df.describe()

Unnamed: 0,Fee,Discount
count,7.0,8.0
mean,23714.285714,1537.5
std,1380.131119,570.557372
min,22000.0,1000.0
25%,22500.0,1150.0
50%,24000.0,1350.0
75%,25000.0,1775.0
max,25000.0,2500.0


##2 Filter Rows from DataFrame

---

###query()/apply()/loc[] – 
**These are used to query a pandas DataFrame. You can also chain operators while filtering rows.**


---


- pandas.DataFrame.filter() – To filter rows by index and columns by name.
- pandas.DataFrame.loc[] – To select rows by index label and columns by name.
- pandas.DataFrame.iloc[] – To select rows and columns by integer position.
- pandas.DataFrame.apply() – To custom select using lambda function.

In [0]:
# DataFrame.query(): keep only the rows whose Courses value is exactly 'Spark'.
# query() returns a new frame, so df itself is left untouched.

df1 = df.query("Courses == 'Spark'")
df1

Unnamed: 0,Courses,Fee,Duration,Discount
r0,Spark,22000.0,30day,1000
r6,Spark,25000.0,,1400


In [0]:
# Rows whose Courses value is anything other than 'Spark'
not_spark_expr = "Courses != 'Spark'"
df.query(not_spark_expr)


Unnamed: 0,Courses,Fee,Duration,Discount
r1,PySpark,25000.0,50days,2300
r2,Hadoop,23000.0,55days,1000
r3,Python,24000.0,40days,1200
r4,Pandas,,60days,2500
r5,,25000.0,35day,1300
r7,Python,22000.0,50days,1600


In [0]:
# Rows whose Courses value is one of the listed names
df.query(expr="Courses in ('Spark', 'PySpark')")

Unnamed: 0,Courses,Fee,Duration,Discount
r0,Spark,22000.0,30day,1000
r1,PySpark,25000.0,50days,2300
r6,Spark,25000.0,,1400


In [0]:
# Rows with a Fee from 23000 to 24000, both bounds included
fee_band_expr = "Fee >= 23000 and Fee <= 24000"
df.query(fee_band_expr)

Unnamed: 0,Courses,Fee,Duration,Discount
r2,Hadoop,23000.0,55days,1000
r3,Python,24000.0,40days,1200


In [0]:
# Boolean-mask selection with DataFrame.loc[]
# General form: df.loc[df['Courses'] == value]

is_not_spark = df['Courses'] != 'Spark'
df.loc[is_not_spark]

Unnamed: 0,Courses,Fee,Duration,Discount
r1,PySpark,25000.0,50days,2300
r2,Hadoop,23000.0,55days,1000
r3,Python,24000.0,40days,1200
r4,Pandas,,60days,2500
r5,,25000.0,35day,1300
r7,Python,22000.0,50days,1600


In [0]:
# Keep rows whose Courses value appears in a list
# General form: df.loc[df['Courses'].isin(values)]

wanted_courses = ['Spark', 'Python']
df.loc[df['Courses'].isin(wanted_courses)]

Unnamed: 0,Courses,Fee,Duration,Discount
r0,Spark,22000.0,30day,1000
r3,Python,24000.0,40days,1200
r6,Spark,25000.0,,1400
r7,Python,22000.0,50days,1600


In [0]:
# Keep rows whose Courses value does NOT appear in a list (~ negates the mask)
# General form: df.loc[~df['Courses'].isin(values)]

excluded_courses = ['Spark', 'Python']
is_excluded = df['Courses'].isin(excluded_courses)
df.loc[~is_excluded]

Unnamed: 0,Courses,Fee,Duration,Discount
r1,PySpark,25000.0,50days,2300
r2,Hadoop,23000.0,55days,1000
r4,Pandas,,60days,2500
r5,,25000.0,35day,1300


In [0]:
# Rows with a Discount from 1000 to 2000; between() includes both bounds by default
df.loc[df['Discount'].between(1000, 2000)]

Unnamed: 0,Courses,Fee,Duration,Discount
r0,Spark,22000.0,30day,1000
r2,Hadoop,23000.0,55days,1000
r3,Python,24000.0,40days,1200
r5,,25000.0,35day,1300
r6,Spark,25000.0,,1400
r7,Python,22000.0,50days,1600


In [0]:
# Combine two column conditions with & (element-wise AND)
discount_ok = df['Discount'] >= 1200
fee_ok = df['Fee'] >= 23000
df.loc[discount_ok & fee_ok]

Unnamed: 0,Courses,Fee,Duration,Discount
r1,PySpark,25000.0,50days,2300
r3,Python,24000.0,40days,1200
r5,,25000.0,35day,1300
r6,Spark,25000.0,,1400


In [0]:
# Using apply()
# With the default axis=0, apply() hands the lambda one column (a Series) at a time.
# Each column is indexed with the same boolean mask, so the result keeps only the
# rows whose Courses value is 'Spark' or 'PySpark'.

df.apply(lambda col: col[df['Courses'].isin(['Spark', 'PySpark'])])

Unnamed: 0,Courses,Fee,Duration,Discount
r0,Spark,22000.0,30day,1000
r1,PySpark,25000.0,50days,2300
r6,Spark,25000.0,,1400


In [0]:
# Other ways to filter: plain [] indexing with a boolean mask
is_spark = df['Courses'] == 'Spark'
df[is_spark]

Unnamed: 0,Courses,Fee,Duration,Discount
r0,Spark,22000.0,30day,1000
r6,Spark,25000.0,,1400


In [0]:
# Substring match restricted to the first three rows, none of which has a missing
# Courses value (the None entry sits in row r5, outside this slice).
first_three = df[:3]
first_three[first_three['Courses'].str.contains("Spark")]

Unnamed: 0,Courses,Fee,Duration,Discount
r0,Spark,22000.0,30day,1000
r1,PySpark,25000.0,50days,2300


In [0]:
# Substring match over the whole frame; na=False makes missing Courses values count as "no match"
has_spark = df['Courses'].str.contains("Spark", na=False)
df[has_spark]

Unnamed: 0,Courses,Fee,Duration,Discount
r0,Spark,22000.0,30day,1000
r1,PySpark,25000.0,50days,2300
r6,Spark,25000.0,,1400


In [0]:
# Case-insensitive substring match: lower-case the values first, then search for "spark"
courses_lower = df['Courses'].str.lower()
df[courses_lower.str.contains("spark", na=False)]


Unnamed: 0,Courses,Fee,Duration,Discount
r0,Spark,22000.0,30day,1000
r1,PySpark,25000.0,50days,2300
r6,Spark,25000.0,,1400


In [0]:
# Rows whose Courses value begins with "P"; missing values count as "no match"
starts_with_p = df['Courses'].str.startswith("P", na=False)
df[starts_with_p]


Unnamed: 0,Courses,Fee,Duration,Discount
r1,PySpark,25000.0,50days,2300
r3,Python,24000.0,40days,1200
r4,Pandas,,60days,2500
r7,Python,22000.0,50days,1600


##3 Insert Rows & Columns to DataFrame

---

- **insert()/assign()** – Adds a new column to the pandas DataFrame

**By using assign() & insert() methods you can add one or multiple columns to the pandas DataFrame.**

In [0]:
# Re-create df from the `technologies` dict and `row_labels` for the examples in this section
df = pd.DataFrame(data=technologies, index=row_labels)

In [0]:
# assign() returns a new frame with an extra 'TutorsAssigned' column; df is not modified

tutors = [
    'William', 'Henry', 'Michael', 'John',
    'Messi', 'Ramamna', 'Kumar', 'Vasu',
]

df2 = df.assign(**{'TutorsAssigned': tutors})
df2

Unnamed: 0,Courses,Fee,Duration,Discount,TutorsAssigned
r0,Spark,22000.0,30day,1000,William
r1,PySpark,25000.0,50days,2300,Henry
r2,Hadoop,23000.0,55days,1000,Michael
r3,Python,24000.0,40days,1200,John
r4,Pandas,,60days,2500,Messi
r5,,25000.0,35day,1300,Ramamna
r6,Spark,25000.0,,1400,Kumar
r7,Python,22000.0,50days,1600,Vasu


In [0]:
# Derive a new column from existing ones: Fee * Discount / 100.
# NOTE(review): despite the column name, this is not Discount expressed as a
# percentage of Fee (that would be Discount / Fee * 100) — confirm the intended formula.
df2 = df.assign(Discount_Percent=lambda frame: frame['Fee'] * frame['Discount'] / 100)
df2

Unnamed: 0,Courses,Fee,Duration,Discount,Discount_Percent
r0,Spark,22000.0,30day,1000,220000.0
r1,PySpark,25000.0,50days,2300,575000.0
r2,Hadoop,23000.0,55days,1000,230000.0
r3,Python,24000.0,40days,1200,288000.0
r4,Pandas,,60days,2500,
r5,,25000.0,35day,1300,325000.0
r6,Spark,25000.0,,1400,350000.0
r7,Python,22000.0,50days,1600,352000.0


In [0]:
# Another way to add a column: direct item assignment.
# Unlike assign(), this modifies df itself instead of returning a new frame.
df['TutorsAssigned'] = tutors
df

Unnamed: 0,Courses,Fee,Duration,Discount,TutorsAssigned
r0,Spark,22000.0,30day,1000,William
r1,PySpark,25000.0,50days,2300,Henry
r2,Hadoop,23000.0,55days,1000,Michael
r3,Python,24000.0,40days,1200,John
r4,Pandas,,60days,2500,Messi
r5,,25000.0,35day,1300,Ramamna
r6,Spark,25000.0,,1400,Kumar
r7,Python,22000.0,50days,1600,Vasu


In [0]:
# Insert a column at position 0 (the front of the frame).
# allow_duplicates=True is required here because df already has a 'TutorsAssigned'
# column from the item assignment in the previous cell.
df.insert(loc=0, column='TutorsAssigned', value=tutors, allow_duplicates=True)

In [0]:
# Display df: it now carries 'TutorsAssigned' twice, once at the front and once at the end
df

Unnamed: 0,TutorsAssigned,Courses,Fee,Duration,Discount,TutorsAssigned.1
r0,William,Spark,22000.0,30day,1000,William
r1,Henry,PySpark,25000.0,50days,2300,Henry
r2,Michael,Hadoop,23000.0,55days,1000,Michael
r3,John,Python,24000.0,40days,1200,John
r4,Messi,Pandas,,60days,2500,Messi
r5,Ramamna,,25000.0,35day,1300,Ramamna
r6,Kumar,Spark,25000.0,,1400,Kumar
r7,Vasu,Python,22000.0,50days,1600,Vasu


##4 Rename DataFrame Columns

- **rename()** – Renames pandas DataFrame columns

**The pandas DataFrame.rename() method changes column labels: a single column, several columns, a column chosen by index, or all columns of the DataFrame.**

In [0]:
# Re-create df from the original data, discarding the 'TutorsAssigned' columns added above
df = pd.DataFrame(data=technologies, index=row_labels)

In [0]:
# Replace every column label at once by assigning a new list to df.columns.
# The old labels are printed first for comparison.
print(f'Old Columns : {df.columns}')

df.columns = list('ABCD')

Old Columns : Index(['Courses', 'Fee', 'Duration', 'Discount'], dtype='object')


In [0]:
# Display df with its new column labels A, B, C, D
df

Unnamed: 0,A,B,C,D
r0,Spark,22000.0,30day,1000
r1,PySpark,25000.0,50days,2300
r2,Hadoop,23000.0,55days,1000
r3,Python,24000.0,40days,1200
r4,Pandas,,60days,2500
r5,,25000.0,35day,1300
r6,Spark,25000.0,,1400
r7,Python,22000.0,50days,1600


In [0]:
# Change a column name by position; index 2 is the 3rd column.
# An Index is immutable, so build a new list of labels and assign it back instead of
# writing into `df.columns.values`, which edits the Index's backing array directly.
new_labels = df.columns.tolist()
new_labels[2] = 'E'
df.columns = new_labels

In [0]:
# Display df: the third column label now reads 'E'
df

Unnamed: 0,A,B,E,D
r0,Spark,22000.0,30day,1000
r1,PySpark,25000.0,50days,2300
r2,Hadoop,23000.0,55days,1000
r3,Python,24000.0,40days,1200
r4,Pandas,,60days,2500
r5,,25000.0,35day,1300
r6,Spark,25000.0,,1400
r7,Python,22000.0,50days,1600


In [0]:
# rename() with a mapping plus axis=1: returns a new frame with A -> a and B -> b.
# df itself keeps its labels.

lowercase_map = {'A': 'a', 'B': 'b'}
df2 = df.rename(lowercase_map, axis=1)
df2

Unnamed: 0,a,b,E,D
r0,Spark,22000.0,30day,1000
r1,PySpark,25000.0,50days,2300
r2,Hadoop,23000.0,55days,1000
r3,Python,24000.0,40days,1200
r4,Pandas,,60days,2500
r5,,25000.0,35day,1300
r6,Spark,25000.0,,1400
r7,Python,22000.0,50days,1600


In [0]:
# Same call style with axis='columns', undoing the lower-casing done in the previous cell.
# Applied to df2: df has no 'a' or 'b' column, so those mapping entries would be
# silently ignored if the rename were run on df.
df2 = df2.rename({'a':'A', 'b':'B', 'E':'C'}, axis='columns')
df2

Unnamed: 0,A,B,C,D
r0,Spark,22000.0,30day,1000
r1,PySpark,25000.0,50days,2300
r2,Hadoop,23000.0,55days,1000
r3,Python,24000.0,40days,1200
r4,Pandas,,60days,2500
r5,,25000.0,35day,1300
r6,Spark,25000.0,,1400
r7,Python,22000.0,50days,1600


In [0]:
# Same rename expressed through the columns= keyword (no axis argument needed)
df2 = df.rename(columns=dict(A='a', B='b'))
df2

Unnamed: 0,a,b,E,D
r0,Spark,22000.0,30day,1000
r1,PySpark,25000.0,50days,2300
r2,Hadoop,23000.0,55days,1000
r3,Python,24000.0,40days,1200
r4,Pandas,,60days,2500
r5,,25000.0,35day,1300
r6,Spark,25000.0,,1400
r7,Python,22000.0,50days,1600


In [0]:
# df still has the labels A, B, E, D: the rename() calls above returned new frames
df

Unnamed: 0,A,B,E,D
r0,Spark,22000.0,30day,1000
r1,PySpark,25000.0,50days,2300
r2,Hadoop,23000.0,55days,1000
r3,Python,24000.0,40days,1200
r4,Pandas,,60days,2500
r5,,25000.0,35day,1300
r6,Spark,25000.0,,1400
r7,Python,22000.0,50days,1600


In [0]:
# Rename columns on df itself: with inplace=True nothing is returned and df is modified
df.rename(columns=dict(A='a', B='b', E='c'), inplace=True)

In [0]:
# Display df: the in-place rename changed its labels to a, b, c, D
df

Unnamed: 0,a,b,c,D
r0,Spark,22000.0,30day,1000
r1,PySpark,25000.0,50days,2300
r2,Hadoop,23000.0,55days,1000
r3,Python,24000.0,40days,1200
r4,Pandas,,60days,2500
r5,,25000.0,35day,1300
r6,Spark,25000.0,,1400
r7,Python,22000.0,50days,1600


In [0]:
# Re-create df with its original column labels (Courses, Fee, Duration, Discount)
df = pd.DataFrame(data=technologies, index=row_labels)

In [0]:
# Rename with a function: it is called on every column label.
# Here it strips the first character of each label, modifying df in place.
df.rename(columns=lambda label: label[1:], inplace=True)

In [0]:
# Display df: every column label has lost its first character
df

Unnamed: 0,ourses,ee,uration,iscount
r0,Spark,22000.0,30day,1000
r1,PySpark,25000.0,50days,2300
r2,Hadoop,23000.0,55days,1000
r3,Python,24000.0,40days,1200
r4,Pandas,,60days,2500
r5,,25000.0,35day,1300
r6,Spark,25000.0,,1400
r7,Python,22000.0,50days,1600


##5 Drop DataFrame Rows and Columns


---


- **drop()** – drop method is used to drop rows and columns

**Below are some examples. To understand them better, go through dropping rows from a pandas DataFrame with examples. Learning to drop rows isn’t complete without also learning how to drop rows by condition.**

In [0]:
# Re-create df from the original data before demonstrating row drops
df = pd.DataFrame(data=technologies, index=row_labels)

In [0]:
# Display the rebuilt frame before any rows are dropped
df

Unnamed: 0,Courses,Fee,Duration,Discount
r0,Spark,22000.0,30day,1000
r1,PySpark,25000.0,50days,2300
r2,Hadoop,23000.0,55days,1000
r3,Python,24000.0,40days,1200
r4,Pandas,,60days,2500
r5,,25000.0,35day,1300
r6,Spark,25000.0,,1400
r7,Python,22000.0,50days,1600


In [0]:
# Drop rows by their index labels; drop() returns a new frame and leaves df intact
df1 = df.drop(index=['r1', 'r2'])

In [0]:
# Display df1: rows r1 and r2 are gone
df1

Unnamed: 0,Courses,Fee,Duration,Discount
r0,Spark,22000.0,30day,1000
r3,Python,24000.0,40days,1200
r4,Pandas,,60days,2500
r5,,25000.0,35day,1300
r6,Spark,25000.0,,1400
r7,Python,22000.0,50days,1600


In [0]:
# Delete rows by position: translate positions 1 and 3 into labels via df.index, then drop
labels_at_positions = df.index[[1, 3]]
df1 = df.drop(labels_at_positions)

In [0]:
# Display df1: the rows at positions 1 and 3 (r1 and r3) are gone
df1

Unnamed: 0,Courses,Fee,Duration,Discount
r0,Spark,22000.0,30day,1000
r2,Hadoop,23000.0,55days,1000
r4,Pandas,,60days,2500
r5,,25000.0,35day,1300
r6,Spark,25000.0,,1400
r7,Python,22000.0,50days,1600


In [0]:
# Delete a range of rows: everything from position 2 onwards
trailing_labels = df.index[2:]
df1 = df.drop(labels=trailing_labels, axis=0)

In [0]:
# Display df1: only the first two rows (r0 and r1) remain
df1

Unnamed: 0,Courses,Fee,Duration,Discount
r0,Spark,22000.0,30day,1000
r1,PySpark,25000.0,50days,2300


In [0]:
# When rows have the default integer index:
# replace the r0..r7 labels with 0..7 (drop=True discards the old labels)
df1 = df.reset_index(drop=True)
df1

Unnamed: 0,Courses,Fee,Duration,Discount
0,Spark,22000.0,30day,1000
1,PySpark,25000.0,50days,2300
2,Hadoop,23000.0,55days,1000
3,Python,24000.0,40days,1200
4,Pandas,,60days,2500
5,,25000.0,35day,1300
6,Spark,25000.0,,1400
7,Python,22000.0,50days,1600


In [0]:
# Drop the row labelled 0 from the integer-indexed frame.
# Built from df rather than from the previous df1, so re-running this cell does not
# raise KeyError once label 0 has already been removed.
df1 = df.reset_index(drop=True).drop(index=0)
df1

Unnamed: 0,Courses,Fee,Duration,Discount
1,PySpark,25000.0,50days,2300
2,Hadoop,23000.0,55days,1000
3,Python,24000.0,40days,1200
4,Pandas,,60days,2500
5,,25000.0,35day,1300
6,Spark,25000.0,,1400
7,Python,22000.0,50days,1600


In [0]:
# Start again from a fresh integer-indexed copy, then drop the rows labelled 0 and 3
df1 = df.reset_index(drop=True)
df2 = df1.drop(index=[0, 3])
df2

Unnamed: 0,Courses,Fee,Duration,Discount
1,PySpark,25000.0,50days,2300
2,Hadoop,23000.0,55days,1000
4,Pandas,,60days,2500
5,,25000.0,35day,1300
6,Spark,25000.0,,1400
7,Python,22000.0,50days,1600


In [0]:
# Drop a run of integer labels: range(2) covers labels 0 and 1
df2 = df1.drop(index=range(2))
df2

Unnamed: 0,Courses,Fee,Duration,Discount
2,Hadoop,23000.0,55days,1000
3,Python,24000.0,40days,1200
4,Pandas,,60days,2500
5,,25000.0,35day,1300
6,Spark,25000.0,,1400
7,Python,22000.0,50days,1600


##Now let’s see how to drop columns from a pandas DataFrame with examples. To drop columns, pass either axis=1 or the columns parameter to the drop() method.

In [0]:
# Re-create df from the original data before demonstrating column drops
df = pd.DataFrame(data=technologies, index=row_labels)

In [0]:
# Delete a column by name: a positional label list plus the column axis
df2 = df.drop(['Fee'], axis='columns')
df2

Unnamed: 0,Courses,Duration,Discount
r0,Spark,30day,1000
r1,PySpark,50days,2300
r2,Hadoop,55days,1000
r3,Python,40days,1200
r4,Pandas,60days,2500
r5,,35day,1300
r6,Spark,,1400
r7,Python,50days,1600


In [0]:
# Same drop, spelling out the labels= keyword together with the axis
fee_only = ['Fee']
df2 = df.drop(labels=fee_only, axis=1)
df2

Unnamed: 0,Courses,Duration,Discount
r0,Spark,30day,1000
r1,PySpark,50days,2300
r2,Hadoop,55days,1000
r3,Python,40days,1200
r4,Pandas,60days,2500
r5,,35day,1300
r6,Spark,,1400
r7,Python,50days,1600


In [0]:
# Same drop via the columns= keyword, which needs no axis argument
cols_to_remove = ["Fee"]
df2 = df.drop(columns=cols_to_remove)
df2

Unnamed: 0,Courses,Duration,Discount
r0,Spark,30day,1000
r1,PySpark,50days,2300
r2,Hadoop,55days,1000
r3,Python,40days,1200
r4,Pandas,60days,2500
r5,,35day,1300
r6,Spark,,1400
r7,Python,50days,1600


In [0]:
# Drop a column by position: look up its label through df.columns, then drop by label.
# `columns=` already targets the column axis, so the redundant `axis=1` is omitted.
df2 = df.drop(columns=df.columns[[1]])
df2

Unnamed: 0,Courses,Duration,Discount
r0,Spark,30day,1000
r1,PySpark,50days,2300
r2,Hadoop,55days,1000
r3,Python,40days,1200
r4,Pandas,60days,2500
r5,,35day,1300
r6,Spark,,1400
r7,Python,50days,1600


In [0]:
# Other ways to drop columns: take the labels from a label slice ('Courses' through 'Fee')
# and drop them from a copy, in place.
df1 = df.copy()
sliced_labels = df.loc[:, 'Courses':'Fee'].columns
df1.drop(columns=sliced_labels, inplace=True)

In [0]:
# Display df1: only Duration and Discount remain
df1

Unnamed: 0,Duration,Discount
r0,30day,1000
r1,50days,2300
r2,55days,1000
r3,40days,1200
r4,60days,2500
r5,35day,1300
r6,,1400
r7,50days,1600


In [0]:
# Drop columns chosen by position (here position 1, i.e. 'Fee') from a copy of df.
# The column labels are passed to drop(), not the sliced DataFrame itself, which is
# consistent with the label-slice example that passes `.columns`.
df1 = df.copy()
df1.drop(columns=df.columns[1:2], inplace=True)
df1

Unnamed: 0,Courses,Duration,Discount
r0,Spark,30day,1000
r1,PySpark,50days,2300
r2,Hadoop,55days,1000
r3,Python,40days,1200
r4,Pandas,60days,2500
r5,,35day,1300
r6,Spark,,1400
r7,Python,50days,1600


In [0]:
# Column labels covered by the label slice 'Courses':'Fee' (both endpoints are included)
df.loc[:,'Courses':'Fee'].columns

Out[334]: Index(['Courses', 'Fee'], dtype='object')