# Pandas

In [1]:
#import pandas module
import pandas as pd

**Creating a DataFrame**

In [90]:
# Build a DataFrame from a dictionary {}:
#   - each key becomes a column name
#   - each value is the list of entries for that column (one entry per row)

students = pd.DataFrame(
    data=dict(
        name=[
            'Harry Potter',
            'Ron Weasley',
            'Hermione Granger',
            'Luna Lovegood',
            'Susan Bones',
            'Draco Malfoy',
            'Neville Longbottom',
        ],
        house=[
            'Gryffindor',
            'Gryffindor',
            'Gryffindor',
            'Ravenclaw',
            'Hufflepuff',
            'Slytherin',
            'Gryffindor',
        ],
        age=[11, 11, 10, 11, 12, 12, 11],
        blood=[
            'Half-Blood',
            'Pure-Blood',
            'Muggle-born',
            'Pure-Blood',
            'Half-Blood',
            'Pure-Blood',
            'Pure-Blood',
        ],
    )
)

In [59]:
# Print the whole frame as plain text.
# NOTE(review): the saved output below lists only 5 rows and shows age 12 for
# Luna Lovegood, while `students` is defined in this notebook with 7 rows and
# age 11 for her. The execution counts are also out of order, so the output is
# stale — restart the kernel and run all cells to refresh it.
print(students)

               name       house  age        blood
0      Harry Potter  Gryffindor   11   Half-Blood
1       Ron Weasley  Gryffindor   11   Pure-Blood
2  Hermione Granger  Gryffindor   10  Muggle-born
3     Luna Lovegood   Ravenclaw   12   Pure-Blood
4       Susan Bones  Hufflepuff   12   Half-Blood


In [79]:
# Build a DataFrame from a list of lists []:
#   - every inner list is one row
#   - the `columns` argument names the columns, in order
# NOTE(review): 'Pomona Spout' and 'Filius Flitwik' look like misspellings of
# 'Pomona Sprout' and 'Filius Flitwick'; the values are kept unchanged here.

head_house = pd.DataFrame(
    columns=['Name', 'House', 'Subject', 'Gender'],
    data=[
        ['Minerva McGonagall', 'Gryffindor', 'Transfiguration', 'F'],
        ['Severus Snape', 'Slytherin', 'Potions', 'M'],
        ['Pomona Spout', 'Hufflepuff', 'Herbology', 'F'],
        ['Filius Flitwik', 'Ravenclaw', 'Charms', 'M'],
    ],
)

In [80]:
print(head_house)

                 Name       House          Subject Gender
0  Minerva McGonagall  Gryffindor  Transfiguration      F
1       Severus Snape   Slytherin          Potions      M
2        Pomona Spout  Hufflepuff        Herbology      F
3      Filius Flitwik   Ravenclaw           Charms      M


In [135]:
# Load a CSV file into a DataFrame.
# Source dataset: https://www.kaggle.com/nehatiwari03/harry-potter-fanfiction-data
# The full dataset was too large, so only the top 50 rows were extracted for this example.
# The path has no directory part, so the file is looked up in the current working directory.
# NOTE(review): the preview of `hp` shows an `Unnamed: 0` column, presumably an index
# that was saved into the file — confirm, and consider read_csv(..., index_col=0).
hp= pd.read_csv("hp_topfifty.csv")

In [136]:
hp.head(2)

Unnamed: 0.1,Unnamed: 0,Chapters,Favs,Follows,Published,Reviews,Updated,Words,author,characters,genre,language,rating,story_link,synopsis,title,published_mmyy,pairing
0,0,1,2.0,,12/31/2019,1.0,,6840,reviews,"Sirius B., Remus L., James P., Regulus B.",Angst/Hurt/Comfort,English,T,https://www.fanfiction.net/s/13466909/1/If-You...,Regulus and James aren't happy. They know they...,If You Change Your Mind,12-2019,
1,1,1,1.0,,12/31/2019,,,10962,JoyI9199,"Harry P., Draco M., Narcissa M., Charlie W.",Angst/Drama,English,M,https://www.fanfiction.net/s/13466894/1/Bloody...,When a plot from the Founder's age is revealed...,Bloody Ballgowns,12-2019,


**Select Columns and Rows of a DataFrame**

In [60]:
students

Unnamed: 0,name,house,age,blood
0,Harry Potter,Gryffindor,11,Half-Blood
1,Ron Weasley,Gryffindor,11,Pure-Blood
2,Hermione Granger,Gryffindor,10,Muggle-born
3,Luna Lovegood,Ravenclaw,12,Pure-Blood
4,Susan Bones,Hufflepuff,12,Half-Blood


In [61]:
# This is a DataFrame
type(students)

pandas.core.frame.DataFrame

In [62]:
# Select a single column of a DataFrame using attribute access.
# This works because 'name' is a valid Python identifier that does not clash with
# an existing DataFrame attribute; students['name'] is the equivalent bracket form.
name_col = students.name

In [63]:
print(name_col)

0        Harry Potter
1         Ron Weasley
2    Hermione Granger
3       Luna Lovegood
4         Susan Bones
Name: name, dtype: object


In [64]:
# A single selected column is a Series, not a DataFrame
type(name_col)

pandas.core.series.Series

In [65]:
# Selecting more than one column of DataFrame
student_n_age = students[['name','age']]

In [66]:
print(student_n_age)

               name  age
0      Harry Potter   11
1       Ron Weasley   11
2  Hermione Granger   10
3     Luna Lovegood   12
4       Susan Bones   12


In [67]:
# more than one column - Type is a DataFrame
type(student_n_age)

pandas.core.frame.DataFrame

In [68]:
# Select a single row of the DataFrame by integer position with iloc.
# Positions start at 0, so position 3 is the fourth row.
s4 = students.iloc[3]

In [69]:
# Starts at index 0, Luna is on Row 4 but index 3
s4

name     Luna Lovegood
house        Ravenclaw
age                 12
blood       Pure-Blood
Name: 3, dtype: object

In [70]:
# series type
type(s4)

pandas.core.series.Series

In [71]:
# Select more than one row with a slice.
# The stop position (3 here) is not included, so this returns positions 0, 1 and 2.
s_more = students.iloc[0:3]

In [72]:
s_more

Unnamed: 0,name,house,age,blood
0,Harry Potter,Gryffindor,11,Half-Blood
1,Ron Weasley,Gryffindor,11,Pure-Blood
2,Hermione Granger,Gryffindor,10,Muggle-born


In [73]:
# Dataframe type
type(s_more)

pandas.core.frame.DataFrame

In [74]:
# Omitting the start means "from the first row": positions 0, 1 and 2
students.iloc[:3]

Unnamed: 0,name,house,age,blood
0,Harry Potter,Gryffindor,11,Half-Blood
1,Ron Weasley,Gryffindor,11,Pure-Blood
2,Hermione Granger,Gryffindor,10,Muggle-born


In [75]:
# Omitting the stop means "through the last row": everything from position 3 onward
students.iloc[3:]

Unnamed: 0,name,house,age,blood
3,Luna Lovegood,Ravenclaw,12,Pure-Blood
4,Susan Bones,Hufflepuff,12,Half-Blood


In [76]:
# Negative positions count from the end: this is the last two rows
students.iloc[-2:]

Unnamed: 0,name,house,age,blood
3,Luna Lovegood,Ravenclaw,12,Pure-Blood
4,Susan Bones,Hufflepuff,12,Half-Blood


In [77]:
# Everything except the last two rows
students.iloc[:-2]

Unnamed: 0,name,house,age,blood
0,Harry Potter,Gryffindor,11,Half-Blood
1,Ron Weasley,Gryffindor,11,Pure-Blood
2,Hermione Granger,Gryffindor,10,Muggle-born


In [78]:
# From the fourth-from-last row up to, but not including, the last row
students.iloc[-4:-1]

Unnamed: 0,name,house,age,blood
1,Ron Weasley,Gryffindor,11,Pure-Blood
2,Hermione Granger,Gryffindor,10,Muggle-born
3,Luna Lovegood,Ravenclaw,12,Pure-Blood


In [82]:
# Select rows that match a condition.
# The comparison yields one True/False value per row; indexing with it keeps the True rows.
gryffindor = students[students.house == 'Gryffindor']

In [84]:
print(gryffindor)

               name       house  age        blood
0      Harry Potter  Gryffindor   11   Half-Blood
1       Ron Weasley  Gryffindor   11   Pure-Blood
2  Hermione Granger  Gryffindor   10  Muggle-born


In [87]:
# Same idea with != : keep the rows whose house is anything other than Gryffindor
not_gryffindor = students[students.house != 'Gryffindor']

In [105]:
not_gryffindor

Unnamed: 0,name,house,age,blood
3,Luna Lovegood,Ravenclaw,11,Pure-Blood
4,Susan Bones,Hufflepuff,12,Half-Blood
5,Draco Malfoy,Slytherin,12,Pure-Blood


In [106]:
# Combine conditions with & (and): a row is kept only when both conditions are True.
# Each condition needs its own parentheses because & binds more tightly than ==.
y_gryffindor = students[(students.house == 'Gryffindor') & (students.blood == 'Pure-Blood')]

In [107]:
y_gryffindor

Unnamed: 0,name,house,age,blood
1,Ron Weasley,Gryffindor,11,Pure-Blood
6,Neville Longbottom,Gryffindor,11,Pure-Blood


In [120]:
# Combine conditions with | (pipe), which means "or": a row is kept when at least
# one condition is True. Each condition needs its own parentheses, as with &.
y_g = students[(students.house == 'Gryffindor') | (students.age < 12)]

In [121]:
y_g

Unnamed: 0,name,house,age,blood
0,Harry Potter,Gryffindor,11,Half-Blood
1,Ron Weasley,Gryffindor,11,Pure-Blood
2,Hermione Granger,Gryffindor,10,Muggle-born
3,Luna Lovegood,Ravenclaw,11,Pure-Blood
6,Neville Longbottom,Gryffindor,11,Pure-Blood


In [110]:
# Select rows using isin: keeps the rows whose `blood` value is one of the listed values
students[students.blood.isin(['Pure-Blood','Half-Blood'])]

Unnamed: 0,name,house,age,blood
0,Harry Potter,Gryffindor,11,Half-Blood
1,Ron Weasley,Gryffindor,11,Pure-Blood
3,Luna Lovegood,Ravenclaw,11,Pure-Blood
4,Susan Bones,Hufflepuff,12,Half-Blood
5,Draco Malfoy,Slytherin,12,Pure-Blood
6,Neville Longbottom,Gryffindor,11,Pure-Blood


**Reset Index**

In [124]:
# The index is not consecutive because y_g holds only the selected rows,
# which keep the index labels they had in `students`
y_g

Unnamed: 0,name,house,age,blood
0,Harry Potter,Gryffindor,11,Half-Blood
1,Ron Weasley,Gryffindor,11,Pure-Blood
2,Hermione Granger,Gryffindor,10,Muggle-born
3,Luna Lovegood,Ravenclaw,11,Pure-Blood
6,Neville Longbottom,Gryffindor,11,Pure-Blood


In [123]:
# reset_index() returns a new frame with a fresh 0..n-1 index and keeps the old
# index as a regular column named 'index'. y_g itself is not changed.
y_g.reset_index()

Unnamed: 0,index,name,house,age,blood
0,0,Harry Potter,Gryffindor,11,Half-Blood
1,1,Ron Weasley,Gryffindor,11,Pure-Blood
2,2,Hermione Granger,Gryffindor,10,Muggle-born
3,3,Luna Lovegood,Ravenclaw,11,Pure-Blood
4,6,Neville Longbottom,Gryffindor,11,Pure-Blood


In [126]:
# The index of y_g did not change
y_g

Unnamed: 0,name,house,age,blood
0,Harry Potter,Gryffindor,11,Half-Blood
1,Ron Weasley,Gryffindor,11,Pure-Blood
2,Hermione Granger,Gryffindor,10,Muggle-born
3,Luna Lovegood,Ravenclaw,11,Pure-Blood
6,Neville Longbottom,Gryffindor,11,Pure-Blood


In [127]:
# drop=True discards the old index instead of keeping it as an 'index' column.
# This still returns a new frame; y_g itself is not changed.
y_g.reset_index(drop=True)

Unnamed: 0,name,house,age,blood
0,Harry Potter,Gryffindor,11,Half-Blood
1,Ron Weasley,Gryffindor,11,Pure-Blood
2,Hermione Granger,Gryffindor,10,Muggle-born
3,Luna Lovegood,Ravenclaw,11,Pure-Blood
4,Neville Longbottom,Gryffindor,11,Pure-Blood


In [128]:
y_g

Unnamed: 0,name,house,age,blood
0,Harry Potter,Gryffindor,11,Half-Blood
1,Ron Weasley,Gryffindor,11,Pure-Blood
2,Hermione Granger,Gryffindor,10,Muggle-born
3,Luna Lovegood,Ravenclaw,11,Pure-Blood
6,Neville Longbottom,Gryffindor,11,Pure-Blood


In [129]:
# inplace=True modifies y_g itself (and returns None) instead of returning a new frame.
# Only y_g is affected, not `students`. The equivalent without inplace is
# y_g = y_g.reset_index(drop=True).
y_g.reset_index(drop=True,inplace=True)

In [130]:
y_g

Unnamed: 0,name,house,age,blood
0,Harry Potter,Gryffindor,11,Half-Blood
1,Ron Weasley,Gryffindor,11,Pure-Blood
2,Hermione Granger,Gryffindor,10,Muggle-born
3,Luna Lovegood,Ravenclaw,11,Pure-Blood
4,Neville Longbottom,Gryffindor,11,Pure-Blood
