# DataFrame Data Structure

In [None]:
# this is the primary data structure we will work with in data analysis and cleaning tasks
# a dataframe has multiple rows and columns (2D data structure)

In [4]:
import pandas as pd

In [73]:
# Example: three student records, each stored in its own Series.
# The Series labels ('Name', 'Class', 'Score') become the DataFrame's column names.
r1 = pd.Series(['Yash', 'Physics', 85], index=['Name', 'Class', 'Score'])
r2 = pd.Series(['Hrisheka', 'Biology', 90], index=['Name', 'Class', 'Score'])
r3 = pd.Series(['Rahul', 'Mathematics', 75], index=['Name', 'Class', 'Score'])

# Stack the Series into a DataFrame: every Series becomes one row, and the
# `index` argument supplies the row labels (note that 'school1' appears twice).
df = pd.DataFrame(data=[r1, r2, r3], index=['school1', 'school2', 'school1'])
df.head()

Name        Yash
Class    Physics
Score         85
dtype: object


Unnamed: 0,Name,Class,Score
school1,Yash,Physics,85
school2,Hrisheka,Biology,90
school1,Rahul,Mathematics,75


In [11]:
# The same table can be built from a list of dictionaries:
# each dictionary is one row, and its keys become the column names.
students = [
    dict(Name='Yash', Class='Physics', Score=85),
    dict(Name='Hrisheka', Class='Biology', Score=90),
    dict(Name='Rahul', Class='Mathematics', Score=75),
]

# The row labels are again supplied through `index`.
df = pd.DataFrame(data=students, index=['school1', 'school2', 'school1'])
df.head()

Unnamed: 0,Name,Class,Score
school1,Yash,Physics,85
school2,Hrisheka,Biology,90
school1,Rahul,Mathematics,75


In [15]:
# Extracting rows from a DataFrame by index label with .loc
# (.iloc, the position-based counterpart, is not used in this cell).
print(df.loc['school2']) # 'school2' labels exactly one row, so that row comes back as a Series
df.loc['school1'] # 'school1' labels two rows, so the result is a DataFrame holding both of them

Name     Hrisheka
Class     Biology
Score          90
Name: school2, dtype: object


Unnamed: 0,Name,Class,Score
school1,Yash,Physics,85
school1,Rahul,Mathematics,75


In [32]:
# .loc also accepts two arguments: a row label first, then a column label.
# Because 'school1' labels two rows here, the result is a Series containing both
# names rather than a single scalar value.
print(df.loc['school1', 'Name'])

school1     Yash
school1    Rahul
Name: Name, dtype: object


In [19]:
# One way to select a single column with .loc's row-label lookup is to transpose
# the frame first: df.T swaps rows and columns, so every column name becomes a
# row label that .loc can find. (df.T returns a new, transposed frame; df itself
# is not modified.)
print(df.T.loc['Name'])

school1        Yash
school2    Hrisheka
school1       Rahul
Name: Name, dtype: object


In [24]:
# Using the indexing operator directly on the DataFrame selects a column by name.
print(df['Name'])
# Note: df.loc['Name'] raises a KeyError, because the first argument of .loc is
# looked up among the row labels. To select a column through .loc, pass a row
# selector first, e.g. df.loc[:, 'Name'].

school1        Yash
school2    Hrisheka
school1       Rahul
Name: Name, dtype: object

In [35]:
# Indexing operations can be chained: .loc['school1'] first selects the matching
# rows, then ['Name'] selects a column from that intermediate result.
# This shows the same values as the single call df.loc['school1', 'Name'].
# NOTE(review): chaining is fine for reading data, but assignments should use the
# single-call form df.loc[rows, cols] - a chained assignment may act on a temporary copy.
df.loc['school1']['Name']

school1     Yash
school1    Rahul
Name: Name, dtype: object

In [34]:
# We can also retrieve several columns at once by passing a list of column names.
# Only the last expression of a cell is displayed automatically, so the first
# selection is printed explicitly; otherwise its result would be silently discarded.
print(df.loc['school1', ['Name', 'Class']]) # rows labelled 'school1', columns 'Name' and 'Class'
df.loc[:, ['Name', 'Score']] # ':' selects all rows

Unnamed: 0,Name,Score
school1,Yash,85
school2,Hrisheka,90
school1,Rahul,75


In [38]:
# drop() removes rows by index label and returns a new, altered DataFrame;
# the DataFrame it was called on is left unchanged.
print(df.drop('school1')) # both rows labelled 'school1' are removed from the returned frame
print(df) # the original df still contains all of its rows

             Name    Class  Score
school2  Hrisheka  Biology     90


Unnamed: 0,Name,Class,Score
school1,Yash,Physics,85
school2,Hrisheka,Biology,90
school1,Rahul,Mathematics,75


In [51]:
# drop() has two optional parameters worth knowing - inplace and axis, e.g.:
df2 = df.copy() # work on a copy so that df itself is not modified
df2.drop('Class',inplace = True, axis = 1) # inplace=True alters df2 itself instead of returning a new frame ; axis=1 means columns (the default, 0, means rows)
df2

Unnamed: 0,Name,Score
school1,Yash,85
school2,Hrisheka,90
school1,Rahul,75


In [52]:
# A column can also be removed with the del statement applied to the indexing operator.
del df2['Name']
df2 # del modifies df2 itself: the 'Name' column is permanently gone from it

Unnamed: 0,Score
school1,85
school2,90
school1,75


In [72]:
# Assigning to a column name that does not exist yet adds a new column to the DataFrame.
# A single value such as None is applied to every row of the new column.
df['ClassRanking'] = None
df

Unnamed: 0,Name,Class,Score,ClassRanking
school1,Yash,Physics,85.0,
school2,Hrisheka,Biology,90.0,
school1,Rahul,Mathematics,75.0,


## DataFrame Indexing and Loading

In [74]:
# Loading data from a CSV file into a DataFrame.

# Read the CSV file at the given relative path; this rebinds the name df to the newly loaded data.
df = pd.read_csv('datasets/admissionpredict.csv')
df.head() # no index column was specified, so pandas assigned its own index (0, 1, 2, 3, 4, ...)

Unnamed: 0,Serial No.,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
0,1,337,118,4,4.5,4.5,9.65,1,0.92
1,2,324,107,4,4.0,4.5,8.87,1,0.76
2,3,316,104,3,3.0,3.5,8.0,1,0.72
3,4,322,110,3,3.5,2.5,8.67,1,0.8
4,5,314,103,2,2.0,3.0,8.21,0,0.65


In [76]:
# A column from the file can serve as the index instead of pandas' default labels:
# index_col=0 makes the first column of the file (the serial number) the row index.
df = pd.read_csv('datasets/admissionpredict.csv', index_col=0)
df.head()

Unnamed: 0_level_0,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
Serial No.,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,337,118,4,4.5,4.5,9.65,1,0.92
2,324,107,4,4.0,4.5,8.87,1,0.76
3,316,104,3,3.0,3.5,8.0,1,0.72
4,322,110,3,3.5,2.5,8.67,1,0.8
5,314,103,2,2.0,3.0,8.21,0,0.65


In [85]:
# Columns are renamed by passing an {old name: new name} dictionary to rename().
# Only the columns whose names should change need to be listed; rename() returns
# a new DataFrame (bound to new_df here) and leaves df untouched.
# NOTE(review): 'Statemenf' looks like a typo for 'Statement'; the label is kept
# as-is because it is the column name the rest of the notebook displays.
new_df = df.rename(columns = {
    'SOP' : 'Statemenf of Purpose',
    'LOR' : 'Letter of Recommendation',
})
new_df.head()

Unnamed: 0_level_0,GRE Score,TOEFL Score,University Rating,Statemenf of Purpose,LOR,CGPA,Research,Chance of Admit
Serial No.,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,337,118,4,4.5,4.5,9.65,1,0.92
2,324,107,4,4.0,4.5,8.87,1,0.76
3,316,104,3,3.0,3.5,8.0,1,0.72
4,322,110,3,3.5,2.5,8.67,1,0.8
5,314,103,2,2.0,3.0,8.21,0,0.65


In [87]:
# Only the SOP column was renamed above: rename() silently ignores keys that do not
# match an existing column name exactly, so every key must be spelled precisely.
print(df.columns)

# The printed labels show that 'LOR ' has a trailing space, so we type it exactly as it is.
new_df = new_df.rename(columns = {'LOR ' : 'Letter of Recommendation'})
new_df.head()

Index(['GRE Score', 'TOEFL Score', 'University Rating', 'SOP', 'LOR ', 'CGPA',
       'Research', 'Chance of Admit '],
      dtype='object')


Unnamed: 0_level_0,GRE Score,TOEFL Score,University Rating,Statemenf of Purpose,Letter of Recommendation,CGPA,Research,Chance of Admit
Serial No.,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,337,118,4,4.5,4.5,9.65,1,0.92
2,324,107,4,4.0,4.5,8.87,1,0.76
3,316,104,3,3.0,3.5,8.0,1,0.72
4,322,110,3,3.5,2.5,8.67,1,0.8
5,314,103,2,2.0,3.0,8.21,0,0.65


In [91]:
# Fixing labels one at a time does not scale, so instead strip the whitespace from
# both ends of every label at once: rename() also accepts a function, and passing it
# as `columns` applies it to the column labels rather than to the index.
new_df = new_df.rename(columns = str.strip)
print(new_df.columns) # 'Chance of Admit' no longer carries trailing whitespace
new_df.head()

Index(['GRE Score', 'TOEFL Score', 'University Rating', 'Statemenf of Purpose',
       'Letter of Recommendation', 'CGPA', 'Research', 'Chance of Admit'],
      dtype='object')


Unnamed: 0_level_0,GRE Score,TOEFL Score,University Rating,Statemenf of Purpose,Letter of Recommendation,CGPA,Research,Chance of Admit
Serial No.,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,337,118,4,4.5,4.5,9.65,1,0.92
2,324,107,4,4.0,4.5,8.87,1,0.76
3,316,104,3,3.0,3.5,8.0,1,0.72
4,322,110,3,3.5,2.5,8.67,1,0.8
5,314,103,2,2.0,3.0,8.21,0,0.65


In [94]:
# Example - normalise every column name: convert it to lowercase and trim the
# whitespace at its ends.
cols = [label.lower().strip() for label in df.columns] # list comprehension over the current labels
df.columns = cols # assigning to .columns relabels df itself
print(df.columns)
df.head()


Index(['gre score', 'toefl score', 'university rating', 'sop', 'lor', 'cgpa',
       'research', 'chance of admit'],
      dtype='object')


Unnamed: 0_level_0,gre score,toefl score,university rating,sop,lor,cgpa,research,chance of admit
Serial No.,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,337,118,4,4.5,4.5,9.65,1,0.92
2,324,107,4,4.0,4.5,8.87,1,0.76
3,316,104,3,3.0,3.5,8.0,1,0.72
4,322,110,3,3.5,2.5,8.67,1,0.8
5,314,103,2,2.0,3.0,8.21,0,0.65
