# Pandas DataFrames

## creating dataframes

In [1]:
# Dataframes are two dimensional objects with labeled rows and columns.
# Dataframes can be considered a group of labeled lists.
import pandas as pd

In [2]:
# Dataframes can be created in many ways: from pandas Series, from a list of dictionaries, or by importing a file (e.g. a csv).

In [3]:
# Build a dataframe out of pandas Series objects: every Series becomes one ROW,
# and the labels of the Series become the column names.

# First row: values given as a list, labels supplied through `index`.
series1 = pd.Series(data=["Vale", "Mate"], index=["nombre", "clase"])
# Second row: built from a dictionary, whose keys act as the labels.
series2 = pd.Series(dict(nombre="Diego", clase="idiomas"))

# Stack the two Series on top of each other to form the dataframe.
df = pd.DataFrame(data=[series1, series2])
df

Unnamed: 0,nombre,clase
0,Vale,Mate
1,Diego,idiomas


In [4]:
# Build a dataframe from a LIST of DICTIONARIES: each dictionary is one row
# and its keys become the columns.
# The indices (row labels) can be set manually, much like stacking n Series.
# The label "record1" appears twice; the later .loc example uses this to show
# what happens when several rows share a label.

dicts = [
    {"nombre": "Vale", "clase": "Mate"},
    {"nombre": "Diego", "clase": "Idiomas"},
    {"nombre": "Ale", "clase": "Historia"},
]

df = pd.DataFrame(data=dicts, index=["record1", "record2", "record1"])
df

Unnamed: 0,nombre,clase
record1,Vale,Mate
record2,Diego,Idiomas
record1,Ale,Historia


In [5]:
# A dataframe can be transposed (rows become columns and vice versa) with the .T attribute.
# This is NOT in place: it returns a new object and df itself is left unchanged.
df.T

Unnamed: 0,record1,record2,record1.1
nombre,Vale,Diego,Ale
clase,Mate,Idiomas,Historia


## Querying dataframes

In [6]:
# To retrieve a record [ROW] by its label we use the .loc attribute.
# It is indexed with square brackets (it is NOT called like a method).
# If several ROWS share the same label, the result is a dataframe containing all of them.
df.loc["record1"]

Unnamed: 0,nombre,clase
record1,Vale,Mate
record1,Ale,Historia


In [7]:
# To retrieve a column we use the indexing operator [] with the column name.
# Passing a ROW label to [] instead raises an error (KeyError); rows are selected with .loc.
# The result is a SERIES.
df["nombre"]

record1     Vale
record2    Diego
record1      Ale
Name: nombre, dtype: object

In [8]:
# .loc accepts a row selector AND a column selector at once: df.loc[rows, columns].
# To retrieve several columns we pass a list of column names.
df.loc["record1",["nombre","clase"]]

Unnamed: 0,nombre,clase
record1,Vale,Mate
record1,Ale,Historia


In [9]:
# The same result can be obtained by chaining: first select the rows with .loc,
# then index the resulting dataframe with the list of columns.
df.loc["record1"][["nombre","clase"]]

Unnamed: 0,nombre,clase
record1,Vale,Mate
record1,Ale,Historia


In [10]:
# The preferred method is a single .loc call, since it also supports slicing.
# Here ":" selects all the ROWS and ["nombre"] selects the nombre column.
# Because the column is given as a list, the result is a dataframe rather than a series.
df.loc[:,["nombre"]]

Unnamed: 0,nombre
record1,Vale
record2,Diego
record1,Ale


## Opening files via csv reader

In [11]:
# Load a csv file into a dataframe with pd.read_csv.
# The .head() method shows the first 5 rows by default.

df = pd.read_csv("datasets/Admission_Predict.csv")
df.head()

Unnamed: 0,Serial No.,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
0,1,337,118,4,4.5,4.5,9.65,1,0.92
1,2,324,107,4,4.0,4.5,8.87,1,0.76
2,3,316,104,3,3.0,3.5,8.0,1,0.72
3,4,322,110,3,3.5,2.5,8.67,1,0.8
4,5,314,103,2,2.0,3.0,8.21,0,0.65


In [12]:
# The index_col argument chooses which column of the file becomes the index.
# index_col=0 promotes the first column ("Serial No.") to INDEX.
df = pd.read_csv("datasets/Admission_Predict.csv",index_col=0)
df.head()

Unnamed: 0_level_0,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
Serial No.,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,337,118,4,4.5,4.5,9.65,1,0.92
2,324,107,4,4.0,4.5,8.87,1,0.76
3,316,104,3,3.0,3.5,8.0,1,0.72
4,322,110,3,3.5,2.5,8.67,1,0.8
5,314,103,2,2.0,3.0,8.21,0,0.65


In [13]:
# The column labels are available through the .columns attribute.
# The result is a pandas Index object (not a plain Python list).
# Note that 'LOR ' and 'Chance of Admit ' carry a trailing space in this file.
df.columns

Index(['GRE Score', 'TOEFL Score', 'University Rating', 'SOP', 'LOR ', 'CGPA',
       'Research', 'Chance of Admit '],
      dtype='object')

In [14]:
# The row labels (the index) are available through the .index attribute.
df.index

Int64Index([  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,
            ...
            391, 392, 393, 394, 395, 396, 397, 398, 399, 400],
           dtype='int64', name='Serial No.', length=400)

In [15]:
# A common operation is to clean column names.
# Assigning a LIST of new names to the .columns attribute renames every column at once.
# strip() removes leading/trailing whitespace (e.g. the trailing space in 'LOR ')
# and lower() converts the names to lowercase.
cols = df.columns
new_cols = [col.strip().lower() for col in cols]
df.columns = new_cols
df.head()

Unnamed: 0_level_0,gre score,toefl score,university rating,sop,lor,cgpa,research,chance of admit
Serial No.,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,337,118,4,4.5,4.5,9.65,1,0.92
2,324,107,4,4.0,4.5,8.87,1,0.76
3,316,104,3,3.0,3.5,8.0,1,0.72
4,322,110,3,3.5,2.5,8.67,1,0.8
5,314,103,2,2.0,3.0,8.21,0,0.65


In [16]:
# Individual columns can also be renamed with the rename method.
# The columns argument takes a dictionary: the key is the old name and the value is the new one.
# rename returns a new dataframe by default; inplace=True modifies df itself.
df.rename(columns={"sop":"statement of purpose","lor":"letter of recommendation"},inplace=True)
df.head()

Unnamed: 0_level_0,gre score,toefl score,university rating,statement of purpose,letter of recommendation,cgpa,research,chance of admit
Serial No.,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,337,118,4,4.5,4.5,9.65,1,0.92
2,324,107,4,4.0,4.5,8.87,1,0.76
3,316,104,3,3.0,3.5,8.0,1,0.72
4,322,110,3,3.5,2.5,8.67,1,0.8
5,314,103,2,2.0,3.0,8.21,0,0.65


## boolean masking

In [17]:
# Much like with series, comparing a dataframe column against a value produces
# a boolean series (a mask) with one True/False entry per row.

mask = df["cgpa"] > 9

In [18]:
# To apply the mask to the dataframe we can use the where method, loosely similar to a SQL WHERE clause.
# Rows are NOT removed: wherever the mask is False, every value in that row is replaced with NaN.
# (Introducing NaN turns the integer columns into floats, e.g. 337 -> 337.0.)
df.where(mask)

Unnamed: 0_level_0,gre score,toefl score,university rating,statement of purpose,letter of recommendation,cgpa,research,chance of admit
Serial No.,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,337.0,118.0,4.0,4.5,4.5,9.65,1.0,0.92
2,,,,,,,,
3,,,,,,,,
4,,,,,,,,
5,,,,,,,,
...,...,...,...,...,...,...,...,...
396,324.0,110.0,3.0,3.5,3.5,9.04,1.0,0.82
397,325.0,107.0,3.0,3.0,3.5,9.11,1.0,0.84
398,330.0,116.0,4.0,5.0,4.5,9.45,1.0,0.91
399,,,,,,,,


In [19]:
# To delete the rows that contain NaNs we can chain the dropna() method.
df.where(mask).dropna()

Unnamed: 0_level_0,gre score,toefl score,university rating,statement of purpose,letter of recommendation,cgpa,research,chance of admit
Serial No.,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,337.0,118.0,4.0,4.5,4.5,9.65,1.0,0.92
6,330.0,115.0,5.0,4.5,3.0,9.34,1.0,0.90
13,328.0,112.0,4.0,4.0,4.5,9.10,1.0,0.78
23,328.0,116.0,5.0,5.0,5.0,9.50,1.0,0.94
24,334.0,119.0,5.0,5.0,4.5,9.70,1.0,0.95
...,...,...,...,...,...,...,...,...
395,329.0,111.0,4.0,4.5,4.0,9.23,1.0,0.89
396,324.0,110.0,3.0,3.5,3.5,9.04,1.0,0.82
397,325.0,107.0,3.0,3.0,3.5,9.11,1.0,0.84
398,330.0,116.0,4.0,5.0,4.5,9.45,1.0,0.91


In [20]:
# A simpler way is to pass the mask directly to the indexing operator [].
# This keeps only the rows where the mask is True, i.e. the same rows as where + dropna.
# The integer columns keep their integer type because no NaN is ever introduced.
df[mask]

Unnamed: 0_level_0,gre score,toefl score,university rating,statement of purpose,letter of recommendation,cgpa,research,chance of admit
Serial No.,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,337,118,4,4.5,4.5,9.65,1,0.92
6,330,115,5,4.5,3.0,9.34,1,0.90
13,328,112,4,4.0,4.5,9.10,1,0.78
23,328,116,5,5.0,5.0,9.50,1,0.94
24,334,119,5,5.0,4.5,9.70,1,0.95
...,...,...,...,...,...,...,...,...
395,329,111,4,4.5,4.0,9.23,1,0.89
396,324,110,3,3.5,3.5,9.04,1,0.82
397,325,107,3,3.0,3.5,9.11,1,0.84
398,330,116,4,5.0,4.5,9.45,1,0.91


In [21]:
# To apply multiple criteria we can combine conditions.
# & works as AND
# | works as OR
# Each condition must be wrapped in parentheses because & and | bind tighter than comparisons.
mask2 = (df["cgpa"] > 9) & (df["chance of admit"] > 0.8)
df[mask2].head()

Unnamed: 0_level_0,gre score,toefl score,university rating,statement of purpose,letter of recommendation,cgpa,research,chance of admit
Serial No.,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,337,118,4,4.5,4.5,9.65,1,0.92
6,330,115,5,4.5,3.0,9.34,1,0.9
23,328,116,5,5.0,5.0,9.5,1,0.94
24,334,119,5,5.0,4.5,9.7,1,0.95
25,336,119,5,4.0,3.5,9.8,1,0.97


In [22]:
# Another way to apply multiple criteria is to use the built-in comparison methods.
# gt = greater than ; lt = less than
# Because these are method calls, no extra parentheses are needed around each condition.
mask3 = df["cgpa"].gt(9) & df["chance of admit"].lt(0.99)
mask3.head(10)

Serial No.
1      True
2     False
3     False
4     False
5     False
6      True
7     False
8     False
9     False
10    False
dtype: bool

## Index and column manipulation

In [23]:
# Current state of the dataframe: "Serial No." is the index.
df.head()

Unnamed: 0_level_0,gre score,toefl score,university rating,statement of purpose,letter of recommendation,cgpa,research,chance of admit
Serial No.,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,337,118,4,4.5,4.5,9.65,1,0.92
2,324,107,4,4.0,4.5,8.87,1,0.76
3,316,104,3,3.0,3.5,8.0,1,0.72
4,322,110,3,3.5,2.5,8.67,1,0.8
5,314,103,2,2.0,3.0,8.21,0,0.65


In [24]:
# We can copy the index into a regular column.
# The index itself is kept, so "Serial No." now appears both as the index and as a column.
df["Serial No."] = df.index
df.head()

Unnamed: 0_level_0,gre score,toefl score,university rating,statement of purpose,letter of recommendation,cgpa,research,chance of admit,Serial No.
Serial No.,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,337,118,4,4.5,4.5,9.65,1,0.92,1
2,324,107,4,4.0,4.5,8.87,1,0.76,2
3,316,104,3,3.0,3.5,8.0,1,0.72,3
4,322,110,3,3.5,2.5,8.67,1,0.8,4
5,314,103,2,2.0,3.0,8.21,0,0.65,5


In [25]:
# We can also reset the index: the current index is promoted to a column and a new
# default integer index (0, 1, 2, ...) is generated.
# This is normally NOT in place, but inplace=True makes it so.
# The "Serial No." column created in the previous cell is dropped first (axis=1 means "column");
# otherwise reset_index would fail trying to insert a column name that already exists.
df.drop("Serial No.",inplace=True,axis=1)
df.reset_index(inplace=True)
df.head()

Unnamed: 0,Serial No.,gre score,toefl score,university rating,statement of purpose,letter of recommendation,cgpa,research,chance of admit
0,1,337,118,4,4.5,4.5,9.65,1,0.92
1,2,324,107,4,4.0,4.5,8.87,1,0.76
2,3,316,104,3,3.0,3.5,8.0,1,0.72
3,4,322,110,3,3.5,2.5,8.67,1,0.8
4,5,314,103,2,2.0,3.0,8.21,0,0.65


In [26]:
# We can also promote an existing column to index with set_index.
# The column ("gre score") leaves the regular columns and becomes the index.
df.set_index("gre score",inplace=True)
df.head()

Unnamed: 0_level_0,Serial No.,toefl score,university rating,statement of purpose,letter of recommendation,cgpa,research,chance of admit
gre score,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
337,1,118,4,4.5,4.5,9.65,1,0.92
324,2,107,4,4.0,4.5,8.87,1,0.76
316,3,104,3,3.0,3.5,8.0,1,0.72
322,4,110,3,3.5,2.5,8.67,1,0.8
314,5,103,2,2.0,3.0,8.21,0,0.65


In [27]:
# We can also build a MULTILEVEL index (MultiIndex) from several columns.
# reset_index first moves "gre score" back to a regular column so it can be reused below.
df.reset_index(inplace=True)
df.set_index(["gre score","toefl score"],inplace=True)
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Serial No.,university rating,statement of purpose,letter of recommendation,cgpa,research,chance of admit
gre score,toefl score,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
337,118,1,4,4.5,4.5,9.65,1,0.92
324,107,2,4,4.0,4.5,8.87,1,0.76
316,104,3,3,3.0,3.5,8.0,1,0.72
322,110,4,3,3.5,2.5,8.67,1,0.8
314,103,5,2,2.0,3.0,8.21,0,0.65


In [28]:
# We can query multilevel indexes directly by passing one label per index level.
# NOTE(review): the stray text recorded in this cell's output looks like the tail of a pandas
# warning, presumably the lexsort-depth PerformanceWarning for an unsorted MultiIndex; confirm.
df.loc[337,118]

  df.loc[337,118]


Unnamed: 0_level_0,Unnamed: 1_level_0,Serial No.,university rating,statement of purpose,letter of recommendation,cgpa,research,chance of admit
gre score,toefl score,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
337,118,1,4,4.5,4.5,9.65,1,0.92


In [29]:
# Or we can pass the index labels as a tuple and combine it with a column selection.
tpl = (337,118)
df.loc[tpl,["cgpa","research"]]

  return self._getitem_tuple(key)


Unnamed: 0_level_0,Unnamed: 1_level_0,cgpa,research
gre score,toefl score,Unnamed: 2_level_1,Unnamed: 3_level_1
337,118,9.65,1


In [30]:
# A column can be processed further with the extract method of pandas Series.
# extract is a Series method, so it must be applied to a single column at a time.
# It lives under the .str attribute of the series, works ONLY on strings,
# and ONLY accepts a regular expression.
# A named group, (?P<TITLE>...), sets the title of the resulting column.

df = pd.DataFrame(
    data=[
        {"nombre": "Vale", "clase": "Mate"},
        {"nombre": "Diego", "clase": "Idiomas"},
        {"nombre": "Ale", "clase": "Historia"},
    ],
    index=["record1", "record2", "record3"],
)
df

Unnamed: 0,nombre,clase
record1,Vale,Mate
record2,Diego,Idiomas
record3,Ale,Historia


In [31]:
# extract needs a regex pattern containing a capture group.
# The named group "new_col" captures the whole string (.*), so the result is a new
# dataframe with a single column called new_col that we can further manipulate.
pattern = "(?P<new_col>.*)"

df["nombre"].str.extract(pattern)

Unnamed: 0,new_col
record1,Vale
record2,Diego
record3,Ale


## Handling missing values and replacing values

In [38]:
# Load the class grades dataset; some cells are missing and show up as NaN.
df = pd.read_csv("datasets/class_grades.csv")
df.head(10)

Unnamed: 0,Prefix,Assignment,Tutorial,Midterm,TakeHome,Final
0,5,57.14,34.09,64.38,51.48,52.5
1,8,95.05,105.49,67.5,99.07,68.33
2,8,83.7,83.17,,63.15,48.89
3,7,,,49.38,105.93,80.56
4,8,91.32,93.64,95.0,107.41,73.89
5,7,95.0,92.58,93.12,97.78,68.06
6,8,95.05,102.99,56.25,99.07,50.0
7,7,72.85,86.85,60.0,,56.11
8,8,84.26,93.1,47.5,18.52,50.83
9,7,90.1,97.55,51.25,88.89,63.61


In [49]:
# We can create a boolean mask around missing values using the isna() method:
# True marks a missing value (NaN), False a present one.
mask = df.isna()
mask.head()

Unnamed: 0,Prefix,Assignment,Tutorial,Midterm,TakeHome,Final
0,False,False,False,False,False,False
1,False,False,False,False,False,False
2,False,False,False,True,False,False
3,False,True,True,False,False,False
4,False,False,False,False,False,False


In [51]:
# We can drop missing values with the dropna() method.
# By default this deletes every row that contains at least one NaN.
# The method is NOT in place unless specified.
df.dropna().head(10)

Unnamed: 0,Prefix,Assignment,Tutorial,Midterm,TakeHome,Final
0,5,57.14,34.09,64.38,51.48,52.5
1,8,95.05,105.49,67.5,99.07,68.33
4,8,91.32,93.64,95.0,107.41,73.89
5,7,95.0,92.58,93.12,97.78,68.06
6,8,95.05,102.99,56.25,99.07,50.0
8,8,84.26,93.1,47.5,18.52,50.83
9,7,90.1,97.55,51.25,88.89,63.61
10,7,80.44,90.2,75.0,91.48,39.72
12,8,97.16,103.71,72.5,93.52,63.33
13,7,91.28,83.53,81.25,99.81,92.22


In [52]:
# We can also FILL NaN values with a default value by using the fillna() method.
# Here the only argument we pass is the value that replaces the NaNs.
df.fillna("EMPTY").head()

Unnamed: 0,Prefix,Assignment,Tutorial,Midterm,TakeHome,Final
0,5,57.14,34.09,64.38,51.48,52.5
1,8,95.05,105.49,67.5,99.07,68.33
2,8,83.7,83.17,EMPTY,63.15,48.89
3,7,EMPTY,EMPTY,49.38,105.93,80.56
4,8,91.32,93.64,95.0,107.41,73.89


In [53]:
# Missing data can also be filled from neighbouring rows with ffill and bfill.
# ffill = forward fill: each NaN takes the last valid value ABOVE it (values are propagated forward).
# bfill = backward fill, the opposite of ffill: each NaN takes the next valid value BELOW it.
# df.ffill() is used instead of fillna(method="ffill"), whose `method` argument is deprecated in recent pandas.
df.ffill().head()

Unnamed: 0,Prefix,Assignment,Tutorial,Midterm,TakeHome,Final
0,5,57.14,34.09,64.38,51.48,52.5
1,8,95.05,105.49,67.5,99.07,68.33
2,8,83.7,83.17,67.5,63.15,48.89
3,7,83.7,83.17,49.38,105.93,80.56
4,8,91.32,93.64,95.0,107.41,73.89


In [54]:
# Backward fill: each NaN takes the next valid value below it.
# df.bfill() is used instead of fillna(method="bfill"), whose `method` argument is deprecated in recent pandas.
df.bfill().head()

Unnamed: 0,Prefix,Assignment,Tutorial,Midterm,TakeHome,Final
0,5,57.14,34.09,64.38,51.48,52.5
1,8,95.05,105.49,67.5,99.07,68.33
2,8,83.7,83.17,49.38,63.15,48.89
3,7,91.32,93.64,49.38,105.93,80.56
4,8,91.32,93.64,95.0,107.41,73.89


In [56]:
# The drop method removes records (rows) OR columns from a dataframe.
# To delete a record we pass its index label.
# The method is NOT in place unless specified.
df.drop(3).head()

Unnamed: 0,Prefix,Assignment,Tutorial,Midterm,TakeHome,Final
0,5,57.14,34.09,64.38,51.48,52.5
1,8,95.05,105.49,67.5,99.07,68.33
2,8,83.7,83.17,,63.15,48.89
4,8,91.32,93.64,95.0,107.41,73.89
5,7,95.0,92.58,93.12,97.78,68.06


In [58]:
# To delete multiple rows we pass a LIST of index labels.
lst = [3,5]
# Drop the labels in lst (rows 3 and 5), not an unrelated variable left over from another cell.
df.drop(lst).head()

Unnamed: 0,Prefix,Assignment,Tutorial,Midterm,TakeHome,Final
0,5,57.14,34.09,64.38,51.48,52.5
1,8,95.05,105.49,67.5,99.07,68.33
2,8,83.7,83.17,,63.15,48.89
4,8,91.32,93.64,95.0,107.41,73.89
6,8,95.05,102.99,56.25,99.07,50.0


In [59]:
# The drop method can also delete columns.
# To delete a column we must set the axis argument to 1.
df.drop("Final",axis=1).head()

Unnamed: 0,Prefix,Assignment,Tutorial,Midterm,TakeHome
0,5,57.14,34.09,64.38,51.48
1,8,95.05,105.49,67.5,99.07
2,8,83.7,83.17,,63.15
3,7,,,49.38,105.93
4,8,91.32,93.64,95.0,107.41


In [65]:
# Small example dataframe: two numeric columns (a, b) and one string column (c),
# with explicit row labels rec1..rec5.
df = pd.DataFrame(
    data={
        "a": [1, 1, 3, 4, 5],
        "b": [6, 7, 8, 9, 10],
        "c": ["a", "b", "c", "d", "e"],
    },
    index=["rec1", "rec2", "rec3", "rec4", "rec5"],
)
df

Unnamed: 0,a,b,c
rec1,1,6,a
rec2,1,7,b
rec3,3,8,c
rec4,4,9,d
rec5,5,10,e


In [68]:
# Values inside a dataframe can be replaced with the replace method: replace(old, new).
# The method is NOT in place unless specified.
df.replace(1,100)

Unnamed: 0,a,b,c
rec1,100,6,a
rec2,100,7,b
rec3,3,8,c
rec4,4,9,d
rec5,5,10,e


In [69]:
# We can also pass lists, and the method pairs them up by position (1 -> 100, 3 -> 300).
df.replace([1,3],[100,300])

Unnamed: 0,a,b,c
rec1,100,6,a
rec2,100,7,b
rec3,300,8,c
rec4,4,9,d
rec5,5,10,e


In [70]:
# replace also supports regular expressions (regex=True).
# The pattern is written as a raw string so that \d is not treated as a string escape sequence.
# Regex replacement only applies to string values, so the numeric columns a and b are left
# untouched, and column c contains no digits: the result equals the original dataframe.
df.replace(to_replace=r"\d+",value=1,regex=True)

Unnamed: 0,a,b,c
rec1,1,6,a
rec2,1,7,b
rec3,3,8,c
rec4,4,9,d
rec5,5,10,e
