# Pandas DataFrames

## creating dataframes

In [1]:
# Dataframes are two dimensional objects with labeled rows and columns.
# Dataframes can be considered a group of labeled lists.
import pandas as pd

In [2]:
# Dataframes can be created in many ways: from pandas Series, from a list of dictionaries, or by importing data from files (e.g. csv).

In [3]:
# Creating a dataframe from pandas Series objects.
# Each Series becomes one ROW of the dataframe; the Series index labels become the column names.

# a Series built from a dictionary: the keys become the index labels
series2 = pd.Series({"nombre": "Diego", "clase": "idiomas"})
# a Series built from a list of values plus an explicit index
series1 = pd.Series(data=["Vale", "Mate"], index=["nombre", "clase"])

# stack the Series, in order, as the rows of the dataframe
rows = [series1, series2]
df = pd.DataFrame(rows)
df

Unnamed: 0,nombre,clase
0,Vale,Mate
1,Diego,idiomas


In [6]:
# Creating a dataframe from a LIST of DICTIONARIES.
# Each dictionary is one row (its keys become the columns), almost like stacking n Series.
# We can set the index labels manually; labels do not have to be unique ("record1" is used twice here).

dicts = [
    {"nombre": "Vale", "clase": "Mate"},
    {"nombre": "Diego", "clase": "Idiomas"},
    {"nombre": "Ale", "clase": "Historia"},
]
row_labels = ["record1", "record2", "record1"]

df = pd.DataFrame(data=dicts, index=row_labels)
df

Unnamed: 0,nombre,clase
record1,Vale,Mate
record2,Diego,Idiomas
record1,Ale,Historia


In [7]:
# A dataframe can be transposed (rows and columns swapped) with the .T attribute.
# This is NOT in place: it returns a new transposed dataframe and leaves df unchanged.
df.T

Unnamed: 0,record1,record2,record1.1
nombre,Vale,Diego,Ale
clase,Mate,Idiomas,Historia


## Querying dataframes

In [8]:
# To retrieve a record (ROW) by its index label we use the .loc attribute.
# .loc is an indexer, NOT a method, so it takes square brackets instead of parentheses.
# If several ROWS share the same label, the result is a dataframe;
# if the label matches a single row, the result is a Series.
df.loc["record1"]

Unnamed: 0,nombre,clase
record1,Vale,Mate
record1,Ale,Historia


In [9]:
# To retrieve a column we use the indexing operator [] with the column name.
# Using [] with a ROW label instead (e.g. df["record1"]) raises a KeyError.
# Selecting a single column this way returns a SERIES.
df["nombre"]

record1     Vale
record2    Diego
record1      Ale
Name: nombre, dtype: object

In [13]:
# We can also retrieve rows AND columns in a single .loc call: .loc[row_label, columns].
# To retrieve several columns we pass a list of column names.
df.loc["record1",["nombre","clase"]]

Unnamed: 0,nombre,clase
record1,Vale,Mate
record1,Ale,Historia


In [17]:
# Another way is to chain the operators: select the rows with .loc first,
# then select the columns from that result with [].
# NOTE: chaining is fine for reading, but avoid it when assigning values,
# because the intermediate result may be a copy; use a single .loc call instead.
df.loc["record1"][["nombre","clase"]]

Unnamed: 0,nombre,clase
record1,Vale,Mate
record1,Ale,Historia


In [19]:
# The preferred approach is to use .loc, since it also supports slicing.
# Here ":" selects all the ROWS and ["nombre"] selects the nombre column.
# Passing the column inside a list returns a dataframe rather than a Series.
df.loc[:,["nombre"]]

Unnamed: 0,nombre
record1,Vale
record2,Diego
record1,Ale


## Opening files via csv reader

In [26]:
# Loading a csv file to create a dataframe.
# The .head() method shows the first 5 rows by default.

df = pd.read_csv("datasets/Admission_Predict.csv")
df.head()

Unnamed: 0,Serial No.,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
0,1,337,118,4,4.5,4.5,9.65,1,0.92
1,2,324,107,4,4.0,4.5,8.87,1,0.76
2,3,316,104,3,3.0,3.5,8.0,1,0.72
3,4,322,110,3,3.5,2.5,8.67,1,0.8
4,5,314,103,2,2.0,3.0,8.21,0,0.65


In [28]:
# We can choose which column becomes the index with the index_col argument.
# index_col=0 promotes the first column of the file (Serial No.) to the INDEX.
df = pd.read_csv("datasets/Admission_Predict.csv",index_col=0)
df.head()

Unnamed: 0_level_0,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
Serial No.,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,337,118,4,4.5,4.5,9.65,1,0.92
2,324,107,4,4.0,4.5,8.87,1,0.76
3,316,104,3,3.0,3.5,8.0,1,0.72
4,322,110,3,3.5,2.5,8.67,1,0.8
5,314,103,2,2.0,3.0,8.21,0,0.65


In [29]:
# We can retrieve the column labels with the .columns attribute.
# It returns a pandas Index object (list-like) rather than a plain Python list;
# wrap it in list() if an actual list is needed.
df.columns

Index(['GRE Score', 'TOEFL Score', 'University Rating', 'SOP', 'LOR ', 'CGPA',
       'Research', 'Chance of Admit '],
      dtype='object')

In [30]:
# We can also retrieve the row labels (the index) via the .index attribute.
# Like .columns, this is an attribute, not a method, so no parentheses are used.
df.index

Int64Index([  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,
            ...
            391, 392, 393, 394, 395, 396, 397, 398, 399, 400],
           dtype='int64', name='Serial No.', length=400)

In [31]:
# A common operation is to clean up the column names.
# Here every label has its surrounding whitespace stripped and is lowercased;
# the cleaned labels are then assigned straight back to the .columns attribute,
# which replaces the column names of df.
cols = df.columns
new_cols = [label.strip().lower() for label in cols]
df.columns = new_cols
df.head()

Unnamed: 0_level_0,gre score,toefl score,university rating,sop,lor,cgpa,research,chance of admit
Serial No.,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,337,118,4,4.5,4.5,9.65,1,0.92
2,324,107,4,4.0,4.5,8.87,1,0.76
3,316,104,3,3.0,3.5,8.0,1,0.72
4,322,110,3,3.5,2.5,8.67,1,0.8
5,314,103,2,2.0,3.0,8.21,0,0.65


In [32]:
# We can also use the rename method.
# Its columns argument takes a dictionary: each key is an old column name and its value is the new name.
# By default rename returns a new dataframe; to modify df IN PLACE we must pass inplace=True.
renamed_columns = {
    "sop": "statement of purpose",
    "lor": "letter of recommendation",
}
df.rename(columns=renamed_columns, inplace=True)
df.head()

Unnamed: 0_level_0,gre score,toefl score,university rating,statement of purpose,letter of recommendation,cgpa,research,chance of admit
Serial No.,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,337,118,4,4.5,4.5,9.65,1,0.92
2,324,107,4,4.0,4.5,8.87,1,0.76
3,316,104,3,3.0,3.5,8.0,1,0.72
4,322,110,3,3.5,2.5,8.67,1,0.8
5,314,103,2,2.0,3.0,8.21,0,0.65


## boolean masking