In [1]:
import pandas as pd
import numpy as np

# Scales

In [2]:
# We can store a nominal column as a categorical data type within pandas.
# Changing the underlying data type to categorical allows for more flexible
# and intuitive operations.
# Start with a small frame of letter grades indexed by a coarse quality label.
grade_labels = ["A+", "A", "A-", "B+", "B", "B-", "C+", "C", "C-", "D+", "D"]
quality_labels = ["excellent"] * 3 + ["good"] * 3 + ["ok"] * 3 + ["poor"] * 2
df = pd.DataFrame({"grades": grade_labels}, index=quality_labels)
df

Unnamed: 0,grades
excellent,A+
excellent,A
excellent,A-
good,B+
good,B
good,B-
ok,C+
ok,C
ok,C-
poor,D+


In [3]:
# Check the dtype of each column: "grades" is currently stored as a plain object column
df.dtypes

grades    object
dtype: object

In [4]:
# We can tell pandas to convert the column to the category dtype.
# The converted series is only displayed here; it is not assigned back to df.
df["grades"].astype("category")

excellent    A+
excellent     A
excellent    A-
good         B+
good          B
good         B-
ok           C+
ok            C
ok           C-
poor         D+
poor          D
Name: grades, dtype: category
Categories (11, object): ['A', 'A+', 'A-', 'B', ..., 'C+', 'C-', 'D', 'D+']

In [5]:
# We now have categories, but they are not ordered.
# To order them, build a pd.CategoricalDtype and cast the column to it:
#   - categories: a LIST of labels in ASCENDING ORDER (low to high)
#   - ordered=True: signals that the order of that list is meaningful
grade_scale = ["D", "D+", "C-", "C", "C+", "B-", "B", "B+", "A-", "A", "A+"]
orderedcategories = pd.CategoricalDtype(categories=grade_scale, ordered=True)
df["grades"] = df["grades"].astype(orderedcategories)
df["grades"]

excellent    A+
excellent     A
excellent    A-
good         B+
good          B
good         B-
ok           C+
ok            C
ok           C-
poor         D+
poor          D
Name: grades, dtype: category
Categories (11, object): ['D' < 'D+' < 'C-' < 'C' ... 'B+' < 'A-' < 'A' < 'A+']

In [6]:
# Confirm that the "grades" column is now stored with the category dtype
df.dtypes

grades    category
dtype: object

In [7]:
# pandas is now aware of the order of the categories.
# Because there is an order, comparison operators work on the column and we
# can create new, smarter masks, e.g. keep only grades strictly above "B".
better_than_b = df["grades"] > "B"
df[better_than_b]

Unnamed: 0,grades
excellent,A+
excellent,A
excellent,A-
good,B+


In [8]:
# pandas also allows us to create BINS (ranges) of numeric values.
# First build the series we want to bin: the average CENSUS2010POP per state,
# using only the rows whose SUMLEV is 50.

# Use a forward slash instead of "datasets\census.csv": "\c" is an invalid
# escape sequence in a normal string literal, and a backslash separator only
# works on Windows. The forward slash works on every platform.
df = pd.read_csv("datasets/census.csv")
df = df[df["SUMLEV"] == 50]

# Index by state name so that we can group on the index level below
# (reassigning instead of inplace=True keeps the step explicit).
df = df.set_index("STNAME")
# Keep only the relevant column and average it per state with the built-in
# groupby aggregation.
new_df = df.groupby(level=0)["CENSUS2010POP"].mean()
new_df.head()

STNAME
Alabama        71339.343284
Alaska         24490.724138
Arizona       426134.466667
Arkansas       38878.906667
California    642309.586207
Name: CENSUS2010POP, dtype: float64

In [9]:
# To create bins we use the top-level function pd.cut.
# We pass the series and the number of bins we want.
# The output is again a series: for each state, the interval its value falls into.
pd.cut(new_df,5)

STNAME
Alabama                  (11706.087, 138330.766]
Alaska                   (11706.087, 138330.766]
Arizona                 (390320.176, 516314.881]
Arkansas                 (11706.087, 138330.766]
California              (516314.881, 642309.586]
Colorado                 (11706.087, 138330.766]
Connecticut             (390320.176, 516314.881]
Delaware                (264325.471, 390320.176]
District of Columbia    (516314.881, 642309.586]
Florida                 (264325.471, 390320.176]
Georgia                  (11706.087, 138330.766]
Hawaii                  (264325.471, 390320.176]
Idaho                    (11706.087, 138330.766]
Illinois                 (11706.087, 138330.766]
Indiana                  (11706.087, 138330.766]
Iowa                     (11706.087, 138330.766]
Kansas                   (11706.087, 138330.766]
Kentucky                 (11706.087, 138330.766]
Louisiana                (11706.087, 138330.766]
Maine                    (11706.087, 138330.766]
Maryland     

In [10]:
# Since the output of pd.cut is a series, we can use it as a new column.
# NOTE: new_df is still a Series at this point, so it has to be converted into
#       a DataFrame before the bins can be added to it as a column.
# NOTE: the column being added must match the dimensions of the frame,
#       otherwise we get confusing errors.
bins = pd.cut(new_df, bins=5)
new_df = new_df.to_frame()

In [11]:
# Attach the bin computed for each row as a new "BINS" column
new_df["BINS"] = bins
new_df

Unnamed: 0_level_0,CENSUS2010POP,BINS
STNAME,Unnamed: 1_level_1,Unnamed: 2_level_1
Alabama,71339.343284,"(11706.087, 138330.766]"
Alaska,24490.724138,"(11706.087, 138330.766]"
Arizona,426134.466667,"(390320.176, 516314.881]"
Arkansas,38878.906667,"(11706.087, 138330.766]"
California,642309.586207,"(516314.881, 642309.586]"
Colorado,78581.1875,"(11706.087, 138330.766]"
Connecticut,446762.125,"(390320.176, 516314.881]"
Delaware,299311.333333,"(264325.471, 390320.176]"
District of Columbia,601723.0,"(516314.881, 642309.586]"
Florida,280616.567164,"(264325.471, 390320.176]"


# Pivots

In [22]:
# Load the university rankings data set.
# Use a forward slash instead of "datasets\cwurdata.csv": "\c" is an invalid
# escape sequence in a normal string literal, and a backslash separator only
# works on Windows. The forward slash works on every platform.
df = pd.read_csv("datasets/cwurdata.csv")
df

Unnamed: 0,world_rank,institution,country,national_rank,quality_of_education,alumni_employment,quality_of_faculty,publications,influence,citations,broad_impact,patents,score,year
0,1,Harvard University,USA,1,7,9,1,1,1,1,,5,100.00,2012
1,2,Massachusetts Institute of Technology,USA,2,9,17,3,12,4,4,,1,91.67,2012
2,3,Stanford University,USA,3,17,11,5,4,2,2,,15,89.50,2012
3,4,University of Cambridge,United Kingdom,1,10,24,4,16,16,11,,50,86.17,2012
4,5,California Institute of Technology,USA,4,2,29,7,37,22,22,,18,85.21,2012
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2195,996,University of the Algarve,Portugal,7,367,567,218,926,845,812,969.0,816,44.03,2015
2196,997,Alexandria University,Egypt,4,236,566,218,997,908,645,981.0,871,44.03,2015
2197,998,Federal University of Ceará,Brazil,18,367,549,218,830,823,812,975.0,824,44.03,2015
2198,999,University of A Coruña,Spain,40,367,567,218,886,974,812,975.0,651,44.02,2015


In [24]:
# Here we want to create a column that categorizes universities by world rank: 1-100 as first tier, 101-200 as second tier, 201-300 as third tier, and the rest as other
def new_col(data):
    """Return the tier label for a single rank value.

    Values below 101 map to "First Tier", below 201 to "Second Tier",
    below 301 to "Third Tier"; everything else maps to "Other Tier".
    """
    tier_upper_bounds = (
        (101, "First Tier"),
        (201, "Second Tier"),
        (301, "Third Tier"),
    )
    # Bounds are checked in ascending order, so the first match wins.
    for upper_bound, tier_name in tier_upper_bounds:
        if data < upper_bound:
            return tier_name
    return "Other Tier"

# Apply new_col to every value of the "world_rank" column and store the labels in a new "tier" column
df["tier"] = df["world_rank"].apply(new_col)
df

Unnamed: 0,world_rank,institution,country,national_rank,quality_of_education,alumni_employment,quality_of_faculty,publications,influence,citations,broad_impact,patents,score,year,tier
0,1,Harvard University,USA,1,7,9,1,1,1,1,,5,100.00,2012,First Tier
1,2,Massachusetts Institute of Technology,USA,2,9,17,3,12,4,4,,1,91.67,2012,First Tier
2,3,Stanford University,USA,3,17,11,5,4,2,2,,15,89.50,2012,First Tier
3,4,University of Cambridge,United Kingdom,1,10,24,4,16,16,11,,50,86.17,2012,First Tier
4,5,California Institute of Technology,USA,4,2,29,7,37,22,22,,18,85.21,2012,First Tier
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2195,996,University of the Algarve,Portugal,7,367,567,218,926,845,812,969.0,816,44.03,2015,Other Tier
2196,997,Alexandria University,Egypt,4,236,566,218,997,908,645,981.0,871,44.03,2015,Other Tier
2197,998,Federal University of Ceará,Brazil,18,367,549,218,830,823,812,975.0,824,44.03,2015,Other Tier
2198,999,University of A Coruña,Spain,40,367,567,218,886,974,812,975.0,651,44.02,2015,Other Tier


In [31]:
# pandas' pivot_table creates a summary DataFrame based on several parameters:
#   values  - the column to summarize
#   index   - the column whose values become the rows
#   columns - the column whose values become the column headers
#   aggfunc - the LIST of aggregations to apply

# This pivot table returns the MAX, AVERAGE and MIN score per tier, per country
score_aggregations = [np.max, np.average, np.min]
df.pivot_table(
    values="score",
    index="country",
    columns="tier",
    aggfunc=score_aggregations,
).head(5)

Unnamed: 0_level_0,amax,amax,amax,amax,average,average,average,average,amin,amin,amin,amin
tier,First Tier,Other Tier,Second Tier,Third Tier,First Tier,Other Tier,Second Tier,Third Tier,First Tier,Other Tier,Second Tier,Third Tier
country,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
Argentina,,45.66,,,,44.672857,,,,44.1,,
Australia,51.61,45.97,50.4,47.47,47.9425,44.64575,49.2425,47.285,44.13,44.09,47.97,47.1
Austria,,46.29,,47.78,,44.864286,,47.066667,,44.19,,46.39
Belgium,52.03,46.21,49.73,47.14,51.875,45.081,49.084,46.746667,51.72,44.31,48.08,46.21
Brazil,,46.08,49.82,,,44.499706,49.565,,,44.03,49.31,


In [32]:
# We can also enable margins: margins=True adds an "All" entry to each
# aggregation, summarizing across all tiers.
score_aggregations = [np.max, np.average, np.min]
df.pivot_table(
    values="score",
    index="country",
    columns="tier",
    aggfunc=score_aggregations,
    margins=True,
).head(5)

Unnamed: 0_level_0,amax,amax,amax,amax,amax,average,average,average,average,average,amin,amin,amin,amin,amin
tier,First Tier,Other Tier,Second Tier,Third Tier,All,First Tier,Other Tier,Second Tier,Third Tier,All,First Tier,Other Tier,Second Tier,Third Tier,All
country,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2
Argentina,,45.66,,,45.66,,44.672857,,,44.672857,,44.1,,,44.1
Australia,51.61,45.97,50.4,47.47,51.61,47.9425,44.64575,49.2425,47.285,45.825517,44.13,44.09,47.97,47.1,44.09
Austria,,46.29,,47.78,47.78,,44.864286,,47.066667,45.139583,,44.19,,46.39,44.19
Belgium,52.03,46.21,49.73,47.14,52.03,51.875,45.081,49.084,46.746667,47.011,51.72,44.31,48.08,46.21,44.31
Brazil,,46.08,49.82,,49.82,,44.499706,49.565,,44.781111,,44.03,49.31,,44.03


In [33]:
# Series.idxmax returns the index label of the highest value in a series.
# Below, we find the country with the highest FIRST TIER AVERAGE score.
new_df = df.pivot_table(
    values="score",
    index="country",
    columns="tier",
    aggfunc=[np.max, np.average, np.min],
    margins=True,
)
# The columns are two-level (aggregation name, tier), so one tuple key selects
# the series directly; "average" is the label produced for np.average.
new_df[("average", "First Tier")].idxmax()

'United Kingdom'

In [51]:
# We can also stack or unstack pivots, which basically MOVES column levels into row levels and vice versa.
# Stacking moves the INNERMOST COLUMN level (here "tier") into the INNERMOST ROW index level.
new_df2 = new_df.stack()
new_df2

Unnamed: 0_level_0,Unnamed: 1_level_0,amax,average,amin
country,tier,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Argentina,Other Tier,45.66,44.672857,44.10
Argentina,All,45.66,44.672857,44.10
Australia,First Tier,51.61,47.942500,44.13
Australia,Other Tier,45.97,44.645750,44.09
Australia,Second Tier,50.40,49.242500,47.97
...,...,...,...,...
All,First Tier,100.00,58.350675,43.36
All,Other Tier,46.34,44.738871,44.02
All,Second Tier,51.29,49.065450,47.49
All,Third Tier,47.93,46.843450,45.95
