In [6]:
import pandas as pd
import numpy as np
import random
from string import ascii_uppercase

## ***Scales of Data***

***There are different scales invloved in categorical data:***        
***- Exam grades: Is the differences between different grades identical? i.e `|A+ - A| == |A - A-| == |A- - B+|......`***

1. Ratio scale: Measurement units are equally spaced. Mathematical operations are all valid. e.g. weight, height      
2. Interval scale: Measurement units are equally spaced but there is not a clear absence of value: a 0 here has a different meaning than simply marking an absence of value. e.g. temperature, compass reads.
3. Ordinal scale: Order of values is important where the differences between scales are not uniform. e.g. exam grades A+, A, A- ...........
4. nominal scale: Caregorical data where there are only a limited number of elements but their order is of no significance

In [10]:
grades = pd.DataFrame(["A+", "A", "A-", "B+", "B", "B-", "C+", "C", "C-", "D+", "D"],
            index = ["Best"] * 3 + ["Great"] * 3 + ["Okay"] * 3 + ["Meh"] * 2,
            columns = ["Grade"])

In [12]:
grades

Unnamed: 0,Grade
Best,A+
Best,A
Best,A-
Great,B+
Great,B
Great,B-
Okay,C+
Okay,C
Okay,C-
Meh,D+


In [14]:
grades.dtypes

Grade    object
dtype: object

In [17]:
# this call converts the object (string) data type into a categorical series, without an order relations
grades.Grade.astype("category")

Best     A+
Best      A
Best     A-
Great    B+
Great     B
Great    B-
Okay     C+
Okay      C
Okay     C-
Meh      D+
Meh       D
Name: Grade, dtype: category
Categories (11, object): ['A', 'A+', 'A-', 'B', ..., 'C+', 'C-', 'D', 'D+']

In [20]:
# to introduce ordering in a categirical series

ordered_cat = pd.CategoricalDtype(categories = reversed(["A+", "A", "A-", "B+", "B", "B-", "C+", "C", "C-", "D+", "D"]), ordered = True)
ordered_cat

CategoricalDtype(categories=['D', 'D+', 'C-', 'C', 'C+', 'B-', 'B', 'B+', 'A-', 'A',
                  'A+'],
, ordered=True)

In [39]:
grades.Grade = grades.Grade.astype(ordered_cat)
grades.Grade.astype(ordered_cat)

Best     A+
Best      A
Best     A-
Great    B+
Great     B
Great    B-
Okay     C+
Okay      C
Okay     C-
Meh      D+
Meh       D
Name: Grade, dtype: category
Categories (11, object): ['D' < 'D+' < 'C-' < 'C' ... 'B+' < 'A-' < 'A' < 'A+']

In [40]:
grades.Grade.dtype

CategoricalDtype(categories=['D', 'D+', 'C-', 'C', 'C+', 'B-', 'B', 'B+', 'A-', 'A',
                  'A+'],
, ordered=True)

In [41]:
# ordered categorical data can help in boolean masking & comparisons

grades.loc[grades.Grade > "B+"]

Unnamed: 0,Grade
Best,A+
Best,A
Best,A-


In [43]:
grades.loc[grades.Grade <= "C+"]

Unnamed: 0,Grade
Okay,C+
Okay,C
Okay,C-
Meh,D+
Meh,D


In [46]:
data = pd.read_csv(r"D:/Introduction-to-Data-Science-in-Python/week-3/datasets/census.csv")
data.tail()

Unnamed: 0,SUMLEV,REGION,DIVISION,STATE,COUNTY,STNAME,CTYNAME,CENSUS2010POP,ESTIMATESBASE2010,POPESTIMATE2010,...,RDOMESTICMIG2011,RDOMESTICMIG2012,RDOMESTICMIG2013,RDOMESTICMIG2014,RDOMESTICMIG2015,RNETMIG2011,RNETMIG2012,RNETMIG2013,RNETMIG2014,RNETMIG2015
3188,50,4,8,56,37,Wyoming,Sweetwater County,43806,43806,43593,...,1.072643,16.243199,-5.339774,-14.252889,-14.248864,1.255221,16.243199,-5.29546,-14.075283,-14.070195
3189,50,4,8,56,39,Wyoming,Teton County,21294,21294,21297,...,-1.589565,0.972695,19.525929,14.143021,-0.564849,0.654527,2.408578,21.160658,16.308671,1.520747
3190,50,4,8,56,41,Wyoming,Uinta County,21118,21118,21102,...,-17.755986,-4.91635,-6.902954,-14.215862,-12.127022,-18.136812,-5.536861,-7.52184,-14.740608,-12.606351
3191,50,4,8,56,43,Wyoming,Washakie County,8533,8533,8545,...,-11.637475,-0.827815,-2.013502,-17.781491,1.682288,-11.990126,-1.182592,-2.250385,-18.020168,1.441961
3192,50,4,8,56,45,Wyoming,Weston County,7208,7208,7181,...,-11.752361,-8.040059,12.372583,1.533635,6.935294,-12.032179,-8.040059,12.372583,1.533635,6.935294


In [47]:
data = data.loc[data.SUMLEV == 50, :]
data.set_index("STNAME", inplace = True)

In [49]:
_data = data.groupby(level = 0).agg({"CENSUS2010POP": np.mean})
_data

Unnamed: 0_level_0,CENSUS2010POP
STNAME,Unnamed: 1_level_1
Alabama,71339.343284
Alaska,24490.724138
Arizona,426134.466667
Arkansas,38878.906667
California,642309.586207
Colorado,78581.1875
Connecticut,446762.125
Delaware,299311.333333
District of Columbia,601723.0
Florida,280616.567164


In [51]:
_data.ndim

2

In [52]:
pd.cut(_data.CENSUS2010POP, bins = 20)

STNAME
Alabama                   (43834.737, 75333.413]
Alaska                    (11706.087, 43834.737]
Arizona                 (421818.852, 453317.529]
Arkansas                  (11706.087, 43834.737]
California               (610810.91, 642309.586]
Colorado                 (75333.413, 106832.089]
Connecticut             (421818.852, 453317.529]
Delaware                (295824.147, 327322.823]
District of Columbia     (579312.234, 610810.91]
Florida                 (264325.471, 295824.147]
Georgia                   (43834.737, 75333.413]
Hawaii                  (264325.471, 295824.147]
Idaho                     (11706.087, 43834.737]
Illinois                (106832.089, 138330.766]
Indiana                   (43834.737, 75333.413]
Iowa                      (11706.087, 43834.737]
Kansas                    (11706.087, 43834.737]
Kentucky                  (11706.087, 43834.737]
Louisiana                 (43834.737, 75333.413]
Maine                    (75333.413, 106832.089]
Maryland     

### ***pd.cut gives equally spaced data while in certain instances one might prefer to have intervals with same number of elements :(***

In [4]:
grades = pd.CategoricalDtype(reversed("A,A+,A-,B+,B,B-,C+,C,C-,D+,D".split(",")), ordered = True)
grades

CategoricalDtype(categories=['D', 'D+', 'C-', 'C', 'C+', 'B-', 'B', 'B+', 'A-', 'A+',
                  'A'],
, ordered=True)

In [7]:
"A,A+,A-,B+,B,B-,C+,C,C-,D+,D".split(",")

['A', 'A+', 'A-', 'B+', 'B', 'B-', 'C+', 'C', 'C-', 'D+', 'D']

In [25]:
rand_grades = [grade for i in range(100) for grade in random.sample(['A', 'A+', 'A-', 'B+', 'B', 'B-', 'C+', 'C', 'C-', 'D+', 'D'], k = 1)]
rand_grades[:10]

['C+', 'D+', 'D+', 'B', 'B', 'C', 'C-', 'A', 'A', 'C+']

In [28]:
grade_series = pd.Series(rand_grades, dtype = grades)

In [45]:
grade_series.loc[grade_series > "A-"]

7      A
8      A
13    A+
14    A+
15     A
18     A
23     A
31    A+
41    A+
44     A
56     A
64     A
65     A
69    A+
70    A+
74    A+
76    A+
93    A+
dtype: category
Categories (11, object): ['D' < 'D+' < 'C-' < 'C' ... 'B+' < 'A-' < 'A+' < 'A']