In [1]:
!pip3 install pandas scipy seaborn xlrd

Collecting seaborn
  Downloading seaborn-0.11.1-py3-none-any.whl (285 kB)
[K     |████████████████████████████████| 285 kB 21.7 MB/s eta 0:00:01
[?25hCollecting xlrd
  Downloading xlrd-2.0.1-py2.py3-none-any.whl (96 kB)
[K     |████████████████████████████████| 96 kB 8.6 MB/s  eta 0:00:01
Collecting matplotlib>=2.2
  Downloading matplotlib-3.4.2-cp38-cp38-manylinux1_x86_64.whl (10.3 MB)
[K     |████████████████████████████████| 10.3 MB 40.4 MB/s eta 0:00:01
[?25hCollecting kiwisolver>=1.0.1
  Downloading kiwisolver-1.3.1-cp38-cp38-manylinux1_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 116.9 MB/s eta 0:00:01
[?25hCollecting cycler>=0.10
  Using cached cycler-0.10.0-py2.py3-none-any.whl (6.5 kB)
Collecting pillow>=6.2.0
  Downloading Pillow-8.3.0-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.whl (3.0 MB)
[K     |████████████████████████████████| 3.0 MB 106.8 MB/s eta 0:00:01
[?25hInstalling collected packages: pillow, kiwisolver, cycler, matplotlib, x

In [2]:
import pandas as pd
import seaborn as sns
import scipy.stats
import re

In [3]:
# Read the guns polls data, shorten the two party-support column names,
# and discard the URL column in a single chained expression
url = "https://cssbook.net/d/guns-polls.csv"
d = (
    pd.read_csv(url)
    .rename(columns={"Republican Support": "rep", "Democratic Support": "dem"})
    .drop(columns="URL")
)
# An existing frame could instead be changed in place:
# d.drop(columns="URL", inplace=True)

# Keep only the rows whose Question is "arm-teachers"
d2 = d.loc[d["Question"] == "arm-teachers"]
d2

Unnamed: 0,Question,Start,End,Pollster,Population,Support,rep,dem
7,arm-teachers,2/23/18,2/25/18,YouGov/Huffpost,Registered Voters,41,69,20
8,arm-teachers,2/20/18,2/23/18,CBS News,Adults,44,68,20
9,arm-teachers,2/27/18,2/28/18,Rasmussen,Adults,43,71,24
10,arm-teachers,2/27/18,2/28/18,NPR/Ipsos,Adults,41,68,18
11,arm-teachers,3/3/18,3/5/18,Quinnipiac,Registered Voters,40,77,10
12,arm-teachers,2/26/18,2/28/18,SurveyMonkey,Registered Voters,43,80,11


In [4]:
# version of the guns polls with some errors
url = "https://cssbook.net/d/guns-polls-dirty.csv"
d2 = pd.read_csv(url)

# Option 1: clean with direct assignment
# Note that when creating a new column,
# you have to use df["col"] rather than df.col
# regex=True is passed explicitly: the pattern is a regular expression,
# and newer pandas versions treat the pattern as a literal string by default
d2["rep2"] = d2.rep.str.replace("[^0-9\\.]", "", regex=True)
d2["rep2"] = pd.to_numeric(d2.rep2)
# Fill missing values with the mean of this data set itself
# (d2), not with the mean of the all-questions frame d
d2["Support2"] = d2.Support.fillna(d2.Support.mean())

# Alternatively, clean with assign
# Note the need to use an anonymous function
# (lambda) to chain calculations: the lambda receives the frame
# in which rep2 has already been assigned
cleaned = d2.assign(
    rep2=d2.rep.str.replace("[^0-9\\.]", "", regex=True),
    rep3=lambda df: pd.to_numeric(df.rep2),
    Support2=d2.Support.fillna(d2.Support.mean()))

# Finally, you can create your own function 
def clean_num(x):
    """Strip non-numeric characters from a string and return it as a number.

    Every character other than a digit or "." is removed from ``x``.
    The cleaned text is returned as an ``int`` when it contains no
    decimal point, and as a ``float`` otherwise.

    Raises:
        ValueError: if the cleaned text is not a valid number
            (for example, when no digits remain).
    """
    cleaned_text = re.sub(r"[^0-9.]", "", x)
    # The pattern keeps ".", but int() rejects text with a decimal point,
    # so values such as "69.5" have to be parsed as float
    if "." in cleaned_text:
        return float(cleaned_text)
    return int(cleaned_text)

# Pass each raw "rep" entry through clean_num and store the result as rep3
cleaned["rep3"] = cleaned["rep"].apply(clean_num)
cleaned.head(n=5)

  d2["rep2"] = d2.rep.str.replace("[^0-9\\.]", "")
  rep2 = d2.rep.str.replace("[^0-9\\.]", ""),


Unnamed: 0,Question,Start,End,Pollster,Population,Support,rep,dem,rep2,Support2,rep3
0,arm-teachers,2/23/18,2/25/18,YouGov/Huffpost,Registered Voters,41.0,69,20,69,41.0,69
1,arm-teachers,2/20/18,2/23/18,CBS News,Adults,,68,20,68,41.6,68
2,arm-teachers,2/27/18,2/28/18,Rasmussen,Adults,43.0,71d,24,71,43.0,71
3,arm-teachers,2/27/18,2/28/18,NPR/Ipsos,Adults,41.0,68,18,68,41.0,68
4,arm-teachers,3/3/18,3/5/18,Quinnipiac,Registered Voters,40.0,77,10,77,40.0,77


In [5]:
# Group the polls by question, then compute the mean and standard
# deviation of Support within each group
groups = d.groupby(by="Question")
groups.aggregate({"Support": ["mean", "std"]})

Unnamed: 0_level_0,Support,Support
Unnamed: 0_level_1,mean,std
Question,Unnamed: 1_level_2,Unnamed: 2_level_2
age-21,75.857143,6.011893
arm-teachers,42.0,1.549193
background-checks,87.428571,7.322503
ban-assault-weapons,61.75,6.440285
ban-high-capacity-magazines,67.285714,3.860669
mental-health-own-gun,85.833333,5.455884
repeal-2nd-amendment,10.0,
stricter-gun-laws,66.454545,5.145165


In [6]:
# Mean and standard deviation of Support per question,
# written as one chained expression
(
    d.groupby(by="Question")
    .aggregate({"Support": ["mean", "std"]})
)

Unnamed: 0_level_0,Support,Support
Unnamed: 0_level_1,mean,std
Question,Unnamed: 1_level_2,Unnamed: 2_level_2
age-21,75.857143,6.011893
arm-teachers,42.0,1.549193
background-checks,87.428571,7.322503
ban-assault-weapons,61.75,6.440285
ban-high-capacity-magazines,67.285714,3.860669
mental-health-own-gun,85.833333,5.455884
repeal-2nd-amendment,10.0,
stricter-gun-laws,66.454545,5.145165


In [7]:
# Wrapping an expression in ( ) lets it span several lines
d["mean"] = (
    d.groupby("Question")["Support"]
    .transform("mean")
)
# Difference between each poll's Support and the mean for its question
d["deviation"] = d["Support"].sub(d["mean"])
d.head()

Unnamed: 0,Question,Start,End,Pollster,Population,Support,rep,dem,mean,deviation
0,age-21,2/20/18,2/23/18,CNN/SSRS,Registered Voters,72,61,86,75.857143,-3.857143
1,age-21,2/27/18,2/28/18,NPR/Ipsos,Adults,82,72,92,75.857143,6.142857
2,age-21,3/1/18,3/4/18,Rasmussen,Adults,67,59,76,75.857143,-8.857143
3,age-21,2/22/18,2/26/18,Harris Interactive,Registered Voters,84,77,92,75.857143,8.142857
4,age-21,3/3/18,3/5/18,Quinnipiac,Registered Voters,78,63,93,75.857143,2.142857
