# Imports

In [1]:
# To run all the expressions in each cell. 
# To return to the default behavior, write: InteractiveShell.ast_node_interactivity = "last_expr"

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
import pandas as pd
import numpy as np

# Acquisition and first inspection of the main database

Let's import the database, preview its first few rows and inspect the columns.

In [45]:
df = pd.read_csv('Deaths_db.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 346342 entries, 0 to 346341
Data columns (total 13 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   Year                            346342 non-null  int64  
 1   WEEK                            346342 non-null  int64  
 2   Week Ending Date                346342 non-null  object 
 3   REGION                          346342 non-null  int64  
 4   State                           340300 non-null  object 
 5   City                            340300 non-null  object 
 6   Pneumonia and Influenza Deaths  343058 non-null  float64
 7   All Deaths                      344871 non-null  float64
 8   <1 year (all cause deaths)      338953 non-null  float64
 9   1-24 years (all cause deaths)   339374 non-null  float64
 10  25-44 years                     342711 non-null  float64
 11  45-64 years (all cause deaths)  344662 non-null  float64
 12  65+ years (all c

*The columns of the database are named inconsistently; also, very long names makes it boring to refer to. 
RENAME them.*

*Moreover, change the data type of entries in a bunch of columns. I prefer to have the first six columns object (i.e. strings).*

In [46]:
dict_columns = {'Year':'year', 'WEEK':'week', 'Week Ending Date':'week_end', 'REGION':'region', 'State':'state', 
                'City':'city', 'Pneumonia and Influenza Deaths':'PI_deaths', 'All Deaths':'all_deaths', 
                '<1 year (all cause deaths)':'0_deaths', '1-24 years (all cause deaths)':'1-24_deaths', 
                '25-44 years':'25-44_deaths', '45-64 years (all cause deaths)':'45-64_deaths', 
                '65+ years (all cause deaths)':'65+_deaths'};
df.rename(columns=dict_columns, inplace=True)

df[['year', 'week', 'region']] = df[['year', 'week', 'region']].astype('object')

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 346342 entries, 0 to 346341
Data columns (total 13 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   year          346342 non-null  object 
 1   week          346342 non-null  object 
 2   week_end      346342 non-null  object 
 3   region        346342 non-null  object 
 4   state         340300 non-null  object 
 5   city          340300 non-null  object 
 6   PI_deaths     343058 non-null  float64
 7   all_deaths    344871 non-null  float64
 8   0_deaths      338953 non-null  float64
 9   1-24_deaths   339374 non-null  float64
 10  25-44_deaths  342711 non-null  float64
 11  45-64_deaths  344662 non-null  float64
 12  65+_deaths    344845 non-null  float64
dtypes: float64(7), object(6)
memory usage: 34.4+ MB


There are 6042 rows of the DataFrame with a NaN in either the state or the city column (in fact both, as shown below). Let's create a new DataFrame only including those data. When mortality data are present (i.e. it isn't just a row of NaN's), we might be able to guess a posteriori what city do those data belong to.

In [47]:
df_nocity = df[df.city.isna()]

df_nocity.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6042 entries, 96790 to 302205
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   year          6042 non-null   object 
 1   week          6042 non-null   object 
 2   week_end      6042 non-null   object 
 3   region        6042 non-null   object 
 4   state         0 non-null      object 
 5   city          0 non-null      object 
 6   PI_deaths     5982 non-null   float64
 7   all_deaths    5982 non-null   float64
 8   0_deaths      5982 non-null   float64
 9   1-24_deaths   5982 non-null   float64
 10  25-44_deaths  5982 non-null   float64
 11  45-64_deaths  5982 non-null   float64
 12  65+_deaths    5982 non-null   float64
dtypes: float64(7), object(6)
memory usage: 660.8+ KB


*Now restrict the main DataFrame to the rows with non-missing values of 'state' and 'city'.*

In [48]:
df = df[df.city.notna()]

df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 340300 entries, 0 to 346341
Data columns (total 13 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   year          340300 non-null  object 
 1   week          340300 non-null  object 
 2   week_end      340300 non-null  object 
 3   region        340300 non-null  object 
 4   state         340300 non-null  object 
 5   city          340300 non-null  object 
 6   PI_deaths     337076 non-null  float64
 7   all_deaths    338889 non-null  float64
 8   0_deaths      332971 non-null  float64
 9   1-24_deaths   333392 non-null  float64
 10  25-44_deaths  336729 non-null  float64
 11  45-64_deaths  338680 non-null  float64
 12  65+_deaths    338863 non-null  float64
dtypes: float64(7), object(6)
memory usage: 36.3+ MB


Ok, now we are left with a DataFrame having missing values only in the numeric columns. We will investigate it further in what follows.

### A consistency check

Consistency check: is the all_deaths column the sum of the 0_deaths, ..., 65+_deaths?

In [71]:
df['check'] = df['all_deaths'] - df[['0_deaths', '1-24_deaths', '25-44_deaths', '45-64_deaths', '65+_deaths']].sum(axis=1, skipna=False)

checksum = pd.Series(df.check.value_counts(dropna=False))

The series indices here are always integers: I would like them to be converted to int and then to strings. Hence the following function.

In [72]:
def make_index_str(df):
    index_dict = {}
    for c in df.index:
        try:
            index_dict[c] = str(int(c))
        except ValueError:
            index_dict[c] = 'NaN'
    df = df.rename(index_dict)
    return df

checksum = make_index_str(checksum)
checksum.index
checksum

Index([  '0',   nan,   '1',   '2',   '3',   '4',   '5',   '6',   '7',   '8',
         '9',  '10',  '11',  '12',  '13',  '14',  '15',  '17',  '18',  '16',
        '22',  '20',  '21',  '19',  '30',  '29',  '26',  '25',  '28',  '24',
        '32',  '35',  '27',  '87',  '33',  '65',  '71',  '38',  '44',  '48',
        '37',  '31',  '52',  '58',  '39',  '61',  '41',  '85',  '90',  '46',
        '47',  '95',  '59', '110', '128', '100',  '72', '114',  '70'],
      dtype='object')

0      322927
NaN      9348
1        5095
2        1222
3         536
4         251
5         162
6         134
7          94
8          83
9          70
10         59
11         43
12         40
13         36
14         29
15         26
17         19
18         14
16         13
22         11
20         10
21          8
19          7
30          6
29          5
26          4
25          4
28          4
24          4
32          3
35          2
27          2
87          2
33          2
65          2
71          1
38          1
44          1
48          1
37          1
31          1
52          1
58          1
39          1
61          1
41          1
85          1
90          1
46          1
47          1
95          1
59          1
110         1
128         1
100         1
72          1
114         1
70          1
Name: check, dtype: int64

Cool. It looks like - but for a few cases where relatively large discrepancies are present, to be investigated further - the numbers add up quite alright. Of course the elephant in the room is the non-negligible amount of NaN's.

To get a better idea of where the problem might lie, let's split the database into several ones, by year. We'll create a dictionary of DataFrames.

# Splitting the main database by year/city

This is useful to have smaller, more manageable DataFrames to manipulate; moreover, a natural thing to do will be to compute statistics of interest by year, as well as time series from databases pertaining to single cities.

We'll write functions generating dictionaries of databases, splitting the main one across different values found in a given column.

In [76]:
# df['year'].unique()
# df['city'].unique()

In [90]:
def split_by(df, col):
    dictionary = {}
    col_setofvalues = df[col].unique()
    for val in col_setofvalues:
        dictionary[val] = df[df[col] == val]
    return dictionary

dict_year = split_by(df, 'year')
dict_city = split_by(df, 'city')

In [94]:
dict_city['Chicago']

Unnamed: 0,year,week,week_end,region,state,city,PI_deaths,all_deaths,0_deaths,1-24_deaths,25-44_deaths,45-64_deaths,65+_deaths,check
104852,1962,1,01/06/1962,3,IL,Chicago,51.0,859.0,62.0,29.0,62.0,266.0,440.0,0.0
104853,1962,2,01/13/1962,3,IL,Chicago,40.0,839.0,48.0,22.0,59.0,225.0,485.0,0.0
104854,1962,3,01/20/1962,3,IL,Chicago,55.0,894.0,58.0,23.0,74.0,260.0,479.0,0.0
104855,1962,4,01/27/1962,3,IL,Chicago,45.0,877.0,54.0,30.0,69.0,247.0,477.0,0.0
104856,1962,5,02/03/1962,3,IL,Chicago,45.0,847.0,49.0,29.0,52.0,241.0,476.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
107683,2016,35,09/03/2016,3,IL,Chicago,12.0,251.0,3.0,10.0,18.0,78.0,142.0,0.0
107684,2016,36,09/10/2016,3,IL,Chicago,18.0,335.0,4.0,13.0,38.0,95.0,185.0,0.0
107685,2016,37,09/17/2016,3,IL,Chicago,18.0,282.0,3.0,11.0,24.0,59.0,185.0,0.0
107686,2016,38,09/24/2016,3,IL,Chicago,21.0,255.0,3.0,7.0,21.0,64.0,160.0,0.0


In [113]:
dict_year = {}
for y in np.arange(1962, 2017):
    dict_year[y] = df[df.year == y]

In [114]:
dy = dict_year[1996]

dy[dy.isnull().any(axis=1)].head(20)
dy[dy.check == 104]

Unnamed: 0,year,week,week_end,region,state,city,PI_deaths,all_deaths,0_deaths,1-24_deaths,25-44_deaths,45-64_deaths,65+_deaths,check
18975,1996,43,10/26/1996,1,MA,Lynn,,,,,,,,
18976,1996,44,11/02/1996,1,MA,Lynn,,,,,,,,
44641,1996,39,09/28/1996,2,PA,Allentown,,,,,,,,
44642,1996,40,10/05/1996,2,PA,Allentown,,,,,,,,
47498,1996,50,12/14/1996,2,NY,Buffalo,,,,,,,,
99086,1996,1,01/06/1996,3,,,9.0,62.0,0.0,0.0,6.0,12.0,44.0,0.0
99087,1996,2,01/13/1996,3,,,7.0,69.0,1.0,2.0,9.0,14.0,43.0,0.0
99088,1996,3,01/20/1996,3,,,6.0,68.0,1.0,1.0,9.0,14.0,43.0,0.0
99089,1996,4,01/27/1996,3,,,4.0,52.0,2.0,0.0,6.0,11.0,33.0,0.0
99090,1996,5,02/03/1996,3,,,7.0,70.0,1.0,2.0,4.0,11.0,52.0,0.0


Unnamed: 0,year,week,week_end,region,state,city,PI_deaths,all_deaths,0_deaths,1-24_deaths,25-44_deaths,45-64_deaths,65+_deaths,check


We can also split the main DataFrames by city, to be able to plot time series for each of them.

In [132]:
df.city.unique()
df.city.unique()[0]

array(['Boston', 'Hartford', 'Bridgeport', 'Cambridge', 'Somerville',
       'Allentown', 'Fall River', 'Camden', 'Yonkers', 'Lowell', 'Gary',
       'South Bend', 'Youngstown', 'Des Moines', 'Kansas City', 'Lincoln',
       'Lynn', 'New Bedford', 'New Haven', 'Providence', 'Springfield',
       'Waterbury', 'Worcester', 'Albany', 'Buffalo', 'Elizabeth', 'Erie',
       'Jersey City', 'New York', 'Newark', 'Paterson', 'Philadelphia',
       'Pittsburgh', 'Reading', 'Rochester', 'Schenectady', 'Scranton',
       'Syracuse', 'Trenton', 'Utica', nan, 'Akron', 'Canton', 'Chicago',
       'Cincinnati', 'Cleveland', 'Columbus', 'Dayton', 'Detroit',
       'Evansville', 'Fort Wayne', 'Grand Rapids', 'Indianapolis',
       'Lansing', 'Milwaukee', 'Peoria', 'Rockford', 'Toledo', 'Duluth',
       'Minneapolis', 'Omaha', 'Saint Louis', 'Saint Paul', 'Wichita',
       'Atlanta', 'Wilimington', 'Baltimore', 'Charlotte', 'Jacksonville',
       'Birmingham', 'Baton Rouge', 'Corpus Christi', 'New Orlea

'Boston'

In [118]:
dict_city = {}
for c in df.city.unique():
    dict_city[c] = df[df.city == c]

In [151]:
dc = dict_city[df.city.unique()[3]]

dc.info()
dc[dc.isnull().any(axis=1)].head(50)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2857 entries, 5711 to 8575
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   year          2857 non-null   int64  
 1   week          2857 non-null   int64  
 2   week_end      2857 non-null   object 
 3   region        2857 non-null   int64  
 4   state         2857 non-null   object 
 5   city          2857 non-null   object 
 6   PI_deaths     2831 non-null   float64
 7   all_deaths    2855 non-null   float64
 8   0_deaths      2738 non-null   float64
 9   1-24_deaths   2739 non-null   float64
 10  25-44_deaths  2773 non-null   float64
 11  45-64_deaths  2843 non-null   float64
 12  65+_deaths    2855 non-null   float64
 13  check         2729 non-null   float64
dtypes: float64(8), int64(3), object(3)
memory usage: 334.8+ KB


Unnamed: 0,year,week,week_end,region,state,city,PI_deaths,all_deaths,0_deaths,1-24_deaths,25-44_deaths,45-64_deaths,65+_deaths,check
7702,2000,3,01/22/2000,1,MA,Cambridge,,,,,,,,
7703,2000,4,01/29/2000,1,MA,Cambridge,,,,,,,,
8156,2008,40,10/04/2008,1,MA,Cambridge,3.0,18.0,,,2.0,5.0,11.0,
8157,2008,41,10/11/2008,1,MA,Cambridge,1.0,14.0,,,1.0,3.0,10.0,
8158,2008,42,10/18/2008,1,MA,Cambridge,3.0,16.0,,,2.0,5.0,9.0,
8159,2008,43,10/25/2008,1,MA,Cambridge,3.0,17.0,,,,2.0,15.0,
8160,2008,44,11/01/2008,1,MA,Cambridge,2.0,18.0,,1.0,1.0,1.0,15.0,
8161,2008,45,11/08/2008,1,MA,Cambridge,3.0,14.0,,,,3.0,11.0,
8162,2008,46,11/15/2008,1,MA,Cambridge,,19.0,,,,4.0,15.0,
8163,2008,47,11/22/2008,1,MA,Cambridge,1.0,8.0,,,,1.0,7.0,


Out of the rows with missing values, assuming that the total number of deaths is reliable,
* a good number of them just has a single NaN, which can then be inferred so that the total number of deaths is achieved;
* some other rows lack more than one value, but the numbers of deaths per age group that appear already sum up to the total number of deaths, hence the NaN's can be replaced with 0s;
* some other rows have multiple NaN's, which cannot be unambiguously determined: one strategy would be to replace all of them but one with the mean value in that city over the last n months/years and the final with what's required to achieve the total number of deaths (do this on the rightmost NaN, as it's likely to have a higher number of deaths among older sectors of the population and this can better accomodate a fluctuation that might otherwise look weird in the first columns -- if in a particular week, say, 5 toddlers died, when the average number is 0.8);
* finally there are rows with all NaN's, which are useless and can be discarded: from the looks of it I won't be losing more than a few data points (out of 52) per year, which sounds reasonable.

In [149]:
157 - np.array([5, np.nan, 7, 40, 105]).sum()
157 - 5 - np.nan - 7 - 40 - 105

nan

nan

In [39]:
df['Year'].unique()
df['WEEK'].unique()
df['Week Ending Date'].unique()
df['REGION'].unique()
df['State'].unique()
df['City'].unique()

df['Pneumonia and Influenza Deaths'].unique()
df['All Deaths'].unique()
df['<1 year (all cause deaths)'].unique()
df['1-24 years (all cause deaths)'].unique()
df['25-44 years'].unique()
df['45-64 years (all cause deaths)'].unique()
df['65+ years (all cause deaths)'].unique()

1969    6519
1964    6466
1997    6466
2003    6466
2008    6429
1986    6414
1980    6413
1975    6413
1992    6413
1971    6396
1970    6396
1968    6396
2001    6344
1998    6344
2000    6344
1962    6344
2002    6344
1967    6344
1963    6344
1965    6344
1966    6344
1999    6344
2007    6344
2006    6344
2005    6344
2004    6344
2014    6343
1972    6313
1981    6292
1973    6292
1974    6292
1976    6292
1977    6292
1978    6292
1979    6292
1989    6292
1982    6292
1990    6292
1996    6292
1995    6292
1994    6292
1983    6292
1991    6292
1993    6292
1988    6292
1987    6292
1985    6292
1984    6292
2013    6232
2011    6228
2010    6220
2012    6204
2015    6191
2009    6186
2016    4582
Name: Year, dtype: int64

array([1962, 1963, 1964, 1965, 1966, 1967, 1968, 1969, 1970, 1971, 1972,
       1973, 1974, 1975, 1976, 1977, 1978, 1979, 1980, 1981, 1982, 1983,
       1984, 1985, 1986, 1987, 1988, 1989, 1990, 1991, 1992, 1993, 1994,
       1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005,
       2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016])

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
       35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51,
       52, 53])

array(['01/06/1962', '01/13/1962', '01/20/1962', ..., '10/01/2016',
       '03/19/2016', '10/31/1986'], dtype=object)

array([1, 2, 3, 4, 5, 6, 7, 9, 8])

array(['MA', 'CT', 'PA', 'NJ', 'NY', 'IN', 'OH', 'IA', 'KS', 'NE', 'RI',
       nan, 'IL', 'MI', 'WI', 'MN', 'MO', 'GA', 'DE', 'MD', 'NC', 'FL',
       'AL', 'LA', 'TX', 'VA', 'DC', 'TN', 'KY', 'AR', 'OK', 'CA', 'NM',
       'ID', 'CO', 'NV', 'UT', 'AZ', 'HI', 'OR', 'WA'], dtype=object)

array(['Boston', 'Hartford', 'Bridgeport', 'Cambridge', 'Somerville',
       'Allentown', 'Fall River', 'Camden', 'Yonkers', 'Lowell', 'Gary',
       'South Bend', 'Youngstown', 'Des Moines', 'Kansas City', 'Lincoln',
       'Lynn', 'New Bedford', 'New Haven', 'Providence', 'Springfield',
       'Waterbury', 'Worcester', 'Albany', 'Buffalo', 'Elizabeth', 'Erie',
       'Jersey City', 'New York', 'Newark', 'Paterson', 'Philadelphia',
       'Pittsburgh', 'Reading', 'Rochester', 'Schenectady', 'Scranton',
       'Syracuse', 'Trenton', 'Utica', nan, 'Akron', 'Canton', 'Chicago',
       'Cincinnati', 'Cleveland', 'Columbus', 'Dayton', 'Detroit',
       'Evansville', 'Fort Wayne', 'Grand Rapids', 'Indianapolis',
       'Lansing', 'Milwaukee', 'Peoria', 'Rockford', 'Toledo', 'Duluth',
       'Minneapolis', 'Omaha', 'Saint Louis', 'Saint Paul', 'Wichita',
       'Atlanta', 'Wilimington', 'Baltimore', 'Charlotte', 'Jacksonville',
       'Birmingham', 'Baton Rouge', 'Corpus Christi', 'New Orlea

array([ 11.,   5.,  12.,   8.,  10.,  16.,  14.,   9.,  17.,  13.,   4.,
         2.,   6.,   3.,   1.,   7.,  18.,  28.,  23.,  22.,   0.,  20.,
        31.,  21.,  32.,  15.,  19.,  34.,  24.,  25.,  48.,  36.,  49.,
        30.,  29.,  27.,  38.,  37.,  44.,  26.,  35.,  33.,  55.,  46.,
        42.,  40.,  39.,  nan,  56.,  82.,  96.,  75.,  69.,  73.,  92.,
        93.,  98., 100.,  84.,  68.,  66.,  59.,  52.,  58.,  64.,  57.,
        45.,  43.,  41.,  47.,  50.,  71.,  67.,  60.,  72.,  95.,  91.,
       103., 118., 130., 171., 126., 101.,  74.,  77.,  79.,  70.,  80.,
        53.,  61., 132.,  83.,  54.,  51.,  87.,  65.,  63.,  90., 104.,
       110.,  62.,  76.,  78.,  86.,  97.,  89., 117., 123.,  85.,  88.,
       113., 174., 181., 150., 107.,  81., 102., 208., 242., 160., 120.,
       129., 106., 105., 124.,  99., 111., 108., 136.,  94., 114., 187.,
       280., 162., 131., 116., 112., 144., 119., 115., 145., 152., 175.,
       231., 148., 196., 154.])

array([ 262.,  270.,  237., ...,  814.,  923., 1738.])

array([ 10.,  14.,  11.,  22.,  15.,  16.,  19.,  12.,   6.,   7.,  18.,
        13.,   9.,  17.,   8.,  20.,  21.,   3.,  23.,  24.,   5.,   1.,
        27.,   4.,  30.,   2.,   0.,  nan,  26.,  91., 104.,  73., 102.,
        89.,  99.,  77., 100.,  84.,  76.,  96.,  94.,  95.,  80.,  87.,
        70.,  67.,  78., 101.,  58.,  93.,  85.,  81.,  69.,  86.,  61.,
        90.,  88., 105.,  83., 106., 112.,  75.,  63.,  82.,  66., 103.,
        71.,  64., 117.,  72., 113.,  79.,  62.,  97.,  57.,  68.,  74.,
        65.,  92.,  54.,  59.,  52.,  60.,  55.,  53.,  56.,  50.,  51.,
        46.,  32.,  39.,  41.,  45.,  98.,  48.,  49.,  40.,  42.,  44.,
        43.,  35.,  38.,  33.,  36.,  31.,  47.,  34.,  37.,  29.,  25.,
        28., 107., 123., 217., 173.])

array([  8.,   7.,   9.,  11.,  12.,  19.,   3.,  10.,   4.,  14.,  13.,
         6.,   5.,  18.,  15.,  17.,  16.,   2.,  21.,  24.,  20.,   0.,
         1.,  nan,  23.,  42.,  39.,  31.,  48.,  49.,  46.,  35.,  34.,
        63.,  58.,  50.,  41.,  38.,  29.,  28.,  32.,  37.,  40.,  51.,
        47.,  33.,  30.,  36.,  52.,  62.,  53.,  44.,  66.,  45.,  27.,
        43.,  55.,  56.,  54.,  59.,  60.,  65.,  57.,  68.,  64.,  73.,
        69.,  83.,  61.,  67.,  70.,  72.,  81.,  74.,  71.,  78.,  77.,
        88.,  79.,  75.,  25.,  26.,  22., 101.,  84.,  80.,  87.,  98.,
        90.,  86.,  85.,  92.,  76., 109.,  82.,  95., 100.,  94., 103.,
        99.,  91.,  89., 105.,  93., 125.,  97.,  96., 106., 104., 117.,
       108., 113., 111., 131.])

array([ 11.,  10.,   8.,  17.,  16.,  13.,  15.,  18.,  21.,  22.,  19.,
        26.,  12.,   9.,  14.,   4.,  23.,  20.,   7.,  24.,  25.,  30.,
         6.,  27.,   5.,  29.,  40.,  28.,   3.,  31.,  33.,  34.,  35.,
         2.,  nan,  32.,   0.,   1., 146., 116., 118.,  93., 106., 114.,
       126., 101.,  96., 139., 112., 104., 110., 100., 115., 102., 109.,
        94., 108., 128.,  99., 120., 124., 111.,  85., 117., 107., 129.,
        90.,  88., 123., 122., 121., 103.,  91.,  92., 105., 125., 127.,
       132., 119.,  97.,  95.,  98., 133., 141., 131.,  82., 130., 113.,
       137.,  87., 136., 134., 135.,  83., 142., 148., 151., 154., 176.,
       143., 149., 152., 183., 156., 140., 160., 153., 138.,  86.,  80.,
        77.,  84.,  76.,  78.,  71.,  89., 147., 144., 155.,  74.,  79.,
        81.,  75.,  60., 145., 150., 159., 164., 165., 182., 170., 173.,
       190., 169., 162., 195., 157., 163., 171., 168., 161., 158., 174.,
       177., 191., 181., 186., 179., 187., 197., 18

array([ 87.,  70.,  66.,  73.,  62.,  79.,  71.,  85.,  80.,  74.,  92.,
        76.,  59.,  75.,  72.,  55.,  84.,  61.,  69.,  50.,  78.,  54.,
        82.,  67.,  83.,  81., 110.,  77.,  89.,  57.,  60.,  52.,  58.,
        65.,  64.,  56.,  32.,  51.,  90.,  97.,  63.,  53.,  68.,  49.,
        46., 118.,  93.,  91.,  94.,  88.,  44.,  45.,  86.,  96., 111.,
        42.,  43.,  48.,  47., 102., 100.,  40.,  33.,  39., 103.,  34.,
        29.,  38.,  26.,  36.,  20.,  37.,  30.,  41.,  28.,  35.,  31.,
        23.,  25.,  27.,  24.,  22.,  21.,  12.,  19.,  18.,  15.,  17.,
        nan,   3.,   9.,   1.,   6.,   5.,  16.,  11.,  10.,   7.,  14.,
         8.,   4.,   2.,  13.,   0., 518., 531., 523., 522., 474., 534.,
       500., 535., 525., 581., 550., 544., 498., 504., 501., 468., 519.,
       490., 515., 450., 419., 467., 464., 541., 443., 462., 459., 461.,
       409., 440., 420., 438., 449., 456., 493., 471., 506., 499., 492.,
       494., 441., 555., 533., 616., 560., 548., 54

array([146., 167., 142., ..., 468., 531., 555.])

In [7]:
df.info()
df.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 346342 entries, 0 to 346341
Data columns (total 13 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   Year                            346342 non-null  int64  
 1   WEEK                            346342 non-null  int64  
 2   Week Ending Date                346342 non-null  object 
 3   REGION                          346342 non-null  int64  
 4   State                           340300 non-null  object 
 5   City                            340300 non-null  object 
 6   Pneumonia and Influenza Deaths  343058 non-null  float64
 7   All Deaths                      344871 non-null  float64
 8   <1 year (all cause deaths)      338953 non-null  float64
 9   1-24 years (all cause deaths)   339374 non-null  float64
 10  25-44 years                     342711 non-null  float64
 11  45-64 years (all cause deaths)  344662 non-null  float64
 12  65+ years (all c

Unnamed: 0,Year,WEEK,REGION,Pneumonia and Influenza Deaths,All Deaths,<1 year (all cause deaths),1-24 years (all cause deaths),25-44 years,45-64 years (all cause deaths),65+ years (all cause deaths)
count,346342.0,346342.0,346342.0,343058.0,344871.0,338953.0,339374.0,342711.0,344662.0,344845.0
mean,1988.804497,26.499229,4.651001,5.435617,100.78113,3.283939,3.096304,7.320713,23.29007,63.899192
std,15.796506,15.038563,2.684513,7.569441,146.427288,5.523174,5.088646,13.279231,35.829121,91.762139
min,1962.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1975.0,13.0,2.0,1.0,37.0,0.0,1.0,2.0,7.0,24.0
50%,1989.0,26.0,4.0,3.0,63.0,2.0,2.0,4.0,14.0,41.0
75%,2002.0,39.0,7.0,7.0,119.0,4.0,4.0,8.0,27.0,76.0
max,2016.0,53.0,9.0,280.0,2550.0,217.0,131.0,489.0,728.0,1785.0


In [5]:
df.shape

(346342, 13)