In [2]:
import pandas as pd

In [3]:
raw = pd.read_csv("raw/County_Level_of_Community_Transmission.csv")

In [9]:
print(raw)

           state_name        county_name  fips_code        date  \
0               Texas        Hays County      48209  01/27/2020   
1               Idaho         Gem County      16045  01/27/2020   
2            Nebraska        Boyd County      31015  10/15/2020   
3               Texas        Knox County      48275  11/03/2020   
4               Idaho   Nez Perce County      16069  11/03/2020   
...               ...                ...        ...         ...   
3225217     Louisiana  Assumption Parish      22007  10/18/2022   
3225218  South Dakota        Hyde County      46069  10/18/2022   
3225219         Texas      Loving County      48301  10/18/2022   
3225220      Virginia     Lynchburg City      51680  10/18/2022   
3225221       Wyoming    Converse County      56009  10/18/2022   

        cases_per_100K_7_day_count_change  \
0                                   0.000   
1                                   0.000   
2                              suppressed   
3              

In [None]:
raw.head()

In [10]:
raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3225222 entries, 0 to 3225221
Data columns (total 7 columns):
 #   Column                                              Dtype  
---  ------                                              -----  
 0   state_name                                          object 
 1   county_name                                         object 
 2   fips_code                                           int64  
 3   date                                                object 
 4   cases_per_100K_7_day_count_change                   object 
 5   percent_test_results_reported_positive_last_7_days  float64
 6   community_transmission_level                        object 
dtypes: float64(1), int64(1), object(5)
memory usage: 172.2+ MB


In [40]:
# raw.describe() treats fips_code (federal information processing standards) 
# as a numeric column and calculate statistical metrics
# so, convert fips_code to strings.
raw['fips_code'] = raw['fips_code'].astype(str)
# raw.describe() does not calculate metrics in column cases_per_100K_7_day_count_change 
# because there are non-numerical entries in this column.
# so, convert non-numeric to numeric.
raw['cases_per_100K_7_day_count_change'] = pd.to_numeric(raw['cases_per_100K_7_day_count_change'], 
errors = 'coerce')

In [41]:
raw.describe()

Unnamed: 0,cases_per_100K_7_day_count_change,percent_test_results_reported_positive_last_7_days
count,2468979.0,2823884.0
mean,189.8672,11.54418
std,201.6008,10.76036
min,-901.88,0.0
25%,38.795,3.79
50%,127.707,8.64
75%,273.0585,16.18
max,999.997,100.0


Interpretation

Cases_per_100K_7_day_count_change

count: 2,468,979 valid non-null values.
mean: The average 7-day case count change per 100,000 people is 189.87.
std: The standard deviation is 201.60, indicating significant variability in the case count changes.
min: The lowest change is -901.88 cases, which could indicate a large decrease in cases during a 7-day period.
Percentiles:
25%: 38.80 (25% of the data is below this value).
50% (median): 127.71 (half the data is below this value).
75%: 273.06 (25% of the data is above this value).
max: 999.997 cases per 100,000, likely indicating a very high increase.

Percent_test_results_reported_positive_last_7_days

count: 2,823,884 valid non-NaN values.
mean: The average positivity rate over the last 7 days is 11.54%.
std: The standard deviation is 10.76%, indicating some variation in positivity rates.
min: The lowest positivity rate is 0%, indicating areas with no positive cases in the last 7 days.
Percentiles:
25%: 3.79%.
50% (median): 8.64%.
75%: 16.18%.
max: 100%, suggesting some areas are reporting exclusively positive results.

A positivity rate of 100% may indicate areas where testing is limited or only symptomatic individuals are being tested.
The wide range from 0% to 100% suggests a significant disparity in how COVID-19 impacts different regions.


HANDLE NON-NUMERIC VALUES

In [42]:
import pandas as pd

In [43]:
clean = pd.read_csv("clean/clean_data_2.csv")

In [74]:
# Column 'cases_per_100K_7_day_count_change' has both numeric and categorical values.

# Convert all values to numeric.

# identify the non-numeric
non_numeric = clean.loc[~clean['cases_per_100K_7_day_count_change'].apply(lambda x: str(x).replace('.', '', 1).isdigit())]

# count non_numeric.
non_numeric_count = clean.loc[~clean['cases_per_100K_7_day_count_change'].apply(lambda x: str(x).replace('.', '', 1).isdigit())].shape[0]
non_numeric_count

# Replace non-numeric values with 0
clean['cases_per_100K_7_day_count_change'] = pd.to_numeric(
    clean['cases_per_100K_7_day_count_change'], errors='coerce'
).fillna(0)

# Verify the changes
print(clean['cases_per_100K_7_day_count_change'])

0            0.000
1            0.000
2            0.000
3          354.803
4          544.447
            ...   
3225217      0.000
3225218      0.000
3225219      0.000
3225220     26.770
3225221      0.000
Name: cases_per_100K_7_day_count_change, Length: 3225222, dtype: float64


MISSING DATA

In [52]:
import pandas as pd

In [53]:
raw = pd.read_csv("raw/County_Level_of_community_transmission.csv")

In [54]:
raw.isnull()

Unnamed: 0,state_name,county_name,fips_code,date,cases_per_100K_7_day_count_change,percent_test_results_reported_positive_last_7_days,community_transmission_level
0,False,False,False,False,False,True,True
1,False,False,False,False,False,True,True
2,False,False,False,False,False,False,False
3,False,False,False,False,False,True,False
4,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...
3225217,False,False,False,False,False,False,False
3225218,False,False,False,False,False,True,False
3225219,False,False,False,False,False,False,False
3225220,False,False,False,False,False,False,False


In [55]:
raw.isnull().sum()

state_name                                              2662
county_name                                             2662
fips_code                                                  0
date                                                       0
cases_per_100K_7_day_count_change                          0
percent_test_results_reported_positive_last_7_days    401338
community_transmission_level                           22661
dtype: int64

In [56]:
# % of missing values in each column = number of missing values/total values * 100
raw.isnull().sum()/len(raw) * 100

state_name                                             0.082537
county_name                                            0.082537
fips_code                                              0.000000
date                                                   0.000000
cases_per_100K_7_day_count_change                      0.000000
percent_test_results_reported_positive_last_7_days    12.443733
community_transmission_level                           0.702618
dtype: float64

In [57]:
# Because the % of missing values is low in state_name column, 
# We'll fillin with a placeholder 'unknown'instead of mode.
# Save in a clean file, assign it to csv.
clean = raw.fillna({"state_name": "unknown"})
clean.to_csv("clean_data_2.csv", index=False)

In [58]:
clean.isnull().sum()

state_name                                                 0
county_name                                             2662
fips_code                                                  0
date                                                       0
cases_per_100K_7_day_count_change                          0
percent_test_results_reported_positive_last_7_days    401338
community_transmission_level                           22661
dtype: int64

In [20]:
import shutil
source = "clean_data_2.csv"
destination = "clean/"
shutil.move(source, destination)

Error: Destination path 'clean/clean_data_2.csv' already exists

In [59]:
# fillin county_name with the placeholder 'unknown'.
clean.fillna({"county_name": "unknown"}, inplace=True)

In [60]:
clean.isnull().sum()

state_name                                                 0
county_name                                                0
fips_code                                                  0
date                                                       0
cases_per_100K_7_day_count_change                          0
percent_test_results_reported_positive_last_7_days    401338
community_transmission_level                           22661
dtype: int64

In [61]:
# Filling column 'percent_test_results_reported_positive_last_7_days'.
# This column has numeric values. We'll fillin with a median_value.
median_value = clean["percent_test_results_reported_positive_last_7_days"].median()
print(median_value)

8.64


In [62]:
clean.fillna({"percent_test_results_reported_positive_last_7_days": 8.64}, inplace=True)

In [63]:
clean.isnull().sum()

state_name                                                0
county_name                                               0
fips_code                                                 0
date                                                      0
cases_per_100K_7_day_count_change                         0
percent_test_results_reported_positive_last_7_days        0
community_transmission_level                          22661
dtype: int64

In [64]:
# Filling column 'community_transmission_level' with a mode.
# we use iloc[0] just in case there are more than one mode.
mode_value = clean["community_transmission_level"].mode().iloc[0]
print(mode_value)

high


In [65]:
clean.fillna({"community_transmission_level": "high"}, inplace=True)

In [66]:
clean.isnull().sum()

state_name                                            0
county_name                                           0
fips_code                                             0
date                                                  0
cases_per_100K_7_day_count_change                     0
percent_test_results_reported_positive_last_7_days    0
community_transmission_level                          0
dtype: int64

DUPLICATES

In [67]:
clean.duplicated().sum()

np.int64(0)

In [68]:
duplicat_count = int(clean.duplicated().sum())
duplicat_count

0

DATETIME

In [69]:
# convert date column into a datetime object.
clean['date'] = pd.to_datetime(clean['date'], format='%m/%d/%Y', errors='coerce')

In [75]:
clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3225222 entries, 0 to 3225221
Data columns (total 7 columns):
 #   Column                                              Dtype         
---  ------                                              -----         
 0   state_name                                          object        
 1   county_name                                         object        
 2   fips_code                                           int64         
 3   date                                                datetime64[ns]
 4   cases_per_100K_7_day_count_change                   float64       
 5   percent_test_results_reported_positive_last_7_days  float64       
 6   community_transmission_level                        object        
dtypes: datetime64[ns](1), float64(2), int64(1), object(3)
memory usage: 172.2+ MB


In [71]:
clean

Unnamed: 0,state_name,county_name,fips_code,date,cases_per_100K_7_day_count_change,percent_test_results_reported_positive_last_7_days,community_transmission_level
0,Texas,Hays County,48209,2020-01-27,0.000,8.64,high
1,Idaho,Gem County,16045,2020-01-27,0.000,8.64,high
2,Nebraska,Boyd County,31015,2020-10-15,suppressed,45.00,high
3,Texas,Knox County,48275,2020-11-03,354.803,8.64,high
4,Idaho,Nez Perce County,16069,2020-11-03,544.447,27.68,high
...,...,...,...,...,...,...,...
3225217,Louisiana,Assumption Parish,22007,2022-10-18,suppressed,0.00,low
3225218,South Dakota,Hyde County,46069,2022-10-18,0.000,8.64,low
3225219,Texas,Loving County,48301,2022-10-18,suppressed,14.29,high
3225220,Virginia,Lynchburg City,51680,2022-10-18,26.770,8.01,substantial


In [72]:
clean.isnull().sum()

state_name                                            0
county_name                                           0
fips_code                                             0
date                                                  0
cases_per_100K_7_day_count_change                     0
percent_test_results_reported_positive_last_7_days    0
community_transmission_level                          0
dtype: int64

In [73]:
clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3225222 entries, 0 to 3225221
Data columns (total 7 columns):
 #   Column                                              Dtype         
---  ------                                              -----         
 0   state_name                                          object        
 1   county_name                                         object        
 2   fips_code                                           int64         
 3   date                                                datetime64[ns]
 4   cases_per_100K_7_day_count_change                   object        
 5   percent_test_results_reported_positive_last_7_days  float64       
 6   community_transmission_level                        object        
dtypes: datetime64[ns](1), float64(1), int64(1), object(4)
memory usage: 172.2+ MB
