In [1]:
from github import Github
import os
import pickle
import base64
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import seaborn as sns
import plotly.express as px
from ipywidgets import interact
import datetime as dt
sns.set()

In [2]:
with open("github.txt") as myfile:
    firstNlines=myfile.readlines()[0:2]
myfile.close()
g = Github(firstNlines[0].strip(), firstNlines[1])

In [3]:
repo=g.get_repo('CSSEGISandData/COVID-19')
contents = repo.get_contents("")

In [4]:
def get_sha_for_tag(repository, tag):
    """
    Returns a commit PyGithub object for the specified repository and tag.
    """
    branches = repository.get_branches()
    matched_branches = [match for match in branches if match.name == tag]
    if matched_branches:
        return matched_branches[0].commit.sha

    tags = repository.get_tags()
    matched_tags = [match for match in tags if match.name == tag]
    if not matched_tags:
        raise ValueError('No Tag or Branch exists with that name')
    return matched_tags[0].commit.sha

In [5]:
def download_directory(repository, sha, server_path, local_path='data_csse/'):
    """
    Download all contents at server_path with commit tag sha in
    the repository.
    """
    contents = repository.get_contents(server_path, ref=sha)
    if not os.path.exists(local_path):
        os.makedirs(local_path)
    for content in contents:
        #print("Processing %s" % content.path)
        if content.type == 'dir':
            download_directory(repository, sha, content.path)
        else:
            try:
                path = content.path
                file_content = repository.get_contents(path, ref=sha)
                file_data = base64.b64decode(file_content.content).decode('ascii')
                file_out = open(local_path+content.name, "w")
                file_out.write(local_path+file_data)
                file_out.close()
            except:
                pass

In [6]:
sha = get_sha_for_tag(repo, 'master')

In [7]:
download_directory(repo, sha, 'csse_covid_19_data/csse_covid_19_daily_reports')

In [8]:
from os import listdir
from os.path import isfile, join
mypath = 'data_csse/'
onlyfiles = [f for f in listdir(mypath) if isfile(join(mypath, f))]

In [9]:
def is_non_zero_file(fpath):
    return os.path.isfile(fpath) and os.path.getsize(fpath) > 0

In [10]:
nonzero_files = []
for file in onlyfiles:
    if is_non_zero_file(mypath + file) and file[-3:]=='csv':
        nonzero_files.append(mypath + file)

In [11]:
df_list = []
for file in nonzero_files:
    df_1 = pd.read_csv(file)
    df_1['filename'] = file[-15:]
    df_list.append(df_1)  

In [12]:
df = pd.concat(df_list, ignore_index=True)

In [13]:
df['Country/Region'] = df['Country/Region'].str.replace('Mainland China', 'China')

In [14]:
df.head()

Unnamed: 0,data_csse/Province/State,Country/Region,Last Update,Confirmed,Deaths,Recovered,filename,Latitude,Longitude
0,Hubei,China,2/1/2020 11:53,7153,249,168,/02-01-2020.csv,,
1,Zhejiang,China,2/1/2020 10:53,599,0,21,/02-01-2020.csv,,
2,Guangdong,China,2/1/2020 14:23,535,0,14,/02-01-2020.csv,,
3,Henan,China,2/1/2020 1:52,422,2,3,/02-01-2020.csv,,
4,Hunan,China,2/1/2020 11:03,389,0,8,/02-01-2020.csv,,


In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5999 entries, 0 to 5998
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   data_csse/Province/State  3453 non-null   object 
 1   Country/Region            5999 non-null   object 
 2   Last Update               5999 non-null   object 
 3   Confirmed                 5999 non-null   int64  
 4   Deaths                    5999 non-null   int64  
 5   Recovered                 5999 non-null   int64  
 6   filename                  5999 non-null   object 
 7   Latitude                  3674 non-null   float64
 8   Longitude                 3674 non-null   float64
dtypes: float64(2), int64(3), object(4)
memory usage: 421.9+ KB


In [22]:
df = df.rename(columns={'data_csse/Province/State':'Province/State'})

In [23]:
df['Date'] = pd.to_datetime(df['Last Update'])

In [24]:
df['Diamond Princess'] = df['Province/State'].str.contains('Diamond Princess')

In [27]:
df = df.drop_duplicates()

In [28]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3732 entries, 0 to 5998
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   Province/State    2202 non-null   object        
 1   Country/Region    3732 non-null   object        
 2   Last Update       3732 non-null   object        
 3   Confirmed         3732 non-null   int64         
 4   Deaths            3732 non-null   int64         
 5   Recovered         3732 non-null   int64         
 6   Latitude          2429 non-null   float64       
 7   Longitude         2429 non-null   float64       
 8   Date              3732 non-null   datetime64[ns]
 9   Diamond Princess  2202 non-null   object        
dtypes: datetime64[ns](1), float64(2), int64(3), object(4)
memory usage: 320.7+ KB


In [29]:
df['Province/State'] = df['Province/State'].str.replace(' \(From Diamond Princess\)', '').replace('None', 'From Diamond Princess')

In [30]:
df.head()

Unnamed: 0,Province/State,Country/Region,Last Update,Confirmed,Deaths,Recovered,Latitude,Longitude,Date,Diamond Princess
0,Hubei,China,2/1/2020 11:53,7153,249,168,,,2020-02-01 11:53:00,False
1,Zhejiang,China,2/1/2020 10:53,599,0,21,,,2020-02-01 10:53:00,False
2,Guangdong,China,2/1/2020 14:23,535,0,14,,,2020-02-01 14:23:00,False
3,Henan,China,2/1/2020 1:52,422,2,3,,,2020-02-01 01:52:00,False
4,Hunan,China,2/1/2020 11:03,389,0,8,,,2020-02-01 11:03:00,False


In [31]:
df['DateTime'] = df['Date']

In [32]:
df['Date'] = df['DateTime'].apply(lambda x: x.date())

In [33]:
df['Date'] = df['Date'].apply(lambda x: pd.to_datetime(x))

In [34]:
df['Date'] = df['Date'].apply(lambda x: str(x))

In [35]:
# faulty data, 1/31 has only 38 confirmed cases in China. We'll start data using 2/1.
jan_31 = df[df['Date']=='2020-01-31 00:00:00'].index
df.drop(jan_31, inplace=True)

In [36]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3710 entries, 0 to 5998
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   Province/State    2198 non-null   object        
 1   Country/Region    3710 non-null   object        
 2   Last Update       3710 non-null   object        
 3   Confirmed         3710 non-null   int64         
 4   Deaths            3710 non-null   int64         
 5   Recovered         3710 non-null   int64         
 6   Latitude          2429 non-null   float64       
 7   Longitude         2429 non-null   float64       
 8   Date              3710 non-null   object        
 9   Diamond Princess  2198 non-null   object        
 10  DateTime          3710 non-null   datetime64[ns]
dtypes: datetime64[ns](1), float64(2), int64(3), object(5)
memory usage: 347.8+ KB


In [37]:
df.head()

Unnamed: 0,Province/State,Country/Region,Last Update,Confirmed,Deaths,Recovered,Latitude,Longitude,Date,Diamond Princess,DateTime
0,Hubei,China,2/1/2020 11:53,7153,249,168,,,2020-02-01 00:00:00,False,2020-02-01 11:53:00
1,Zhejiang,China,2/1/2020 10:53,599,0,21,,,2020-02-01 00:00:00,False,2020-02-01 10:53:00
2,Guangdong,China,2/1/2020 14:23,535,0,14,,,2020-02-01 00:00:00,False,2020-02-01 14:23:00
3,Henan,China,2/1/2020 1:52,422,2,3,,,2020-02-01 00:00:00,False,2020-02-01 01:52:00
4,Hunan,China,2/1/2020 11:03,389,0,8,,,2020-02-01 00:00:00,False,2020-02-01 11:03:00


In [38]:
df['Active'] = df['Confirmed']-df['Deaths']-df['Recovered']

In [39]:
df.loc[df['Country/Region']=='South Africa']

Unnamed: 0,Province/State,Country/Region,Last Update,Confirmed,Deaths,Recovered,Latitude,Longitude,Date,Diamond Princess,DateTime,Active
3048,,South Africa,2020-03-05T13:53:03,1,0,0,-30.5595,22.9375,2020-03-05 00:00:00,,2020-03-05 13:53:03,1
3236,,South Africa,2020-03-06T15:43:02,1,0,0,-30.5595,22.9375,2020-03-06 00:00:00,,2020-03-06 15:43:02,1
3641,,South Africa,2020-03-08T10:23:04,3,0,0,-30.5595,22.9375,2020-03-08 00:00:00,,2020-03-08 10:23:04,3
4140,,South Africa,2020-03-10T05:13:07,7,0,0,-30.5595,22.9375,2020-03-10 00:00:00,,2020-03-10 05:13:07,7
4333,,South Africa,2020-03-11T10:33:03,13,0,0,-30.5595,22.9375,2020-03-11 00:00:00,,2020-03-11 10:33:03,13
4552,,South Africa,2020-03-12T09:13:24,17,0,0,-30.5595,22.9375,2020-03-12 00:00:00,,2020-03-12 09:13:24,17
4764,,South Africa,2020-03-14T16:14:56,38,0,0,-30.5595,22.9375,2020-03-14 00:00:00,,2020-03-14 16:14:56,38
5005,,South Africa,2020-03-15T18:20:18,51,0,0,-30.5595,22.9375,2020-03-15 00:00:00,,2020-03-15 18:20:18,51
5264,,South Africa,2020-03-16T14:38:45,62,0,0,-30.5595,22.9375,2020-03-16 00:00:00,,2020-03-16 14:38:45,62
5804,,South Africa,2020-03-18T10:53:02,116,0,0,-30.5595,22.9375,2020-03-18 00:00:00,,2020-03-18 10:53:02,116


In [40]:
df['Log Confirmed'] = np.log10(df['Confirmed']+1)

In [41]:
grouped_df = df.groupby(['Date', 'Country/Region']).agg({'Confirmed': 'sum', 'Recovered': 'sum',
                                                    'Active': 'sum', 'Deaths': 'sum'}).reset_index()

In [42]:
grouped_df['Log Confirmed'] = np.log10(grouped_df['Confirmed']+1)

In [43]:
grouped_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1724 entries, 0 to 1723
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Date            1724 non-null   object 
 1   Country/Region  1724 non-null   object 
 2   Confirmed       1724 non-null   int64  
 3   Recovered       1724 non-null   int64  
 4   Active          1724 non-null   int64  
 5   Deaths          1724 non-null   int64  
 6   Log Confirmed   1724 non-null   float64
dtypes: float64(1), int64(4), object(2)
memory usage: 94.4+ KB


1. Check for each country if it has entries for every date
2. If country is missing entry for that date, use most previous entry

In [44]:
country_list = grouped_df['Country/Region'].unique()

In [45]:
country_list

array(['Australia', 'Canada', 'China', 'France', 'Germany', 'Hong Kong',
       'Japan', 'Philippines', 'Singapore', 'South Korea', 'Spain',
       'Sweden', 'UK', 'US', 'Vietnam', 'India', 'Macau',
       'United Arab Emirates', 'Belgium', 'Malaysia', 'Taiwan',
       'Thailand', 'Italy', 'Others', 'Sri Lanka', 'Cambodia', 'Finland',
       'Nepal', 'Russia', 'Egypt', 'Iran', 'Israel', 'Lebanon', 'Iraq',
       'Afghanistan', 'Bahrain', 'Kuwait', 'Oman', 'Algeria', 'Austria',
       'Croatia', 'Switzerland', 'Brazil', 'Georgia', 'Greece',
       'North Macedonia', 'Norway', 'Pakistan', 'Romania', 'Denmark',
       'Estonia', 'Netherlands', 'San Marino', ' Azerbaijan', 'Belarus',
       'Iceland', 'Lithuania', 'Mexico', 'New Zealand', 'Nigeria',
       'North Ireland', 'Ireland', 'Luxembourg', 'Monaco', 'Qatar',
       'Armenia', 'Azerbaijan', 'Czech Republic', 'Dominican Republic',
       'Ecuador', 'Andorra', 'Indonesia', 'Latvia', 'Morocco', 'Portugal',
       'Saudi Arabia', 'Seneg

In [46]:
total_dates = list(set(grouped_df['Date'].unique()))

In [47]:
temp_dict = {}
for country in country_list:
    date_list = list(grouped_df[grouped_df['Country/Region']==country]['Date'])
    missing_dates = list(set(total_dates)-set(date_list))
    temp_dict[country]=missing_dates
    
#     df_temp = pd.DataFrame({'Country/Region':country, 'Date':missing_dates})
#     grouped_df_2.append(df_temp, ignore_index=True)
    

In [48]:
for key in temp_dict:
    print(key)

Australia
Canada
China
France
Germany
Hong Kong
Japan
Philippines
Singapore
South Korea
Spain
Sweden
UK
US
Vietnam
India
Macau
United Arab Emirates
Belgium
Malaysia
Taiwan
Thailand
Italy
Others
Sri Lanka
Cambodia
Finland
Nepal
Russia
Egypt
Iran
Israel
Lebanon
Iraq
Afghanistan
Bahrain
Kuwait
Oman
Algeria
Austria
Croatia
Switzerland
Brazil
Georgia
Greece
North Macedonia
Norway
Pakistan
Romania
Denmark
Estonia
Netherlands
San Marino
 Azerbaijan
Belarus
Iceland
Lithuania
Mexico
New Zealand
Nigeria
North Ireland
Ireland
Luxembourg
Monaco
Qatar
Armenia
Azerbaijan
Czech Republic
Dominican Republic
Ecuador
Andorra
Indonesia
Latvia
Morocco
Portugal
Saudi Arabia
Senegal
Argentina
Chile
Jordan
Ukraine
Faroe Islands
Gibraltar
Hungary
Liechtenstein
Poland
Saint Barthelemy
Tunisia
Bosnia and Herzegovina
Palestine
Slovenia
South Africa
Bhutan
Cameroon
Colombia
Costa Rica
Peru
Serbia
Slovakia
Togo
Vatican City
French Guiana
Malta
Martinique
Bangladesh
Bulgaria
Maldives
Moldova
Paraguay
Republic of Ire

In [49]:
missing_dates_df = pd.DataFrame.from_dict(temp_dict, orient='index')

In [50]:
missing_dates_df = missing_dates_df.reset_index()

In [51]:
missing_dates_df.head()

Unnamed: 0,index,0,1,2,3,4,5,6,7,8,...,36,37,38,39,40,41,42,43,44,45
0,Australia,2020-03-12 00:00:00,2020-02-15 00:00:00,2020-02-23 00:00:00,2020-02-18 00:00:00,2020-02-05 00:00:00,2020-02-11 00:00:00,2020-02-12 00:00:00,2020-02-28 00:00:00,2020-02-25 00:00:00,...,,,,,,,,,,
1,Canada,2020-02-02 00:00:00,2020-02-13 00:00:00,2020-02-22 00:00:00,2020-02-06 00:00:00,2020-02-11 00:00:00,2020-02-19 00:00:00,2020-02-10 00:00:00,2020-02-08 00:00:00,2020-02-26 00:00:00,...,,,,,,,,,,
2,China,,,,,,,,,,...,,,,,,,,,,
3,France,2020-02-04 00:00:00,2020-03-12 00:00:00,2020-02-23 00:00:00,2020-02-18 00:00:00,2020-02-02 00:00:00,2020-02-13 00:00:00,2020-02-05 00:00:00,2020-02-21 00:00:00,2020-02-11 00:00:00,...,,,,,,,,,,
4,Germany,2020-02-06 00:00:00,2020-02-22 00:00:00,2020-02-05 00:00:00,2020-02-19 00:00:00,2020-03-13 00:00:00,2020-02-10 00:00:00,2020-02-08 00:00:00,2020-02-20 00:00:00,2020-02-04 00:00:00,...,,,,,,,,,,


In [52]:
missing_dates_df = pd.melt(missing_dates_df, id_vars='index')

In [53]:
missing_dates_df.drop(columns=['variable'], inplace=True)

In [54]:
missing_dates_df

Unnamed: 0,index,value
0,Australia,2020-03-12 00:00:00
1,Canada,2020-02-02 00:00:00
2,China,
3,France,2020-02-04 00:00:00
4,Germany,2020-02-06 00:00:00
...,...,...
8827,Djibouti,2020-02-29 00:00:00
8828,"Gambia, The",2020-02-29 00:00:00
8829,Kyrgyzstan,2020-02-29 00:00:00
8830,Mauritius,2020-02-29 00:00:00


In [55]:
missing_dates_df = missing_dates_df.rename(columns={'index': 'Country/Region', 'value': 'Date'})

In [56]:
missing_dates_df.loc[missing_dates_df['Country/Region']=='China']

Unnamed: 0,Country/Region,Date
2,China,
194,China,
386,China,
578,China,
770,China,
962,China,
1154,China,
1346,China,
1538,China,
1730,China,


In [57]:
grouped_df = pd.concat([grouped_df, missing_dates_df], ignore_index=True)

In [58]:
grouped_df = grouped_df.sort_values(by=['Country/Region', 'Date'])

In [59]:
mask = grouped_df['Date']=='2020-02-01 00:00:00'
grouped_df = grouped_df.mask(mask, grouped_df.fillna(0))

In [60]:
grouped_df.head()

Unnamed: 0,Date,Country/Region,Confirmed,Recovered,Active,Deaths,Log Confirmed
7153,2020-02-01 00:00:00,Azerbaijan,0.0,0.0,0.0,0.0,0.0
3121,2020-02-02 00:00:00,Azerbaijan,,,,,
9649,2020-02-03 00:00:00,Azerbaijan,,,,,
1969,2020-02-04 00:00:00,Azerbaijan,,,,,
3505,2020-02-05 00:00:00,Azerbaijan,,,,,


In [61]:
grouped_df = grouped_df.ffill()

In [62]:
grouped_df = grouped_df.drop_duplicates()

In [63]:
grouped_df.loc[grouped_df['Country/Region']=='China'].describe()

Unnamed: 0,Confirmed,Recovered,Active,Deaths,Log Confirmed
count,47.0,47.0,47.0,47.0,47.0
mean,62797.425532,27945.0,32756.829787,2095.595745,4.735056
std,22727.131309,23054.08212,16748.992585,1029.404929,0.313519
min,1427.0,273.0,11.0,9.0,3.154728
25%,44484.0,4855.0,18740.0,1113.5,4.648208
50%,74545.0,23151.0,32264.0,2443.0,4.872424
75%,78365.0,52319.0,48410.5,3014.5,4.894128
max,80921.0,63089.0,58041.0,3188.0,4.908067


In [64]:
grouped_df.head()

Unnamed: 0,Date,Country/Region,Confirmed,Recovered,Active,Deaths,Log Confirmed
7153,2020-02-01 00:00:00,Azerbaijan,0.0,0.0,0.0,0.0,0.0
3121,2020-02-02 00:00:00,Azerbaijan,0.0,0.0,0.0,0.0,0.0
9649,2020-02-03 00:00:00,Azerbaijan,0.0,0.0,0.0,0.0,0.0
1969,2020-02-04 00:00:00,Azerbaijan,0.0,0.0,0.0,0.0,0.0
3505,2020-02-05 00:00:00,Azerbaijan,0.0,0.0,0.0,0.0,0.0


In [65]:
grouped_df.sort_values(['Country/Region', 'Date'], inplace=True)

In [66]:
grouped_df.loc[grouped_df['Date']=='2020-03-12 00:00:00'].describe()

Unnamed: 0,Confirmed,Recovered,Active,Deaths,Log Confirmed
count,192.0,192.0,192.0,192.0,192.0
mean,779.661458,357.510417,395.588542,26.5625,1.077129
std,5813.805359,4327.888663,1699.464188,236.277537,1.069605
min,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0
50%,6.0,0.0,5.0,0.0,0.845098
75%,49.75,2.0,43.25,0.0,1.705296
max,77817.0,59870.0,14812.0,3135.0,4.89108


In [67]:
grouped_df.loc[grouped_df['Date']=='2020-03-12 00:00:00'].describe()['Active']['max']

14812.0

In [68]:
grouped_df['Date'] = pd.to_datetime(grouped_df['Date'])

In [69]:
grouped_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9024 entries, 7153 to 2812
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   Date            9024 non-null   datetime64[ns]
 1   Country/Region  9024 non-null   object        
 2   Confirmed       9024 non-null   float64       
 3   Recovered       9024 non-null   float64       
 4   Active          9024 non-null   float64       
 5   Deaths          9024 non-null   float64       
 6   Log Confirmed   9024 non-null   float64       
dtypes: datetime64[ns](1), float64(5), object(1)
memory usage: 564.0+ KB


In [70]:
def assign_bins(df):
    first_quartile = _df.loc[df['Date']==date].describe()['Active']['25%']
    second_quartile = df.loc[df['Date']==date].describe()['Active']['50%']
    third_quartile = df.loc[df['Date']==date].describe()['Active']['75%']
    fourth_quartile = df.loc[grouped_df['Date']==date].describe()['Active']['max']
for date in total_dates:
    first_quartile = grouped_df.loc[grouped_df['Date']==date].describe()['Active']['25%']
    second_quartile = grouped_df.loc[grouped_df['Date']==date].describe()['Active']['50%']
    third_quartile = grouped_df.loc[grouped_df['Date']==date].describe()['Active']['75%']
    fourth_quartile = grouped_df.loc[grouped_df['Date']==date].describe()['Active']['max']
    if 0<cases<first_quartile:
        bins=1
    elif first_quartile<cases<second_quartile:
        bins=2
    elif second_quartile<cases<third_quartile:
        bins=3
    elif third_quartile<cases<=fourth_quartile:
        bins=4

NameError: name 'cases' is not defined

In [76]:
for date in total_dates:
    max_confirmed = grouped_df.loc[grouped_df['Date']==date]['Confirmed'].max()


In [72]:
def bins_percentage(max_confirmed):

SyntaxError: unexpected EOF while parsing (<ipython-input-72-649949ce8cce>, line 1)

In [73]:
@interact(Country = grouped_df['Country/Region'].sort_values().unique(), Confirmed=True, Active=True, Recovered=True, Deaths=True)
def viz(Country, Confirmed, Active, Recovered, Deaths):
    fig = px.line(title='Cases Over Time')
    if Confirmed:
        country_df = pd.DataFrame(grouped_df.loc[grouped_df['Country/Region']==Country].groupby('Date')['Confirmed'].agg('sum'))
        fig.add_scatter(x = country_df.index, y = country_df['Confirmed'], mode='lines', name='Confirmed Cases')
    if Active:
        country_df = pd.DataFrame(grouped_df.loc[grouped_df['Country/Region']==Country].groupby('Date')['Active'].agg('sum'))
        fig.add_scatter(x = country_df.index, y = country_df['Active'], mode='lines', name='Active Cases')
    if Recovered:
        country_df = pd.DataFrame(grouped_df.loc[grouped_df['Country/Region']==Country].groupby('Date')['Recovered'].agg('sum'))
        fig.add_scatter(x = country_df.index, y = country_df['Recovered'], mode='lines', name='Recovered Cases')
    if Deaths:
        country_df = pd.DataFrame(grouped_df.loc[grouped_df['Country/Region']==Country].groupby('Date')['Deaths'].agg('sum'))
        fig.add_scatter(x = country_df.index, y = country_df['Deaths'], mode='lines', name='Deceased Cases')
    fig.update_xaxes(title='Date', range=[dt.date(2020, 2, 1), dt.datetime.now()])
    fig.show()

interactive(children=(Dropdown(description='Country', options=(' Azerbaijan', 'Afghanistan', 'Albania', 'Alger…

In [75]:
s=pd.__version__.replace('.','_')
file_name = 'df_pdv_'+s+'.pickle'
grouped_df.to_pickle(file_name)