In [1]:
import pandas as pd

In [None]:
!pip install pandas

# Data Staging: 
## Extract, Transform and Load (ETL)

## Conceptual model:

![Conceptual%20model.png](attachment:Conceptual%20model.png)

# Extract

In [2]:
df= pd.read_csv('Cookie Company Financials.csv')

In [3]:
df

Unnamed: 0,Country,Product,Units Sold,Revenue per cookie,Cost per cookie,Date,Month Number,Month Name,Year
0,Canada,Chocolate Chip,292.0,$5.00,$2.00,01-02-2019,2,February,2019
1,Mexico,Chocolate Chip,974.0,$5.00,$2.00,01-02-2019,2,February,2019
2,Canada,Chocolate Chip,2518.0,$5.00,$2.00,01-06-2019,6,June,2019
3,Germany,Chocolate Chip,1006.0,$5.00,$2.00,01-06-2019,6,June,2019
4,Germany,Chocolate Chip,367.0,$5.00,$2.00,01-07-2019,7,July,2019
...,...,...,...,...,...,...,...,...,...
695,France,White Chocolate Macadamia Nut,2826.0,$6.00,$2.75,01-05-2019,5,May,2019
696,France,White Chocolate Macadamia Nut,663.0,$6.00,$2.75,01-09-2019,9,September,2019
697,United States,White Chocolate Macadamia Nut,2574.0,$6.00,$2.75,01-11-2018,11,November,2018
698,United States,White Chocolate Macadamia Nut,2438.0,$6.00,$2.75,01-12-2018,12,December,2018


## Handle Outliers

In [4]:
df[:25]

Unnamed: 0,Country,Product,Units Sold,Revenue per cookie,Cost per cookie,Date,Month Number,Month Name,Year
0,Canada,Chocolate Chip,292.0,$5.00,$2.00,01-02-2019,2,February,2019
1,Mexico,Chocolate Chip,974.0,$5.00,$2.00,01-02-2019,2,February,2019
2,Canada,Chocolate Chip,2518.0,$5.00,$2.00,01-06-2019,6,June,2019
3,Germany,Chocolate Chip,1006.0,$5.00,$2.00,01-06-2019,6,June,2019
4,Germany,Chocolate Chip,367.0,$5.00,$2.00,01-07-2019,7,July,2019
5,Mexico,Chocolate Chip,883.0,$5.00,$2.00,01-08-2019,8,August,2019
6,France,Chocolate Chip,549.0,$5.00,$2.00,01-09-2018,9,September,2018
7,Mexico,Chocolate Chip,788.0,$5.00,$2.00,01-09-2018,9,September,2018
8,Mexico,Chocolate Chip,2472.0,$5.00,$2.00,01-09-2019,9,September,2019
9,United States,Chocolate Chip,1143.0,$5.00,$2.00,01-10-2019,10,October,2019


In [5]:
print(df.dtypes)

Country                object
Product                object
Units Sold            float64
Revenue per cookie     object
Cost per cookie        object
Date                   object
Month Number            int64
Month Name             object
Year                    int64
dtype: object


# Transform

In [6]:
df['Units Sold'] = df['Units Sold'].astype(int)

In [7]:
df[:25]

Unnamed: 0,Country,Product,Units Sold,Revenue per cookie,Cost per cookie,Date,Month Number,Month Name,Year
0,Canada,Chocolate Chip,292,$5.00,$2.00,01-02-2019,2,February,2019
1,Mexico,Chocolate Chip,974,$5.00,$2.00,01-02-2019,2,February,2019
2,Canada,Chocolate Chip,2518,$5.00,$2.00,01-06-2019,6,June,2019
3,Germany,Chocolate Chip,1006,$5.00,$2.00,01-06-2019,6,June,2019
4,Germany,Chocolate Chip,367,$5.00,$2.00,01-07-2019,7,July,2019
5,Mexico,Chocolate Chip,883,$5.00,$2.00,01-08-2019,8,August,2019
6,France,Chocolate Chip,549,$5.00,$2.00,01-09-2018,9,September,2018
7,Mexico,Chocolate Chip,788,$5.00,$2.00,01-09-2018,9,September,2018
8,Mexico,Chocolate Chip,2472,$5.00,$2.00,01-09-2019,9,September,2019
9,United States,Chocolate Chip,1143,$5.00,$2.00,01-10-2019,10,October,2019


### Removing '$' sign and change data types

In [8]:
df['Revenue per cookie'] = df['Revenue per cookie'].str.replace('$','').str.replace(',','').astype('float')

In [9]:
#df['Revenue per cookie'] = df['Revenue per cookie']*conversion_rate

In [10]:
df['Cost per cookie'] = df['Cost per cookie'].str.replace('$','').str.replace(',','').astype('float')

In [11]:
df.head()

Unnamed: 0,Country,Product,Units Sold,Revenue per cookie,Cost per cookie,Date,Month Number,Month Name,Year
0,Canada,Chocolate Chip,292,5.0,2.0,01-02-2019,2,February,2019
1,Mexico,Chocolate Chip,974,5.0,2.0,01-02-2019,2,February,2019
2,Canada,Chocolate Chip,2518,5.0,2.0,01-06-2019,6,June,2019
3,Germany,Chocolate Chip,1006,5.0,2.0,01-06-2019,6,June,2019
4,Germany,Chocolate Chip,367,5.0,2.0,01-07-2019,7,July,2019


### Drop unqrequired columns

In [12]:
df = df.drop(columns = ['Date','Month Name'])

In [13]:
df

Unnamed: 0,Country,Product,Units Sold,Revenue per cookie,Cost per cookie,Month Number,Year
0,Canada,Chocolate Chip,292,5.0,2.00,2,2019
1,Mexico,Chocolate Chip,974,5.0,2.00,2,2019
2,Canada,Chocolate Chip,2518,5.0,2.00,6,2019
3,Germany,Chocolate Chip,1006,5.0,2.00,6,2019
4,Germany,Chocolate Chip,367,5.0,2.00,7,2019
...,...,...,...,...,...,...,...
695,France,White Chocolate Macadamia Nut,2826,6.0,2.75,5,2019
696,France,White Chocolate Macadamia Nut,663,6.0,2.75,9,2019
697,United States,White Chocolate Macadamia Nut,2574,6.0,2.75,11,2018
698,United States,White Chocolate Macadamia Nut,2438,6.0,2.75,12,2018


## Generate measures/facts

In [14]:
df['Revenue'] = df['Units Sold']*df['Revenue per cookie']
df['Cost'] = df['Units Sold']*df['Cost per cookie']
df['Profit'] = df['Revenue'] - df['Cost']

In [15]:
df

Unnamed: 0,Country,Product,Units Sold,Revenue per cookie,Cost per cookie,Month Number,Year,Revenue,Cost,Profit
0,Canada,Chocolate Chip,292,5.0,2.00,2,2019,1460.0,584.00,876.00
1,Mexico,Chocolate Chip,974,5.0,2.00,2,2019,4870.0,1948.00,2922.00
2,Canada,Chocolate Chip,2518,5.0,2.00,6,2019,12590.0,5036.00,7554.00
3,Germany,Chocolate Chip,1006,5.0,2.00,6,2019,5030.0,2012.00,3018.00
4,Germany,Chocolate Chip,367,5.0,2.00,7,2019,1835.0,734.00,1101.00
...,...,...,...,...,...,...,...,...,...,...
695,France,White Chocolate Macadamia Nut,2826,6.0,2.75,5,2019,16956.0,7771.50,9184.50
696,France,White Chocolate Macadamia Nut,663,6.0,2.75,9,2019,3978.0,1823.25,2154.75
697,United States,White Chocolate Macadamia Nut,2574,6.0,2.75,11,2018,15444.0,7078.50,8365.50
698,United States,White Chocolate Macadamia Nut,2438,6.0,2.75,12,2018,14628.0,6704.50,7923.50


In [16]:
df = df.rename({'Month Number':'Month'}, axis='columns')

In [17]:
df.head()

Unnamed: 0,Country,Product,Units Sold,Revenue per cookie,Cost per cookie,Month,Year,Revenue,Cost,Profit
0,Canada,Chocolate Chip,292,5.0,2.0,2,2019,1460.0,584.0,876.0
1,Mexico,Chocolate Chip,974,5.0,2.0,2,2019,4870.0,1948.0,2922.0
2,Canada,Chocolate Chip,2518,5.0,2.0,6,2019,12590.0,5036.0,7554.0
3,Germany,Chocolate Chip,1006,5.0,2.0,6,2019,5030.0,2012.0,3018.0
4,Germany,Chocolate Chip,367,5.0,2.0,7,2019,1835.0,734.0,1101.0


# Extract World Population dataset from the web

In [18]:
df['Country'].unique()

array(['Canada', 'Mexico', 'Germany', 'France', 'United States'],
      dtype=object)

In [19]:
df_country = pd.read_csv('Country Population.csv')

In [20]:
df_country.head()

Unnamed: 0,Country / Dependency,Population Numbers
0,United States,334233854
1,Mexico,128665641
2,Germany,84270625
3,France,68042591
4,Canada,39110420


## Transform

In [21]:
df_country.dtypes

Country / Dependency    object
Population Numbers       int64
dtype: object

In [22]:
df_country = df_country.rename({'Country / Dependency': 'Country', 'Population Numbers': 'Population'}, axis='columns')
df_country.head()

Unnamed: 0,Country,Population
0,United States,334233854
1,Mexico,128665641
2,Germany,84270625
3,France,68042591
4,Canada,39110420


## Dataset Integration (Enrich sales data)

In [32]:
result = pd.merge(df, df_country, how="left", on=["Country"])
result

Unnamed: 0,Country,Product,Units Sold,Revenue per cookie,Cost per cookie,Date,Month,Month Name,Year,Revenue,Cost,Profit,Population
0,Canada,Chocolate Chip,292,5.0,2.00,01-02-2019,2,February,2019,1460.0,584.00,876.00,39110420
1,Mexico,Chocolate Chip,974,5.0,2.00,01-02-2019,2,February,2019,4870.0,1948.00,2922.00,128665641
2,Canada,Chocolate Chip,2518,5.0,2.00,01-06-2019,6,June,2019,12590.0,5036.00,7554.00,39110420
3,Germany,Chocolate Chip,1006,5.0,2.00,01-06-2019,6,June,2019,5030.0,2012.00,3018.00,84270625
4,Germany,Chocolate Chip,367,5.0,2.00,01-07-2019,7,July,2019,1835.0,734.00,1101.00,84270625
...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,France,White Chocolate Macadamia Nut,2826,6.0,2.75,01-05-2019,5,May,2019,16956.0,7771.50,9184.50,68042591
696,France,White Chocolate Macadamia Nut,663,6.0,2.75,01-09-2019,9,September,2019,3978.0,1823.25,2154.75,68042591
697,United States,White Chocolate Macadamia Nut,2574,6.0,2.75,01-11-2018,11,November,2018,15444.0,7078.50,8365.50,334233854
698,United States,White Chocolate Macadamia Nut,2438,6.0,2.75,01-12-2018,12,December,2018,14628.0,6704.50,7923.50,334233854


### Check for null values after merge

In [33]:
result.isnull().sum()

Country               0
Product               0
Units Sold            0
Revenue per cookie    0
Cost per cookie       0
Date                  0
Month                 0
Month Name            0
Year                  0
Revenue               0
Cost                  0
Profit                0
Population            0
dtype: int64

In [34]:
result.dtypes

Country                object
Product                object
Units Sold              int32
Revenue per cookie    float64
Cost per cookie       float64
Date                   object
Month                   int64
Month Name             object
Year                    int64
Revenue               float64
Cost                  float64
Profit                float64
Population              int64
dtype: object

### How to replace values in a column with the desired value

In [35]:
result['Year'].unique()

array([2019, 2018], dtype=int64)

In [36]:
result['Year'] = result['Year'].replace(2019,2022).replace(2018,2021)
result.head()

Unnamed: 0,Country,Product,Units Sold,Revenue per cookie,Cost per cookie,Date,Month,Month Name,Year,Revenue,Cost,Profit,Population
0,Canada,Chocolate Chip,292,5.0,2.0,01-02-2019,2,February,2022,1460.0,584.0,876.0,39110420
1,Mexico,Chocolate Chip,974,5.0,2.0,01-02-2019,2,February,2022,4870.0,1948.0,2922.0,128665641
2,Canada,Chocolate Chip,2518,5.0,2.0,01-06-2019,6,June,2022,12590.0,5036.0,7554.0,39110420
3,Germany,Chocolate Chip,1006,5.0,2.0,01-06-2019,6,June,2022,5030.0,2012.0,3018.0,84270625
4,Germany,Chocolate Chip,367,5.0,2.0,01-07-2019,7,July,2022,1835.0,734.0,1101.0,84270625


In [37]:
result['Year'].unique()

array([2022, 2021], dtype=int64)

# Extract Covid Data

In [38]:
df_covid = pd.read_csv('covid-data.csv')

In [39]:
df_covid

Unnamed: 0,iso_code,continent,location,date,total_cases,new_cases,new_cases_smoothed,total_deaths,new_deaths,new_deaths_smoothed,...,male_smokers,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,human_development_index,population,excess_mortality_cumulative_absolute,excess_mortality_cumulative,excess_mortality,excess_mortality_cumulative_per_million
0,AFG,Asia,Afghanistan,2020-02-24,5.0,5.0,,,,,...,,37.746,0.5,64.83,0.511,41128772.0,,,,
1,AFG,Asia,Afghanistan,2020-02-25,5.0,0.0,,,,,...,,37.746,0.5,64.83,0.511,41128772.0,,,,
2,AFG,Asia,Afghanistan,2020-02-26,5.0,0.0,,,,,...,,37.746,0.5,64.83,0.511,41128772.0,,,,
3,AFG,Asia,Afghanistan,2020-02-27,5.0,0.0,,,,,...,,37.746,0.5,64.83,0.511,41128772.0,,,,
4,AFG,Asia,Afghanistan,2020-02-28,5.0,0.0,,,,,...,,37.746,0.5,64.83,0.511,41128772.0,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
255591,ZWE,Africa,Zimbabwe,2023-02-02,262324.0,,,5658.0,0.0,0.857,...,30.7,36.791,1.7,61.49,0.571,16320539.0,,,,
255592,ZWE,Africa,Zimbabwe,2023-02-03,262324.0,,,5658.0,0.0,0.857,...,30.7,36.791,1.7,61.49,0.571,16320539.0,,,,
255593,ZWE,Africa,Zimbabwe,2023-02-04,262324.0,,,5658.0,0.0,0.857,...,30.7,36.791,1.7,61.49,0.571,16320539.0,,,,
255594,ZWE,Africa,Zimbabwe,2023-02-05,262324.0,,,5658.0,0.0,0.857,...,30.7,36.791,1.7,61.49,0.571,16320539.0,,,,


In [40]:
df_covid.columns

Index(['iso_code', 'continent', 'location', 'date', 'total_cases', 'new_cases',
       'new_cases_smoothed', 'total_deaths', 'new_deaths',
       'new_deaths_smoothed', 'total_cases_per_million',
       'new_cases_per_million', 'new_cases_smoothed_per_million',
       'total_deaths_per_million', 'new_deaths_per_million',
       'new_deaths_smoothed_per_million', 'reproduction_rate', 'icu_patients',
       'icu_patients_per_million', 'hosp_patients',
       'hosp_patients_per_million', 'weekly_icu_admissions',
       'weekly_icu_admissions_per_million', 'weekly_hosp_admissions',
       'weekly_hosp_admissions_per_million', 'total_tests', 'new_tests',
       'total_tests_per_thousand', 'new_tests_per_thousand',
       'new_tests_smoothed', 'new_tests_smoothed_per_thousand',
       'positive_rate', 'tests_per_case', 'tests_units', 'total_vaccinations',
       'people_vaccinated', 'people_fully_vaccinated', 'total_boosters',
       'new_vaccinations', 'new_vaccinations_smoothed',
       't

# Transform

## Select Desired Columns

In [41]:
df_covid[['location','total_cases','date']]

Unnamed: 0,location,total_cases,date
0,Afghanistan,5.0,2020-02-24
1,Afghanistan,5.0,2020-02-25
2,Afghanistan,5.0,2020-02-26
3,Afghanistan,5.0,2020-02-27
4,Afghanistan,5.0,2020-02-28
...,...,...,...
255591,Zimbabwe,262324.0,2023-02-02
255592,Zimbabwe,262324.0,2023-02-03
255593,Zimbabwe,262324.0,2023-02-04
255594,Zimbabwe,262324.0,2023-02-05


## Check for null values

In [43]:
df_covid['total_cases'].isnull().sum()

14420

### Dropping null values

In [44]:
df_cases = df_covid[['location','total_cases','date']]
df_cases = df_cases.dropna()

In [45]:
df_cases

Unnamed: 0,location,total_cases,date
0,Afghanistan,5.0,2020-02-24
1,Afghanistan,5.0,2020-02-25
2,Afghanistan,5.0,2020-02-26
3,Afghanistan,5.0,2020-02-27
4,Afghanistan,5.0,2020-02-28
...,...,...,...
255591,Zimbabwe,262324.0,2023-02-02
255592,Zimbabwe,262324.0,2023-02-03
255593,Zimbabwe,262324.0,2023-02-04
255594,Zimbabwe,262324.0,2023-02-05


In [46]:
df_cases['date'] = pd.to_datetime(df_cases['date'])

In [47]:
df_cases

Unnamed: 0,location,total_cases,date
0,Afghanistan,5.0,2020-02-24
1,Afghanistan,5.0,2020-02-25
2,Afghanistan,5.0,2020-02-26
3,Afghanistan,5.0,2020-02-27
4,Afghanistan,5.0,2020-02-28
...,...,...,...
255591,Zimbabwe,262324.0,2023-02-02
255592,Zimbabwe,262324.0,2023-02-03
255593,Zimbabwe,262324.0,2023-02-04
255594,Zimbabwe,262324.0,2023-02-05


In [48]:
df_cases.reset_index(inplace=True)
df_cases = df_cases.drop(columns=['index'])
df_cases

Unnamed: 0,location,total_cases,date
0,Afghanistan,5.0,2020-02-24
1,Afghanistan,5.0,2020-02-25
2,Afghanistan,5.0,2020-02-26
3,Afghanistan,5.0,2020-02-27
4,Afghanistan,5.0,2020-02-28
...,...,...,...
241171,Zimbabwe,262324.0,2023-02-02
241172,Zimbabwe,262324.0,2023-02-03
241173,Zimbabwe,262324.0,2023-02-04
241174,Zimbabwe,262324.0,2023-02-05


In [49]:
df_cases['Month'] = df_cases['date'].dt.month
df_cases['Year'] = df_cases['date'].dt.year

In [50]:
df_cases

Unnamed: 0,location,total_cases,date,Month,Year
0,Afghanistan,5.0,2020-02-24,2,2020
1,Afghanistan,5.0,2020-02-25,2,2020
2,Afghanistan,5.0,2020-02-26,2,2020
3,Afghanistan,5.0,2020-02-27,2,2020
4,Afghanistan,5.0,2020-02-28,2,2020
...,...,...,...,...,...
241171,Zimbabwe,262324.0,2023-02-02,2,2023
241172,Zimbabwe,262324.0,2023-02-03,2,2023
241173,Zimbabwe,262324.0,2023-02-04,2,2023
241174,Zimbabwe,262324.0,2023-02-05,2,2023


## Bucketizing 

In [51]:
mask = (df_cases['date'].dt.day == 1)
print(df_cases.loc[mask])

           location  total_cases       date  Month  Year
6       Afghanistan          5.0 2020-03-01      3  2020
37      Afghanistan        192.0 2020-04-01      4  2020
67      Afghanistan       2171.0 2020-05-01      5  2020
98      Afghanistan      15836.0 2020-06-01      6  2020
128     Afghanistan      31848.0 2020-07-01      7  2020
...             ...          ...        ...    ...   ...
241047     Zimbabwe     257465.0 2022-10-01     10  2022
241078     Zimbabwe     257893.0 2022-11-01     11  2022
241108     Zimbabwe     259164.0 2022-12-01     12  2022
241139     Zimbabwe     259981.0 2023-01-01      1  2023
241170     Zimbabwe     262324.0 2023-02-01      2  2023

[7996 rows x 5 columns]


In [52]:
df_cases = df_cases.loc[mask]
df_cases

Unnamed: 0,location,total_cases,date,Month,Year
6,Afghanistan,5.0,2020-03-01,3,2020
37,Afghanistan,192.0,2020-04-01,4,2020
67,Afghanistan,2171.0,2020-05-01,5,2020
98,Afghanistan,15836.0,2020-06-01,6,2020
128,Afghanistan,31848.0,2020-07-01,7,2020
...,...,...,...,...,...
241047,Zimbabwe,257465.0,2022-10-01,10,2022
241078,Zimbabwe,257893.0,2022-11-01,11,2022
241108,Zimbabwe,259164.0,2022-12-01,12,2022
241139,Zimbabwe,259981.0,2023-01-01,1,2023


In [53]:
df_cases = df_cases.drop(columns=['date'])
df_cases = df_cases.rename({'location': 'Country'},axis=1)
df_cases

Unnamed: 0,Country,total_cases,Month,Year
6,Afghanistan,5.0,3,2020
37,Afghanistan,192.0,4,2020
67,Afghanistan,2171.0,5,2020
98,Afghanistan,15836.0,6,2020
128,Afghanistan,31848.0,7,2020
...,...,...,...,...
241047,Zimbabwe,257465.0,10,2022
241078,Zimbabwe,257893.0,11,2022
241108,Zimbabwe,259164.0,12,2022
241139,Zimbabwe,259981.0,1,2023


# Dataset Integration

In [54]:
new_result = pd.merge(result, df_cases, how="left", on=["Country","Month","Year"])
new_result

Unnamed: 0,Country,Product,Units Sold,Revenue per cookie,Cost per cookie,Date,Month,Month Name,Year,Revenue,Cost,Profit,Population,total_cases
0,Canada,Chocolate Chip,292,5.0,2.00,01-02-2019,2,February,2022,1460.0,584.00,876.00,39110420,3085591.0
1,Mexico,Chocolate Chip,974,5.0,2.00,01-02-2019,2,February,2022,4870.0,1948.00,2922.00,128665641,4985689.0
2,Canada,Chocolate Chip,2518,5.0,2.00,01-06-2019,6,June,2022,12590.0,5036.00,7554.00,39110420,3888672.0
3,Germany,Chocolate Chip,1006,5.0,2.00,01-06-2019,6,June,2022,5030.0,2012.00,3018.00,84270625,26360953.0
4,Germany,Chocolate Chip,367,5.0,2.00,01-07-2019,7,July,2022,1835.0,734.00,1101.00,84270625,28293960.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,France,White Chocolate Macadamia Nut,2826,6.0,2.75,01-05-2019,5,May,2022,16956.0,7771.50,9184.50,68042591,28736093.0
696,France,White Chocolate Macadamia Nut,663,6.0,2.75,01-09-2019,9,September,2022,3978.0,1823.25,2154.75,68042591,34610026.0
697,United States,White Chocolate Macadamia Nut,2574,6.0,2.75,01-11-2018,11,November,2021,15444.0,7078.50,8365.50,334233854,46163124.0
698,United States,White Chocolate Macadamia Nut,2438,6.0,2.75,01-12-2018,12,December,2021,14628.0,6704.50,7923.50,334233854,48743244.0


## Merge Successful?

In [56]:
new_result.isnull().sum()

Country               0
Product               0
Units Sold            0
Revenue per cookie    0
Cost per cookie       0
Date                  0
Month                 0
Month Name            0
Year                  0
Revenue               0
Cost                  0
Profit                0
Population            0
total_cases           0
dtype: int64

# Generate Surrogate Keys

In [57]:
new_result['Surrogate Keys'] = range(1,len(new_result)+1)

In [58]:
new_result

Unnamed: 0,Country,Product,Units Sold,Revenue per cookie,Cost per cookie,Date,Month,Month Name,Year,Revenue,Cost,Profit,Population,total_cases,Surrogate Keys
0,Canada,Chocolate Chip,292,5.0,2.00,01-02-2019,2,February,2022,1460.0,584.00,876.00,39110420,3085591.0,1
1,Mexico,Chocolate Chip,974,5.0,2.00,01-02-2019,2,February,2022,4870.0,1948.00,2922.00,128665641,4985689.0,2
2,Canada,Chocolate Chip,2518,5.0,2.00,01-06-2019,6,June,2022,12590.0,5036.00,7554.00,39110420,3888672.0,3
3,Germany,Chocolate Chip,1006,5.0,2.00,01-06-2019,6,June,2022,5030.0,2012.00,3018.00,84270625,26360953.0,4
4,Germany,Chocolate Chip,367,5.0,2.00,01-07-2019,7,July,2022,1835.0,734.00,1101.00,84270625,28293960.0,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,France,White Chocolate Macadamia Nut,2826,6.0,2.75,01-05-2019,5,May,2022,16956.0,7771.50,9184.50,68042591,28736093.0,696
696,France,White Chocolate Macadamia Nut,663,6.0,2.75,01-09-2019,9,September,2022,3978.0,1823.25,2154.75,68042591,34610026.0,697
697,United States,White Chocolate Macadamia Nut,2574,6.0,2.75,01-11-2018,11,November,2021,15444.0,7078.50,8365.50,334233854,46163124.0,698
698,United States,White Chocolate Macadamia Nut,2438,6.0,2.75,01-12-2018,12,December,2021,14628.0,6704.50,7923.50,334233854,48743244.0,699


In [59]:
df = new_result.reindex(columns=['Surrogate Keys'] + list([c for c in new_result.columns if c!= 'Surrogate Keys']))

In [60]:
df

Unnamed: 0,Surrogate Keys,Country,Product,Units Sold,Revenue per cookie,Cost per cookie,Date,Month,Month Name,Year,Revenue,Cost,Profit,Population,total_cases
0,1,Canada,Chocolate Chip,292,5.0,2.00,01-02-2019,2,February,2022,1460.0,584.00,876.00,39110420,3085591.0
1,2,Mexico,Chocolate Chip,974,5.0,2.00,01-02-2019,2,February,2022,4870.0,1948.00,2922.00,128665641,4985689.0
2,3,Canada,Chocolate Chip,2518,5.0,2.00,01-06-2019,6,June,2022,12590.0,5036.00,7554.00,39110420,3888672.0
3,4,Germany,Chocolate Chip,1006,5.0,2.00,01-06-2019,6,June,2022,5030.0,2012.00,3018.00,84270625,26360953.0
4,5,Germany,Chocolate Chip,367,5.0,2.00,01-07-2019,7,July,2022,1835.0,734.00,1101.00,84270625,28293960.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,696,France,White Chocolate Macadamia Nut,2826,6.0,2.75,01-05-2019,5,May,2022,16956.0,7771.50,9184.50,68042591,28736093.0
696,697,France,White Chocolate Macadamia Nut,663,6.0,2.75,01-09-2019,9,September,2022,3978.0,1823.25,2154.75,68042591,34610026.0
697,698,United States,White Chocolate Macadamia Nut,2574,6.0,2.75,01-11-2018,11,November,2021,15444.0,7078.50,8365.50,334233854,46163124.0
698,699,United States,White Chocolate Macadamia Nut,2438,6.0,2.75,01-12-2018,12,December,2021,14628.0,6704.50,7923.50,334233854,48743244.0


# Load

In [61]:
df.to_csv('Stagged_data.csv')

### Different ways to save

![image.png](attachment:image.png)

![image.png](attachment:image.png)