<a href="https://colab.research.google.com/github/alona808/data_viz_plotly/blob/master/data_viz_plotly.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Datacamp course: Introduction to Data Visualization with Plotly in Python
https://app.datacamp.com/learn/courses/introduction-to-data-visualization-with-plotly-in-python

## Import modules

In [1]:
import os
import pandas as pd
import random
import calendar
calendar.setfirstweekday(calendar.SUNDAY)

# visualization
import plotly.express as px
from plotly.subplots import make_subplots
# from plotly.offline import iplot
from plotly.graph_objects import Figure

# connect to google drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Additional settings

In [None]:
# !pip install --upgrade jupyterlab plotly

In [2]:
# To display the full text of a pandas DataFrame
pd.set_option('display.max_colwidth', 1)

# To display all columns of DataFrame
pd.set_option('display.max_columns', None)

# To print all the outputs in the cell
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# To show warnings only once:
import warnings; warnings.filterwarnings(action='once')

File path to data sets

In [3]:
path_to_raw_data = '/content/drive/My Drive/colab_notebooks/data_viz_plotly/data/'
file_penguins = 'penguins.csv'
file_monthly_sales = 'monthly_sales.csv'
file_AAPL = 'AAPL.csv'
file_rain = 'rain.csv'
file_revenue_data = 'revenue_data.csv'
file_revenue_data2 = 'revenue_data2.csv'
file_world_bank_population = 'world_bank_population.csv'

##Plotly basic bar chart

In [None]:
# Get calendar days
days = ['Sunday', 'Monday', 'Tuesday', 'Wednsday', 'Thursday', 'Friday', 'Saturday']

full_weekdays = list(calendar.day_name)
abbr_weekdays = list(calendar.day_abbr)

# day_names = list(calendar.day_name)
# weekdays = [calendar.day_name[i] for i in range(7)]

days
full_weekdays
abbr_weekdays
# day_names
# weekdays



In [None]:
# Generate a random temperature between 20 and 37 degrees Celsius
min_temp = 18
max_temp = 39
num_tempr = 7

temperature = [
    # random.uniform(min_temp, max_temp) for _ in range(num_tempr)
    random.randint(min_temp, max_temp) for _ in range(num_tempr)
]

temperature

In [None]:
# plot graph
fig = px.bar(
    x=abbr_weekdays
    ,y=temperature
    ,title='Temperatures of week'
)

fig.show()

The plotly figure elements

In [None]:
print(fig)

## <mark>Univariate plots:</mark>
- Bar chart
- Box p;ot
- Density plots

###Load penguins data

In [None]:
penguins = pd.read_csv(os.path.join(path_to_raw_data, file_penguins))
                            #  ,encoding='ISO-8859-1') # or use that: encoding='cp1252'

df_penguins = penguins.copy()

In [None]:
df_penguins.info()
df_penguins.head()

In [None]:
df_penguins.describe()

In [None]:
# df_penguins.columns = df_penguins.columns.str.lower()

In [None]:
# df_penguins.columns = [columns.replace(' ', '_').replace('(', '').replace(')', '') for columns in df_penguins.columns]
# df_penguins.sample(5)

### Histograms
https://plotly.com/python/histograms/

In [None]:
fig = px.histogram(
    df_penguins
    ,x='Body Mass (g)'
    ,nbins=10
    ,title=f'Histogram of Penguins Body Mass (g)'
    # ,orientation='h' ?
    ,histfunc='max'
    ,hover_data=df_penguins.columns
    ,color="Sex"
    ,text_auto=True
)
fig.show()

###Box (and whisker) plots
https://plotly.com/python/box-plots/

In [None]:
df_penguins.columns

In [None]:
fig = px.box(
    df_penguins
    ,y='Flipper Length (mm)'
    ,title=f'Box plot of of Penguins Flipper Length (mm)'
    # ,orientation='h' ?
    ,hover_data=df_penguins.columns
    ,color="Sex"
    ,points="all"
    # ,notched=True, # used notched shape
)
fig.show()

###Specific colors in plotly.express

In [None]:
# plot graph
fig = px.bar(
    x=abbr_weekdays
    ,y=temperature
    ,title='Temperatures of week'
    ,color_discrete_map={
        'Mon': 'rgb(0.0.128)'
        ,'Wed': 'rgb(235,207,52)'
    }
    # ,color=temperature
    ,height=400
    ,width=600
    # ,labels={'temperature': ''} ?
)

fig.show()

In [None]:
# plot graph
fig = px.bar(
    x=abbr_weekdays
    ,y=temperature
    ,title='Temperatures of week'
    ,color=temperature
    ,height=400
    ,width=600
    # ,labels={'temperature': ''} ?
)

fig.show()

In [None]:
my_scale=[
    ('rgb(242,238,10)')
    ,('rgb(242,95,10)')
    ,('rgb(255,0,0)')
]

In [None]:
# plot graph
fig = px.bar(
    x=abbr_weekdays
    ,y=temperature
    ,title='Temperatures of week'
    # ,color_descrete_map={
    #     'Mon': 'rgb(0.0.128)'
    #     ,'Wen': 'rgb(235,207,52)'
    # }
    ,color_continuous_scale=my_scale
    ,color=temperature
    ,height=400
    ,width=600
    # ,labels={'temperature': ''} ?
)

fig.show()

## <mark>Bivariate visualizations</mark>

Some examples:
- Scatterplots
- Line charts
- Correlation plots

In [None]:
df_penguins.columns

In [None]:
df_penguins = df_penguins.astype(
    {'Sample Number': 'int'
    # ,'Date Egg': 'pd.to_datetime('Date Egg')})
    ,'Culmen Length (mm)': 'float'
    ,'Culmen Depth (mm)': 'float'
    ,'Flipper Length (mm)': 'float'
    ,'Body Mass (g)': 'float'

    })


In [None]:
# df_penguins.sample(5)

### Scatter plot

In [None]:
# Create a scatterplot
fig = px.scatter(data_frame=penguins, title="Penguin Culmen Statistics",
    # Set the right columns
    x="Culmen Length (mm)",
    y="Culmen Depth (mm)",
    # We will set some colors
    color="Species",
    # color_discrete_map=color_map
)

# Show your work
fig.show();

##<mark>Customizing Plots</mark>

### Update Axis

In [None]:
_df_penguins = df_penguins.groupby(['Species'])['Flipper Length (mm)'].mean().round(2).reset_index()
_df_penguins

In [None]:
# Create the plot
fig = px.bar(_df_penguins,
             x="Species",
             y="Flipper Length (mm)",
             title="Flipper Length (mm) by Species",
             color="Species")

# Change the axis titles
fig.update_layout(
    xaxis=dict(title=dict(text="Species")),
    yaxis=dict(title=dict(text="Average Flipper Length (mm)"))
)

# Show the plot
fig.show();

### Annotating your savings

Using `add_annotation()` adds a single annotationUsing

`update_layout()` and the annotations argumentAccepts a list of annotation objects

`showarrow = True/False` - used to include an arrowThe arrow can be customized


In [None]:
# Create the first annotation
min_avg = dict(
    x=0, y=200, text="Adelie Penguin has the minimum average length",
    font=dict(color="black"), showarrow=True, arrowhead=4
)

# Create the second annotation
max_avg = dict(
    x=2, y=220, text="Gentoo penguin has the maximum average length",
    font=dict(color="black"), showarrow=True, arrowhead=4,
)

# Add annotations to the figure
fig.update_layout(annotations=[min_avg, max_avg])

# Show the plot
fig.show(); # Add a semicolon here

In [None]:
# Get and format today's date
# today = datetime.today().strftime("%A")

# Create the message_annotation
message_annotation = dict(
  # Set the correct coordinates
  x=0.5, y=0.95, xref="paper", yref="paper",
  # Set the format of the text and box
  text=f"Let's know an average length of penguins :)",
  font=dict(size=20, color="white"),
  bgcolor="rgb(227, 88, 200)", showarrow=False
)

# Update the figure layout and show
fig.update_layout(annotations=[message_annotation])
# fig.update_layout(annotations=[min_avg, max_avg, message_annotation])


fig.show(); # Add a semicolon here

## <mark>Advanced Customization</mark>

### Subplots

In [None]:
revenue_data = pd.read_csv(os.path.join(path_to_raw_data, file_revenue_data))

df_revenue = revenue_data.copy()

In [None]:
df_revenue.head()

In [None]:
# Create a subplot grid
fig = make_subplots(rows=2, cols=1,
                    # Set the subplot titles
                    subplot_titles=['Revenue Histogram', 'Revenue Boxplot'])

# Create plotly express figures
hist = px.histogram(df_revenue, x='Revenue')
box = px.box(df_revenue, x='Revenue')

# Extract traces and add to subplots
fig.add_trace(hist.data[0], row=1, col=1)
fig.add_trace(box.data[0], row=2, col=1)


# Add a plot title
fig.update_layout(dict(title=dict(text='Company Revenue', x=0.5, y=0.9)))

fig.show(); # Add a semicolon here


### Stacked subplots

In [None]:
df_penguins.sample(5)

In [None]:
fig = make_subplots(rows=3, cols=1,
                    shared_xaxes=True,
                    subplot_titles=['Adelie Penguins'  , 'Gentoo Penguins', 'Chinstrap Penguins'])
row_num = 1
for species in ['Adelie', 'Gentoo', 'Chinstrap']:
  # Filter data for this species
  df = df_penguins[df_penguins['Island'] == 'Biscoe']
  scatter = px.scatter(df, x='Culmen Length (mm)'  , y='Culmen Depth (mm)')
  # Add the trace to the subplot
  fig.add_trace(scatter.data[0], row=row_num, col=1)
  row_num +=1
fig.show(); # Add a semicolon here

#   fig = px.scatter(data_frame=penguins, title="Penguin Culmen Statistics",
#     # Set the right columns
#     x="Culmen Length (mm)",
#     y="Culmen Depth (mm)",
#     # We will set some colors
#     color="Species",
#     # color_discrete_map=color_map
# )


### Layering multiple plots

#### Population dataset

In [22]:
world_bank_population = pd.read_csv(os.path.join(path_to_raw_data, file_world_bank_population))

df_population = world_bank_population.copy()

In [23]:
# df_population.describe()
# df_population.info()
df_population.head()

Unnamed: 0,Country Name,Country Code,Series Name,Series Code,1960 [YR1960],1961 [YR1961],1962 [YR1962],1963 [YR1963],1964 [YR1964],1965 [YR1965],1966 [YR1966],1967 [YR1967],1968 [YR1968],1969 [YR1969],1970 [YR1970],1971 [YR1971],1972 [YR1972],1973 [YR1973],1974 [YR1974],1975 [YR1975],1976 [YR1976],1977 [YR1977],1978 [YR1978],1979 [YR1979],1980 [YR1980],1981 [YR1981],1982 [YR1982],1983 [YR1983],1984 [YR1984],1985 [YR1985],1986 [YR1986],1987 [YR1987],1988 [YR1988],1989 [YR1989],1990 [YR1990],1991 [YR1991],1992 [YR1992],1993 [YR1993],1994 [YR1994],1995 [YR1995],1996 [YR1996],1997 [YR1997],1998 [YR1998],1999 [YR1999],2000 [YR2000],2001 [YR2001],2002 [YR2002],2003 [YR2003],2004 [YR2004],2005 [YR2005],2006 [YR2006],2007 [YR2007],2008 [YR2008],2009 [YR2009],2010 [YR2010],2011 [YR2011],2012 [YR2012],2013 [YR2013],2014 [YR2014],2015 [YR2015],2016 [YR2016],2017 [YR2017],2018 [YR2018],2019 [YR2019],2020 [YR2020],2021 [YR2021],2022 [YR2022],2023 [YR2023],2024 [YR2024],2025 [YR2025],2026 [YR2026],2027 [YR2027],2028 [YR2028],2029 [YR2029],2030 [YR2030],2031 [YR2031],2032 [YR2032],2033 [YR2033],2034 [YR2034],2035 [YR2035],2036 [YR2036],2037 [YR2037],2038 [YR2038],2039 [YR2039],2040 [YR2040],2041 [YR2041],2042 [YR2042],2043 [YR2043],2044 [YR2044],2045 [YR2045],2046 [YR2046],2047 [YR2047],2048 [YR2048],2049 [YR2049],2050 [YR2050]
0,Afghanistan,AFG,"Population, total",SP.POP.TOTL,8996973,9169410,9351441,9543205,9744781,9956320,10174836,10399926,10637063,10893776,11173642,11475445,11791215,12108963,12412950,12689160,12943093,13171306,13341198,13411056,13356511,13171673,12882528,12537730,12204292,11938208,11736179,11604534,11618005,11868877,12412308,13299017,14485546,15816603,17075727,18110657,18853437,19357126,19737765,20170844,20779953,21606988,22600770,23680871,24726684,25654277,26433049,27100536,27722276,28394813,29185507,30117413,31161376,32269589,33370794,34413603,35383128,36296400,37172386,38041754,38928000,39835000,40754000,41681000,42609000,43532000,44449000,45364000,46275000,47185000,48094000,49000000,49904000,50804000,51699000,52587000,53469000,54343000,55209000,56065000,56912000,57748000,58574000,59387000,60188000,60974000,61746000,62503000,63245000,63972000,64683000
1,Albania,ALB,"Population, total",SP.POP.TOTL,1608800,1659800,1711319,1762621,1814135,1864791,1914573,1965598,2022272,2081695,2135479,2187853,2243126,2296752,2350124,2404831,2458526,2513546,2566266,2617832,2671997,2726056,2784278,2843960,2904429,2964762,3022635,3083605,3142336,3227943,3286542,3266790,3247039,3227287,3207536,3187784,3168033,3148281,3128530,3108778,3089027,3060173,3051010,3039616,3026939,3011487,2992547,2970017,2947314,2927519,2913021,2905195,2900401,2895092,2889104,2880703,2876101,2873457,2866376,2854191,2850000,2844000,2837000,2828000,2820000,2811000,2802000,2792000,2782000,2771000,2761000,2749000,2737000,2723000,2709000,2694000,2678000,2660000,2642000,2623000,2603000,2583000,2562000,2542000,2520000,2499000,2477000,2456000,2434000,2412000,2390000
2,Algeria,DZA,"Population, total",SP.POP.TOTL,11057863,11336339,11619828,11912803,12221675,12550885,12902627,13275026,13663583,14061722,14464985,14872250,15285990,15709825,16149025,16607707,17085801,17582904,18102266,18647815,19221665,19824301,20452902,21101875,21763575,22431502,23102389,23774284,24443467,25106190,25758869,26400479,27028326,27635515,28213774,28757785,29266405,29742979,30192754,30623406,31042235,31451514,31855109,32264157,32692163,33149724,33641002,34166972,34730608,35333881,35977455,36661444,37383887,38140132,38923687,39728025,40551404,41389198,42228429,43053054,43851000,44617000,45350000,46053000,46731000,47388000,48022000,48634000,49226000,49801000,50361000,50908000,51444000,51971000,52494000,53016000,53537000,54059000,54583000,55110000,55640000,56174000,56711000,57250000,57789000,58326000,58859000,59388000,59911000,60423000,60923000
3,American Samoa,ASM,"Population, total",SP.POP.TOTL,20123,20602,21253,22034,22854,23672,24462,25248,25989,26703,27363,27984,28567,29100,29596,30052,30456,30838,31269,31845,32646,33701,34968,36412,37946,39519,41119,42740,44343,45894,47347,48685,49896,51020,52095,53161,54211,55221,56171,57053,57821,58494,59080,59504,59681,59562,59107,58365,57492,56683,56079,55759,55667,55713,55791,55812,55741,55620,55465,55312,55000,55000,55000,55000,55000,55000,55000,55000,55000,55000,55000,55000,55000,55000,55000,55000,54000,54000,54000,54000,54000,54000,54000,54000,54000,54000,54000,54000,54000,54000,54000
4,Andorra,AND,"Population, total",SP.POP.TOTL,13411,14375,15370,16412,17469,18549,19647,20758,21890,23058,24276,25559,26892,28232,29520,30705,31777,32771,33737,34818,36067,37500,39114,40867,42706,44600,46517,48455,50434,52448,54509,56671,58888,60971,62677,63850,64360,64327,64142,64370,65390,67341,70049,73182,76244,78867,80993,82684,83862,84463,84449,83747,82427,80774,79213,78011,77297,77001,77006,77142,77000,77000,77000,78000,78000,78000,78000,78000,78000,78000,78000,78000,78000,78000,78000,78000,78000,78000,78000,78000,78000,78000,78000,78000,77000,77000,77000,77000,77000,76000,76000


Remove part of column name

In [24]:
# colomn bames to not change
column_names_not_split = ['Country Name', 'Country Code', 'Series Name',  'Series Code']

# Create the new list of column names by splitting if not in the list of names to keep
new_column_names = []
for col in df_population.columns:
    if col not in column_names_not_split:
        new_column_names.append(col.split(' ')[0])
    else:
        new_column_names.append(col.lower().replace(' ', '_')) # Apply string operations here

df_population.columns = new_column_names

df_population.sample(5)

Unnamed: 0,country_name,country_code,series_name,series_code,1960,1961,1962,1963,1964,1965,1966,1967,1968,1969,1970,1971,1972,1973,1974,1975,1976,1977,1978,1979,1980,1981,1982,1983,1984,1985,1986,1987,1988,1989,1990,1991,1992,1993,1994,1995,1996,1997,1998,1999,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023,2024,2025,2026,2027,2028,2029,2030,2031,2032,2033,2034,2035,2036,2037,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047,2048,2049,2050
174,Northern Mariana Islands,MNP,"Population, total",SP.POP.TOTL,9979,10245,10436,10590,10783,11022,11344,11721,12140,12574,12994,13429,13882,14325,14677,14924,15026,15048,15170,15638,16633,18158,20167,22622,25500,28720,32370,36377,40288,43540,45752,46666,46476,45758,45364,45870,47521,50056,52997,55636,57453,58321,58420,57947,57246,56542,55882,55219,54625,54193,53971,54012,54311,54784,55305,55780,56188,56562,56882,57216,58000,58000,58000,59000,59000,59000,60000,60000,60000,61000,61000,61000,62000,62000,62000,62000,63000,63000,63000,63000,63000,63000,63000,63000,63000,63000,63000,62000,62000,62000,62000
126,Latin America & Caribbean (excluding high income),LAC,"Population, total",SP.POP.TOTL,204132318,209907556,215860314,221965763,228189484,234504556,240902688,247383939,253941681,260571232,267269864,274032331,280858331,287756770,294740512,301818705,308988135,316246897,323606231,331080207,338674603,346391528,354216276,362113464,370037957,377955085,385849520,393720726,401567028,409391897,417195821,424967299,432695338,440382414,448036597,455661646,463259085,470817399,478311486,485702157,492968031,500087474,507060159,513900750,520642949,527308231,533896503,540405229,546848933,553244956,559496784,565882120,572224389,578502021,584679495,590731810,596643917,602426586,608113077,613773128,619401000,624993000,630545000,636026000,641383000,646595000,651648000,656541000,661269000,665830000,670223000,674438000,678475000,682339000,686035000,689579000,692965000,696197000,699274000,702194000,704958000,707572000,710036000,712352000,714521000,716547000,718428000,720170000,721772000,723227000,724533000
122,Kyrgyz Republic,KGZ,"Population, total",SP.POP.TOTL,2172300,2255900,2333400,2413700,2495300,2573300,2655300,2736500,2818300,2894800,2959900,3022300,3088200,3153800,3223900,3292400,3358700,3423900,3487100,3552000,3617400,3685800,3759300,3838300,3916400,3990300,4066500,4144600,4218400,4307500,4391200,4463600,4515400,4516700,4515100,4560400,4628400,4696400,4769000,4840400,4898400,4945100,4990700,5043300,5104700,5162600,5218400,5268400,5318700,5383300,5447900,5514600,5607200,5719600,5835500,5956900,6079500,6198200,6322800,6456900,6565000,6669000,6768000,6865000,6958000,7051000,7143000,7233000,7322000,7411000,7499000,7586000,7674000,7762000,7850000,7939000,8028000,8117000,8206000,8295000,8384000,8472000,8560000,8647000,8732000,8817000,8901000,8983000,9064000,9143000,9221000
238,Turkey,TUR,"Population, total",SP.POP.TOTL,27472345,28146910,28832827,29531365,30244261,30972994,31717507,32477992,33256464,34055390,34876303,35720599,36587261,37472336,38370283,39277258,40189567,41108297,42039992,42994041,43975971,44988414,46025411,47073472,48114155,49133937,50128541,51100924,52053765,52992487,53921760,54840590,55748948,56653804,57564204,58486456,59423282,60372568,61329676,62287397,63240194,64192243,65145367,66089402,67010930,67903469,68756810,69581848,70418604,71321399,72326988,73443863,74653016,75928564,77231907,78529409,79821724,81101892,82319724,83429615,84339000,85043000,85562000,85957000,86316000,86705000,87142000,87613000,88115000,88633000,89158000,89693000,90242000,90797000,91341000,91864000,92362000,92837000,93288000,93719000,94132000,94525000,94898000,95249000,95581000,95892000,96184000,96455000,96705000,96934000,97140000
16,Bangladesh,BGD,"Population, total",SP.POP.TOTL,48013504,49362843,50752157,52202007,53741716,55385112,57157654,59034249,60918454,62679765,64232482,65531633,66625705,67637530,68742233,70066301,71652381,73463584,75450032,77529045,79639491,81767515,83932127,86142495,88416521,90764183,93187603,95671163,98186350,100695497,103171956,105599127,107983704,110350639,112737683,115169930,117649932,120160564,122682815,125189651,127657854,130088702,132478086,134791603,136986432,139035505,140921167,142660376,144304167,145924797,147575430,149273778,151007807,152764676,154520167,156256276,157970840,159670593,161356039,163046161,164689000,166303000,167886000,169432000,170937000,172399000,173814000,175180000,176498000,177769000,178994000,180171000,181300000,182377000,183403000,184374000,185291000,186152000,186960000,187715000,188417000,189066000,189663000,190208000,190701000,191142000,191532000,191871000,192157000,192390000,192568000


Remove rows with `country_code` NULL

In [25]:
df_population.shape
df_population = df_population[~df_population['country_code'].isnull() == True]
df_population.shape

(264, 95)

(259, 95)

In [26]:
df_population.isna().sum().sum()

np.int64(0)

Cleaning the data before converting data types

In [27]:
columns_objects = ['country_name', 'country_code', 'series_name',  'series_code']
mask = ~df_population.columns.isin(columns_objects)
columns_to_change_dtype = df_population.columns[mask]

If you just want to see which values are causing trouble

In [28]:
for col in columns_to_change_dtype:
    bad = df_population[~df_population[col].apply(lambda x: isinstance(x, (int, float, str, type(None))))]
    if not bad.empty:
        print(f"Problematic entries in column {col}:")
        print(bad[col])

In [29]:
bad

Unnamed: 0,country_name,country_code,series_name,series_code,1960,1961,1962,1963,1964,1965,1966,1967,1968,1969,1970,1971,1972,1973,1974,1975,1976,1977,1978,1979,1980,1981,1982,1983,1984,1985,1986,1987,1988,1989,1990,1991,1992,1993,1994,1995,1996,1997,1998,1999,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023,2024,2025,2026,2027,2028,2029,2030,2031,2032,2033,2034,2035,2036,2037,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047,2048,2049,2050


In [31]:
def clean_value(val):
    # Try to convert, otherwise return pd.NA
    try:
        return int(val)
    except (ValueError, TypeError):
        return pd.NA

for col in columns_to_change_dtype:
    df_population[col] = df_population[col].apply(clean_value).astype('Int64')

#### Plot layered charts

In [32]:
df_population.tail()

Unnamed: 0,country_name,country_code,series_name,series_code,1960,1961,1962,1963,1964,1965,1966,1967,1968,1969,1970,1971,1972,1973,1974,1975,1976,1977,1978,1979,1980,1981,1982,1983,1984,1985,1986,1987,1988,1989,1990,1991,1992,1993,1994,1995,1996,1997,1998,1999,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023,2024,2025,2026,2027,2028,2029,2030,2031,2032,2033,2034,2035,2036,2037,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047,2048,2049,2050
254,West Bank and Gaza,PSE,"Population, total",SP.POP.TOTL,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1978248,2068845,2163591,2262676,2366298,2474666,2587997,2706518,2776568,2848431,2922153,2997784,3075373,3154969,3236626,3320396,3406334,3494496,3591977,3689099,3786161,3882986,3979998,4076708,4173398,4270092,4367088,4454805,4569087.0,4685306,4800000,4916000,5031000,5148000,5264000,5380000,5497000,5615000,5732000,5850000,5968000,6086000,6204000,6323000,6442000,6560000,6679000,6798000,6916000,7034000,7152000,7269000,7386000,7502000,7617000,7732000,7846000,7959000,8072000,8183000,8294000
255,World,WLD,"Population, total",SP.POP.TOTL,3031437775.0,3072480998.0,3125456671.0,3190564032.0,3256064767.0,3322973367.0,3393031801.0,3462460201.0,3532826854.0,3607499991.0,3682911039.0,3760509002.0,3836892580.0,3912347640.0,3988478324.0,4062864562.0,4135418002.0,4207766711.0,4281312782.0,4356746035.0,4432925590.0,4511137170.0,4592341169.0,4674266079.0,4755914211.0,4839074592.0,4924736807.0,5012555212.0,5101297281.0,5189996799.0,5280076284,5368065408,5452349932,5537511534,5621787194,5706689090,5789623839,5872254361,5954005524,6034491778,6114332536,6193671787,6272752974,6351882361,6431551644,6511748367,6592734542,6674203658,6756917904,6839574284,6921871614,7002860604,7085763408,7169640142,7254228377,7338964960,7424282488,7509065705,,7673533972,7753402000,7832616000,7910901000,7988218000,8064538000,8139909000,8214269000,8287618000,8359941000,8431223000,8501464000,8570653000,8638767000,8705772000,8771680000,8836518000,8900288000,8963040000,9024750000,9085413000,9145007000,9203505000,9260895000,9317139000,9372199000,9426057000,9478673000,9530075000,9580243000,9629176000,9676527000
256,"Yemen, Rep.",YEM,"Population, total",SP.POP.TOTL,5315355.0,5393036.0,5473671.0,5556766.0,5641597.0,5727751.0,5816247.0,5907874.0,6001852.0,6097035.0,6193384.0,6290365.0,6390574.0,6500816.0,6629999.0,6784695.0,6967941.0,7178675.0,7414158.0,7669694.0,7941898.0,8231910.0,8541605.0,8869370.0,9213084.0,9572175.0,9941109.0,10322043.0,10730862.0,11189177.0,11709993,12302124,12954155,13634076,14297613,14913315,15469274,15975668,16450310,16921149,17409072,17918373,18443691,18985000,19540098,20107409,20687646,21282515,21892146,22516460,23154855,23807588,24473178,25147109,25823485,26497889,27168210,27834821,28498687.0,29161922,29826000,30491000,31155000,31818000,32480000,33140000,33798000,34454000,35107000,35758000,36407000,37053000,37696000,38336000,38972000,39603000,40229000,40850000,41464000,42071000,42670000,43260000,43842000,44413000,44974000,45523000,46060000,46585000,47096000,47595000,48080000
257,Zambia,ZMB,"Population, total",SP.POP.TOTL,3070776.0,3164329.0,3260650.0,3360104.0,3463213.0,3570464.0,3681955.0,3797873.0,3918872.0,4045740.0,4179067.0,4319224.0,4466174.0,4619546.0,4778724.0,4943283.0,5112823.0,5287548.0,5468262.0,5656139.0,5851825.0,6055366.0,6265864.0,6481916.0,6701540.0,6923149.0,7146969.0,7372837.0,7598275.0,7820205.0,8036845,8246656,8451347,8656486,8869740,9096607,9339733,9597609,9866476,10140561,10415944,10692193,10971698,11256743,11550642,11856247,12173514,12502958,12848530,13215139,13605984,14023193,14465121,14926504,15399753,15879361,16363507,16853688,17351822.0,17861030,18384000,18921000,19470000,20033000,20608000,21197000,21798000,22412000,23039000,23677000,24326000,24985000,25655000,26335000,27024000,27722000,28429000,29144000,29867000,30599000,31338000,32086000,32841000,33604000,34373000,35149000,35931000,36719000,37514000,38314000,39121000
258,Zimbabwe,ZWE,"Population, total",SP.POP.TOTL,3776681.0,3905034.0,4039201.0,4178726.0,4322861.0,4471177.0,4623351.0,4779827.0,4941906.0,5111337.0,5289303.0,5476982.0,5673911.0,5877726.0,6085074.0,6293875.0,6502569.0,6712827.0,6929664.0,7160023.0,7408624.0,7675591.0,7958241.0,8254747.0,8562249.0,8877489.0,9200149.0,9527203.0,9849125.0,10153852.0,10432421,10680995,10900502,11092766,11261744,11410714,11541217,11653242,11747072,11822719,11881477,11923914,11954290,11982224,12019912,12076699,12155491,12255922,12379549,12526968,12697723,12894316,13115131,13350356,13586681,13814629,14030390,14236745,14439018.0,14645468,14863000,15092000,15331000,15581000,15841000,16110000,16390000,16680000,16979000,17285000,17596000,17912000,18233000,18557000,18883000,19212000,19542000,19873000,20205000,20535000,20864000,21190000,21514000,21835000,22152000,22465000,22772000,23075000,23372000,23663000,23948000


In [69]:
# df_population['country_name'].to_list()

In [54]:
df_north_am_population = df_population[df_population['country_name'].isin(['North America'])][columns_to_change_dtype].sum().reset_index()
df_north_am_population.columns = ['year', 'population']
df_north_am_population.head()

df_eu_population = df_population[df_population['country_name'].isin(['European Union'])][columns_to_change_dtype].sum().reset_index()
df_eu_population.columns = ['year', 'population']
df_eu_population.head()

Unnamed: 0,year,population
0,1960,198624409
1,1961,202007500
2,1962,205198600
3,1963,208253700
4,1964,211262900


Unnamed: 0,year,population
0,1960,356906076
1,1961,359998418
2,1962,363200473
3,1963,366516491
4,1964,369850244


In [66]:
# Combine traces from both figures
layered_fig = Figure()

bar_fig = px.bar(df_north_am_population,
                 x='year',
                 y='population')
line_fig = px.line(df_eu_population,
                   x='year',
                   y='population',
                   color_discrete_sequence=['red'])

layered_fig = Figure(
    data=[*bar_fig.data, *line_fig.data]
)
layered_fig.show();

In [62]:
df_poor_population = df_population[df_population['country_name'].isin(['Heavily indebted poor countries (HIPC)'])][columns_to_change_dtype].sum().reset_index()
df_poor_population.columns = ['year', 'population']
df_poor_population.head()

Unnamed: 0,year,population
0,1960,161734357
1,1961,165573152
2,1962,169567094
3,1963,173722855
4,1964,178048171


In [74]:
line_fig2 = px.line(df_poor_population,
                   x='year',
                   y='population',
                   color_discrete_sequence=['green'])


layered_fig = Figure(
    data=[*bar_fig.data, *line_fig.data, *line_fig2.data]
)
layered_fig.show();

Variant to add additional plot to Figure

In [76]:
layered_fig = Figure(
    data=[*bar_fig.data, *line_fig.data]
)

layered_fig.add_trace(line_fig2.data[0])

#  add annotation - ?
current_year = dict(
    x=2025, y=200000, text="Current year: 2025",
    font=dict(color="black"), showarrow=True, arrowhead=4
)

# Add a title
layered_fig.update_layout(title='Population of Noth America vs EU vs Poor Contries')

layered_fig.show();


###<mark>Time Buttons</mark>

#### Load the `rain` dataset

In [106]:
rain = pd.read_csv(os.path.join(path_to_raw_data, file_rain))

df_rain = rain.copy()

In [107]:
df_rain.head()
df_rain.info()
df_rain.describe()

Unnamed: 0,Date,Rainfall
0,1/1/20,0.0
1,2/1/20,0.2
2,3/1/20,0.0
3,4/1/20,0.0
4,5/1/20,0.0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 305 entries, 0 to 304
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Date      305 non-null    object 
 1   Rainfall  305 non-null    float64
dtypes: float64(1), object(1)
memory usage: 4.9+ KB


Unnamed: 0,Rainfall
count,305.0
mean,4.43082
std,14.993195
min,0.0
25%,0.0
50%,0.0
75%,2.2
max,187.0


In [108]:
df_rain['Date'] = pd.to_datetime(df_rain['Date']).dt.date


Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.



In [115]:
df_rain.head()
df_rain.sample(5)
df_rain.tail()

df_rain['Date'].min()
df_rain['Date'].max()

Unnamed: 0,Date,Rainfall
0,2020-01-01,0.0
1,2020-02-01,0.2
2,2020-03-01,0.0
3,2020-04-01,0.0
4,2020-05-01,0.0


Unnamed: 0,Date,Rainfall
128,2020-08-05,0.0
274,2020-01-10,3.2
70,2020-11-03,0.5
233,2020-08-21,0.0
199,2020-07-18,1.3


Unnamed: 0,Date,Rainfall
300,2020-10-27,13.6
301,2020-10-28,0.0
302,2020-10-29,15.3
303,2020-10-30,0.0
304,2020-10-31,2.8


datetime.date(2020, 1, 1)

datetime.date(2020, 12, 10)

In [112]:
df_rain.isna().sum()

Unnamed: 0,0
Date,0
Rainfall,0


In [110]:
date_buttons = [
    {'count': 48, 'label': "48HR", 'step': "hour", 'stepmode': "todate"},
    {'count': 28, 'label': "4WTD", 'step': "day", 'stepmode': "todate"},
    {'count': 1, 'label': "YTD", 'step': "year", 'stepmode': "todate"}]

In [111]:
# Create the line chart
fig = px.line(df_rain, x='Date', y='Rainfall',
              title= "Rainfall (mm)")

# Add the buttons
fig.update_layout(
    xaxis=dict(
        rangeselector=dict(
            buttons=date_buttons
        ),
        type='date'  # Explicitly set axis type
    )
)

fig.show();

In [None]:
# penguins = pd.read_csv(os.path.join(path_to_raw_data, file_penguins))
#                             #  ,encoding='ISO-8859-1') # or use that: encoding='cp1252'

# df_penguins = penguins.copy()

## End