In [None]:

import pandas as pd

try:
  df = pd.read_csv('gpd_v2_20220427.csv')
  print(df)
except FileNotFoundError:
  print("Error: 'gdp_v2_20220427.csv' not found. Please upload the file to your Colab environment.")
except Exception as e:
  print(f"An error occurred: {e}")

                            merging_variable    country          leader  \
0             Albania_Berisha_Campaign_2.txt    Albania    Sali Berisha   
1             Albania_Berisha_Campaign_2.txt    Albania    Sali Berisha   
2             Albania_Berisha_Campaign_2.txt    Albania    Sali Berisha   
3             Albania_Berisha_Campaign_2.txt    Albania    Sali Berisha   
4               Albania_Berisha_Ribbon_2.txt    Albania    Sali Berisha   
...                                      ...        ...             ...   
4955  Australia_Morrison_International_1.txt  Australia  Scott Morrison   
4956         Australia_Morrison_Ribbon_1.txt  Australia  Scott Morrison   
4957         Australia_Morrison_Ribbon_1.txt  Australia  Scott Morrison   
4958         Australia_Morrison_Ribbon_1.txt  Australia  Scott Morrison   
4959         Australia_Morrison_Ribbon_1.txt  Australia  Scott Morrison   

                            party   lr  president  term startofterm  \
0     Democratic Party of Al

In [24]:
import pandas as pd
import altair as alt
from sklearn.linear_model import LinearRegression
import numpy as np

# Define color constants from the palette
PRIMARY_COLOR = '#000080'  # Navy blue
WHITE = '#ffffff'
LIGHT_GRAY = '#f5f5f5'
DARK_GRAY = '#333333'

# Create the base chart with tooltip encoding
base = alt.Chart(combined_df).encode(
    x=alt.X('year:Q',
            scale=alt.Scale(domain=[START_YEAR, END_YEAR + PREDICTION_YEARS]),
            axis=alt.Axis(
                tickCount=15,
                grid=True,
                gridColor=LIGHT_GRAY,
                gridWidth=1,
                tickColor=DARK_GRAY,
                domainColor=DARK_GRAY,
                format='d',
                title='Year'
            )),
    tooltip=[
        alt.Tooltip('year:Q', title='Year', format='d'),
        alt.Tooltip('average_populism:Q', title='Populism Score', format='.2f'),
        alt.Tooltip('type:N', title='Data Type')
    ]
)

# Create confidence interval area
confidence_area = alt.Chart(pd.concat([confidence_start, trend_df])).mark_area(
    opacity=0.2,
    color=PRIMARY_COLOR
).encode(
    x=alt.X('year:Q'),
    y='lower_bound:Q',
    y2='upper_bound:Q',
    tooltip=[
        alt.Tooltip('year:Q', title='Year', format='d'),
        alt.Tooltip('lower_bound:Q', title='Lower Bound', format='.2f'),
        alt.Tooltip('upper_bound:Q', title='Upper Bound', format='.2f')
    ]
)

# Create historical line
historical_line = base.mark_line(
    color=PRIMARY_COLOR,
    strokeWidth=2.5
).encode(
    y=alt.Y('average_populism:Q',
            axis=alt.Axis(
                grid=True,
                gridColor=LIGHT_GRAY,
                gridWidth=1,
                tickColor=DARK_GRAY,
                domainColor=DARK_GRAY,
                title='Average Populism Score'
            ))
).transform_filter(
    alt.datum.type == 'Historical'
)

# Create prediction line
prediction_line = alt.Chart(trend_df).mark_line(
    strokeDash=[6, 4],
    color=PRIMARY_COLOR,
    strokeWidth=2.5
).encode(
    x=alt.X('year:Q',
            scale=alt.Scale(domain=[START_YEAR, END_YEAR + PREDICTION_YEARS])),
    y='average_populism:Q',
    tooltip=[
        alt.Tooltip('year:Q', title='Year', format='d'),
        alt.Tooltip('average_populism:Q', title='Predicted Score', format='.2f')
    ]
)

# Create points layer
points = base.mark_point(
    color=PRIMARY_COLOR,
    fill=WHITE,
    size=100,
    stroke=PRIMARY_COLOR,
    strokeWidth=2
).encode(
    y='average_populism:Q',
    tooltip=[
        alt.Tooltip('year:Q', title='Year', format='d'),
        alt.Tooltip('average_populism:Q', title='Populism Score', format='.2f'),
        alt.Tooltip('type:N', title='Data Type')
    ]
).transform_filter(
    alt.datum.type == 'Historical'
)

# Add source text
source = alt.Chart(pd.DataFrame({
    'text': ['Source: Hawkins et al. (2019) Global Populism Database, Harvard Dataverse, V2']
})).mark_text(
    align='left',
    baseline='bottom',
    fontSize=9,
    color=DARK_GRAY,
    dx=5,
    dy=-5
).encode(
    text='text:N'
)

# Combine all layers
final_chart = alt.vconcat(
    (confidence_area + historical_line + prediction_line + points).properties(
        width=800,
        height=400,
        title={
            'text': 'Evolution of Populist Discourse in OECD Countries with 5-Year Projection',
            'subtitle': 'Based on speeches from national leaders (2000 - 2022), with prediction interval',
            'color': DARK_GRAY,
            'subtitleColor': DARK_GRAY,
            'fontSize': 14,
            'subtitleFontSize': 11,
            'font': 'Helvetica',
            'fontWeight': 'bold',
            'subtitleFontWeight': 'normal',
            'anchor': 'start'
        }
    ),
    source,
    spacing=5
).properties(
    background=LIGHT_GRAY
).configure_view(
    strokeWidth=0
).configure_axis(
    labelColor=DARK_GRAY,
    titleColor=DARK_GRAY,
    labelFontSize=11,
    titleFontSize=11
)

# Display the chart
final_chart

NameError: name 'combined_df' is not defined

In [None]:
# Save the chart as JSON
final_chart.save('project_chart1.json')

*CHART 2*

In [25]:
# Create a summary for each country showing date ranges
summary_data = []

for country in sorted(df['country'].unique()):
    country_data = df[df['country'] == country]

    # Get all unique years where we have data
    years = set()
    for _, row in country_data.iterrows():
        start = row['yearbegin']
        end = 2022 if row['yearend'] == 'present' else row['yearend']
        years.update(range(start, end + 1))

    # Sort years and find gaps
    years = sorted(list(years))

    summary_data.append({
        'country': country,
        'year_coverage': years,
        'start_year': min(years) if years else None,
        'end_year': max(years) if years else None,
        'n_years': len(years),
        'years': ', '.join(map(str, years))
    })

# Convert to DataFrame and sort by country
summary_df = pd.DataFrame(summary_data)
summary_df = summary_df.sort_values('country')

# Print the summary
print("Date coverage by country:")
print("-" * 80)
for _, row in summary_df.iterrows():
    print(f"\n{row['country']}:")
    print(f"Coverage: {row['start_year']} to {row['end_year']} ({row['n_years']} years)")
    print(f"Years with data: {row['years']}")
print("\nRecommendation for analysis:")
print("-" * 80)

# Find years with most coverage
year_counts = {}
for row in summary_data:
    for year in row['year_coverage']:
        year_counts[year] = year_counts.get(year, 0) + 1

# Sort years by coverage
sorted_years = sorted(year_counts.items(), key=lambda x: (x[1], x[0]), reverse=True)

print("\nYear coverage across countries:")
for year, count in sorted_years:
    print(f"Year {year}: {count} countries")

# Find best years for comparison
print("\nBest years for comparison:")
top_years = sorted([year for year, count in sorted_years if count >= len(summary_df) * 0.8])
if len(top_years) >= 2:
    print(f"Suggestion: Compare {top_years[0]} with {top_years[-1]} (these years have data for most countries)")
else:
    print("No years have data for >80% of countries. Consider using years with most coverage:")
    top_5_years = sorted([year for year, count in sorted_years[:5]])
    print(f"Years with most coverage: {', '.join(map(str, top_5_years))}")

NameError: name 'df' is not defined

In [None]:
final_chart.save('Project_chart2.json')

*CHART 2B: most growing countries*

In [26]:
# First create your chart and store it in a variable
chart = alt.Chart(
    alt.topo_feature('https://vega.github.io/vega-datasets/data/world-110m.json', 'countries')
).mark_geoshape(
    stroke='white',
    strokeWidth=0.5
).encode(
    color=alt.condition(
        alt.datum.id != '',
        alt.Color(
            'absolute_change:Q',
            scale=alt.Scale(
                domain=[growth_df['absolute_change'].min(), 0, growth_df['absolute_change'].max()],
                range=['#003C7D', '#ffffbf', '#B91D1D']
            ),
            legend=alt.Legend(
                title='Change in Populism Score',
                orient='right',
                gradientLength=400
            )
        ),
        alt.value('#F0F0F0')
    ),
    tooltip=[
        alt.Tooltip('country:N', title='Country'),
        alt.Tooltip('absolute_change:Q', title='Absolute Change', format='.3f'),
        alt.Tooltip('percent_change:Q', title='Percent Change', format='.1f'),
        alt.Tooltip('start_score:Q', title='Initial Score', format='.3f'),
        alt.Tooltip('end_score:Q', title='Final Score', format='.3f'),
        alt.Tooltip('start_year:Q', title='Start Year', format='d'),
        alt.Tooltip('end_year:Q', title='End Year', format='d')
    ]
).transform_lookup(
    lookup='id',
    from_=alt.LookupData(
        data=growth_df,
        key='id',
        fields=['country', 'absolute_change', 'percent_change', 'start_score', 'end_score', 'start_year', 'end_year']
    )
).project(
    type='equirectangular'
).properties(
    width=1920,
    height=1080,
    title={
        'text': 'Change in Populist Discourse Scores in OECD Countries (2000-2022)',
        'subtitle': [
            f'Shows absolute change in populism scores between 2000 and 2022',
            'Red indicates increase, blue indicates decrease in populism score'
        ],
        'color': '#333333',
        'subtitleColor': '#666666',
        'fontSize': 24,
        'subtitleFontSize': 18,
        'font': 'Helvetica',
        'fontWeight': 'bold',
        'subtitleFontWeight': 'normal',
        'anchor': 'start'
    }
).configure_view(
    strokeWidth=0
).configure(
    background='#F8F9FA'
)

NameError: name 'growth_df' is not defined

In [None]:
chart

In [None]:
# Save the chart specification to a JSON file
with open('project_chart5.json', 'w') as f:
    f.write(chart.to_json())

print("Chart specification has been saved to 'project_chart5.json'")

Chart specification has been saved to 'project_chart5.json'


In [None]:
import pandas as pd
import json

# Dictionary mapping countries to their ISO3 codes
iso3_mapping = {
    'Austria': 'AUT',
    'Chile': 'CHL',
    'Colombia': 'COL',
    'Costa Rica': 'CRI',
    'Germany': 'DEU',
    'Hungary': 'HUN',
    'Spain': 'ESP',
    'Australia': 'AUS',
    'Canada': 'CAN',
    'Mexico': 'MEX',
    'Turkey': 'TUR',
    'United States': 'USA',
    'Greece': 'GRC',
    'Italy': 'ITA',
    'Netherlands': 'NLD',
    'Poland': 'POL',
    'Sweden': 'SWE',
    'France': 'FRA',
    'Ireland': 'IRL',
    'Switzerland': 'CHE',
    'Norway': 'NOR',
    'Lithuania': 'LTU',
    'Finland': 'FIN',
    'Estonia': 'EST',
    'Japan': 'JPN',
    'Slovenia': 'SVN',
    'Latvia': 'LVA'
}

# Dictionary mapping ISO3 codes to numeric IDs
iso_to_id = {
    'AUT': 40,
    'CHL': 152,
    'COL': 170,
    'CRI': 188,
    'DEU': 276,
    'HUN': 348,
    'ESP': 724,
    'AUS': 36,
    'CAN': 124,
    'MEX': 484,
    'TUR': 792,
    'USA': 840,
    'GRC': 300,
    'ITA': 380,
    'NLD': 528,
    'POL': 616,
    'SWE': 752,
    'FRA': 250,
    'IRL': 372,
    'CHE': 756,
    'NOR': 578,
    'LTU': 440,
    'FIN': 246,
    'EST': 233,
    'JPN': 392,
    'SVN': 705,
    'LVA': 428
}

# Read the JSON file and create DataFrame
with open('project_chart2_data.json', 'r') as file:
    data = json.load(file)
scores_df = pd.DataFrame(data)

# Calculate absolute differences
def calculate_absolute_difference(group):
    # Sort by year to ensure correct order
    group = group.sort_values('year')
    # Get first and last scores
    first_score = group['score'].iloc[0]
    last_score = group['score'].iloc[-1]

    # Calculate absolute difference
    return last_score - first_score

# Group by country and calculate absolute differences
differences = scores_df.groupby('country').apply(calculate_absolute_difference)

# Create the growth_df DataFrame
growth_df = pd.DataFrame({
    'country': differences.index,
    'absolute_change': differences.values
})

# Add ISO3 codes and numeric IDs
growth_df['iso3'] = growth_df['country'].map(iso3_mapping)
growth_df['id'] = growth_df['iso3'].map(iso_to_id)

# Sort by absolute change in descending order
growth_df = growth_df.sort_values('absolute_change', ascending=False)

# Reset index
growth_df = growth_df.reset_index(drop=True)

# Display the results
print("Absolute changes by country with ISO3 codes:")
print(growth_df)

# Save to JSON file
growth_df.to_json('project_chart2_data2.json', orient='records')

Absolute changes by country with ISO3 codes:
          country  absolute_change iso3   id
0          Turkey         1.341667  TUR  792
1          Mexico         0.712500  MEX  484
2   United States         0.566667  USA  840
3         Hungary         0.450000  HUN  348
4      Costa Rica         0.316667  CRI  188
5     Switzerland         0.300000  CHE  756
6           Spain         0.175000  ESP  724
7          Poland         0.175000  POL  616
8       Lithuania         0.108333  LTU  440
9          Sweden         0.100000  SWE  752
10         France         0.100000  FRA  250
11         Canada         0.087500  CAN  124
12         Norway         0.075000  NOR  578
13          Chile         0.066667  CHL  152
14        Ireland         0.062500  IRL  372
15       Colombia         0.037500  COL  170
16        Germany         0.037500  DEU  276
17        Finland         0.000000  FIN  246
18        Estonia         0.000000  EST  233
19      Australia        -0.006250  AUS   36
20    Neth

  differences = scores_df.groupby('country').apply(calculate_absolute_difference)


In [None]:
# Convert the DataFrame to JSON and save it
growth_df.to_json('project_chart2_data2.json', orient='records')

*CHART 3*

In [23]:
# Try using csv module first to properly handle the complex quoting
import csv
import io

# Read the file first
with open('OECD_GDP__per_capita_growth.csv', 'r', encoding='utf-8-sig') as file:
    # Create a CSV reader that handles the double-quoted fields
    reader = csv.reader(file, quoting=csv.QUOTE_MINIMAL)
    # Convert to list to inspect
    data = list(reader)
    # Convert to DataFrame
    df_growth = pd.DataFrame(data[1:], columns=data[0])

# Alternatively, we can try pandas with very specific parameters:
df_growth = pd.read_csv('OECD_GDP__per_capita_growth.csv',
                       encoding='utf-8-sig',
                       sep=',',
                       quoting=csv.QUOTE_MINIMAL,
                       quotechar='"',
                       escapechar=None,
                       doublequote=True)

print("Columns:", df_growth.columns)
print("\nShape:", df_growth.shape)
print("\nFirst few rows:")
print(df_growth.head())

Columns: Index(['"Country Name","Country Code","Indicator Name","Indicator Code","1960","1961","1962","1963","1964","1965","1966","1967","1968","1969","1970","1971","1972","1973","1974","1975","1976","1977","1978","1979","1980","1981","1982","1983","1984","1985","1986","1987","1988","1989","1990","1991","1992","1993","1994","1995","1996","1997","1998","1999","2000","2001","2002","2003","2004","2005","2006","2007","2008","2009","2010","2011","2012","2013","2014","2015","2016","2017","2018","2019","2020","2021","2022","2023",'], dtype='object')

Shape: (1, 1)

First few rows:
  "Country Name","Country Code","Indicator Name","Indicator Code","1960","1961","1962","1963","1964","1965","1966","1967","1968","1969","1970","1971","1972","1973","1974","1975","1976","1977","1978","1979","1980","1981","1982","1983","1984","1985","1986","1987","1988","1989","1990","1991","1992","1993","1994","1995","1996","1997","1998","1999","2000","2001","2002","2003","2004","2005","2006","2007","2008","2009","20

In [16]:
# Check raw file content
with open('OECD_GDP__per_capita_growth.csv', 'r') as file:
    print(file.readline())  # Print first line
    print(file.readline())  # Print second line

﻿"Country Name,""Country Code"",""Indicator Name"",""Indicator Code"",""1960"",""1961"",""1962"",""1963"",""1964"",""1965"",""1966"",""1967"",""1968"",""1969"",""1970"",""1971"",""1972"",""1973"",""1974"",""1975"",""1976"",""1977"",""1978"",""1979"",""1980"",""1981"",""1982"",""1983"",""1984"",""1985"",""1986"",""1987"",""1988"",""1989"",""1990"",""1991"",""1992"",""1993"",""1994"",""1995"",""1996"",""1997"",""1998"",""1999"",""2000"",""2001"",""2002"",""2003"",""2004"",""2005"",""2006"",""2007"",""2008"",""2009"",""2010"",""2011"",""2012"",""2013"",""2014"",""2015"",""2016"",""2017"",""2018"",""2019"",""2020"",""2021"",""2022"",""2023"","

"OECD members,""OED"",""GDP per capita growth (annual %)"",""NY.GDP.PCAP.KD.ZG"","""",""2.97260545456533"",""4.10172340687301"",""3.66571611082334"",""5.00965800835016"",""4.10037946705832"",""4.25981086092244"",""2.58689401941494"",""4.72823463374219"",""4.31897960912985"",""1.61912006358003"",""2.41289688488948"",""4.16356982381225"",""4.885097162

In [None]:
import pandas as pd
import altair as alt

try:
  df_gov = pd.read_csv('2d540c87-490f-487f-a55f-ad766f804aa1_Data.csv')
  print(df_gov)
except FileNotFoundError:
  print("Error: '2d540c87-490f-487f-a55f-ad766f804aa1_Data.csv' not found. Please upload the file to your Colab environment.")
except Exception as e:
  print(f"An error occurred: {e}")

    Country Name Country Code                               Series Name  \
0    Afghanistan          AFG        Government Effectiveness: Estimate   
1    Afghanistan          AFG  Government Effectiveness: Standard Error   
2        Albania          ALB        Government Effectiveness: Estimate   
3        Albania          ALB  Government Effectiveness: Standard Error   
4        Algeria          DZA        Government Effectiveness: Estimate   
..           ...          ...                                       ...   
423   Yemen Rep.          YEM  Government Effectiveness: Standard Error   
424       Zambia          ZMB        Government Effectiveness: Estimate   
425       Zambia          ZMB  Government Effectiveness: Standard Error   
426     Zimbabwe          ZWE        Government Effectiveness: Estimate   
427     Zimbabwe          ZWE  Government Effectiveness: Standard Error   

    Series Code       2000 [YR2000]       2002 [YR2002]       2003 [YR2003]  \
0        GE.EST   -2

In [None]:
# prompt: read gpd_v2_20220427.csv in new df

try:
  df_new = pd.read_csv('gpd_v2_20220427.csv')
  print(df_new)
except FileNotFoundError:
  print("Error: 'gpd_v2_20220427.csv' not found. Please upload the file to your Colab environment.")
except Exception as e:
  print(f"An error occurred: {e}")

                            merging_variable    country          leader  \
0             Albania_Berisha_Campaign_2.txt    Albania    Sali Berisha   
1             Albania_Berisha_Campaign_2.txt    Albania    Sali Berisha   
2             Albania_Berisha_Campaign_2.txt    Albania    Sali Berisha   
3             Albania_Berisha_Campaign_2.txt    Albania    Sali Berisha   
4               Albania_Berisha_Ribbon_2.txt    Albania    Sali Berisha   
...                                      ...        ...             ...   
4955  Australia_Morrison_International_1.txt  Australia  Scott Morrison   
4956         Australia_Morrison_Ribbon_1.txt  Australia  Scott Morrison   
4957         Australia_Morrison_Ribbon_1.txt  Australia  Scott Morrison   
4958         Australia_Morrison_Ribbon_1.txt  Australia  Scott Morrison   
4959         Australia_Morrison_Ribbon_1.txt  Australia  Scott Morrison   

                            party   lr  president  term startofterm  \
0     Democratic Party of Al

In [None]:
import altair as alt
import pandas as pd
import numpy as np

# Create country_data from df_new, using averagerubric for populism scores
country_data = df_new.groupby(['country', 'yearbegin'])['averagerubric'].mean().reset_index()
country_data = country_data.rename(columns={
    'country': 'Country Name',
    'yearbegin': 'Year',
    'averagerubric': 'Populism_Score'
})

# Rename columns to just the year
df_gov = df_gov.rename(columns={col: col.split(' ')[0] for col in df_gov.columns if ' [YR' in col})

df_gov_melted = pd.melt(
    df_gov,
    id_vars=['Country Name'],
    value_vars=[str(year) for year in range(2000, 2023) if year != 2001],  # Exclude 2001
    var_name='Year',
    value_name='Government_Effectiveness'
)

# Convert 'Year' column to numeric
df_gov_melted['Year'] = df_gov_melted['Year'].astype(int)

# Convert Government_Effectiveness and Populism_Score to numeric
df_gov_melted['Government_Effectiveness'] = pd.to_numeric(df_gov_melted['Government_Effectiveness'], errors='coerce')
country_data['Populism_Score'] = pd.to_numeric(country_data['Populism_Score'], errors='coerce')

# Merge with government effectiveness data
merged_df = pd.merge(
    df_gov_melted,
    country_data,
    on=['Country Name', 'Year'],
    how='inner'
)

# Drop any NaN values
merged_df = merged_df.dropna(subset=['Populism_Score', 'Government_Effectiveness'])

# Create the scatter plot with updated styling
scatter = alt.Chart(merged_df).mark_circle(size=60).encode(
    x=alt.X('Populism_Score:Q',
            title='Populism Score',
            scale=alt.Scale(zero=False)),
    y=alt.Y('Government_Effectiveness:Q',
            title='Government Effectiveness',
            scale=alt.Scale(zero=False)),
    color=alt.Color('Country Name:N',
                   scale=alt.Scale(scheme='tableau10'),
                   legend=None),  # Remove the legend
    tooltip=[
        alt.Tooltip('Country Name:N', title='Country'),
        alt.Tooltip('Year:Q', title='Year'),
        alt.Tooltip('Populism_Score:Q', title='Populism Score', format='.2f'),
        alt.Tooltip('Government_Effectiveness:Q', title='Government Effectiveness', format='.2f')
    ]
).properties(
    width=800,
    height=500,
    title={
        'text': 'Populism Score vs Government Effectiveness',
        'subtitle': [
            'Analysis of OECD Countries (2000-2022)',
            'Sources: World Bank; Hawkins et al. (2019)'  # Added sources as second subtitle line
        ],
        'anchor': 'start',
        'fontSize': 16,
        'subtitleFontSize': 14,
        'subtitleColor': '#666666'
    }
)

# Add a trend line with updated styling
regression = alt.Chart(merged_df).transform_regression(
    'Populism_Score',
    'Government_Effectiveness',
    method='linear'
).mark_line(
    color='#35978f',
    strokeDash=[5,5]
).encode(
    x='Populism_Score:Q',
    y='Government_Effectiveness:Q'
)

# Combine scatter plot and trend line with configuration
chart = (scatter + regression).configure_view(
    stroke='transparent'
).configure_axis(
    labelFontSize=12,
    titleFontSize=14
).configure_legend(
    labelFontSize=12,
    titleFontSize=14
).interactive()

# Display the chart
chart.show()

# Save to JSON
chart.save('Project_chart3.json')

# Print summary statistics
print("\nCorrelation between Populism Score and Government Effectiveness:")
print(merged_df['Populism_Score'].corr(merged_df['Government_Effectiveness']))

print("\nNumber of observations in final merged dataset:", len(merged_df))
print("\nSummary of merged data:")
print(merged_df.describe())


Correlation between Populism Score and Government Effectiveness:
-0.2193563801088264

Number of observations in final merged dataset: 482

Summary of merged data:
              Year  Government_Effectiveness  Populism_Score
count   482.000000                482.000000      482.000000
mean   2009.481328                  0.290762        0.291551
std       4.943828                  0.665041        0.337687
min    2000.000000                 -1.606984        0.000000
25%    2006.000000                  0.178269        0.037500
50%    2009.000000                  0.208498        0.187500
75%    2014.000000                  0.275531        0.375000
max    2021.000000                  2.214624        1.733333


In [None]:
# Save to JSON with the specified filename
chart.save('Project_chart3.json')

In [None]:
# Check observations where Populism Score is 0
zero_populism = merged_df[merged_df['Populism_Score'] == 0]

print("Number of observations with Populism Score = 0:", len(zero_populism))
print("\nBreakdown by country and year for zero populism scores:")
print(zero_populism[['Country Name', 'Year', 'Populism_Score', 'Government_Effectiveness']].sort_values('Country Name'))

# Let's also check the original data before averaging
zero_populism_raw = df_new[df_new['averagerubric'] == 0]
print("\nSummary of original data where averagerubric = 0:")
print("Number of speeches with zero scores:", len(zero_populism_raw))
print("\nBreakdown by speechtype:")
print(zero_populism_raw['speechtype'].value_counts())

Number of observations with Populism Score = 0: 100

Breakdown by country and year for zero populism scores:
     Country Name  Year  Populism_Score  Government_Effectiveness
98        Albania  2005             0.0                 -0.696387
99        Albania  2005             0.0                  0.208882
172     Argentina  2008             0.0                 -0.123509
173     Argentina  2008             0.0                  0.216380
43        Armenia  2003             0.0                  0.259704
..            ...   ...             ...                       ...
38         Sweden  2002             0.0                  1.962593
156  Turkmenistan  2006             0.0                 -1.517654
157  Turkmenistan  2006             0.0                  0.296224
15     Uzbekistan  2000             0.0                  0.395829
14     Uzbekistan  2000             0.0                 -1.019983

[100 rows x 4 columns]

Summary of original data where averagerubric = 0:
Number of speeches with 

In [None]:
# Count observations by year
yearly_counts = merged_df['Year'].value_counts().sort_index()

print("Number of observations per year:")
print(yearly_counts)

print("\nTotal number of observations:", len(merged_df))

# Let's also see average populism score by year
yearly_avg = merged_df.groupby('Year')[['Populism_Score', 'Government_Effectiveness']].agg({
    'Populism_Score': ['mean', 'count'],
    'Government_Effectiveness': 'mean'
}).round(3)

print("\nYearly averages and counts:")
print(yearly_avg)

Number of observations per year:
Year
2000    16
2002    24
2003    20
2004    38
2005    22
2006    38
2007    14
2008    42
2009    32
2010    30
2011    26
2012    26
2013    26
2014    42
2015    18
2016    32
2017    16
2018    12
2019     6
2021     2
Name: count, dtype: int64

Total number of observations: 482

Yearly averages and counts:
     Populism_Score       Government_Effectiveness
               mean count                     mean
Year                                              
2000          0.099    16                    0.319
2002          0.157    24                    0.378
2003          0.343    20                    0.181
2004          0.255    38                    0.280
2005          0.299    22                    0.206
2006          0.373    38                    0.287
2007          0.570    14                    0.245
2008          0.289    42                    0.225
2009          0.305    32                    0.313
2010          0.307    30               

In [None]:
# Check for NaN values in the merged dataset
print("NaN values in merged_df:")
print(merged_df.isna().sum())

# Check for NaN values in the original datasets before merging
print("\nNaN values in government effectiveness data:")
print(df_gov_melted.isna().sum())

print("\nNaN values in populism data before merging:")
print(country_data.isna().sum())

# Look at rows with any NaN values
print("\nRows containing any NaN values in merged_df:")
print(merged_df[merged_df.isna().any(axis=1)])

# Also check for 'None' or empty string values that might not be caught by isna()
print("\nUnique values in each column to check for unusual values:")
for column in merged_df.columns:
    print(f"\n{column}:")
    print(merged_df[column].unique()[:10], "...")  # Show first 10 unique values

NaN values in merged_df:
Country Name                0
Year                        0
Government_Effectiveness    0
Populism_Score              0
dtype: int64

NaN values in government effectiveness data:
Country Name                  0
Year                          0
Government_Effectiveness    340
dtype: int64

NaN values in populism data before merging:
Country Name      0
Year              0
Populism_Score    0
dtype: int64

Rows containing any NaN values in merged_df:
Empty DataFrame
Columns: [Country Name, Year, Government_Effectiveness, Populism_Score]
Index: []

Unique values in each column to check for unusual values:

Country Name:
['Austria' 'Chile' 'Croatia' 'Dominican Republic' 'Ecuador' 'Spain'
 'Uruguay' 'Uzbekistan' 'Argentina' 'Australia'] ...

Year:
[2000 2002 2003 2004 2005 2006 2007 2008 2009 2010] ...

Government_Effectiveness:
[ 1.84726524  0.2030091   1.00555968  0.36339438  0.23566958 -0.42515957
  0.23857871 -0.68529028  0.21671158  1.68575728] ...

Populism_Sco

*CHART 4*

a3bee049-01af-4985-b2d2-42694bee774c_Data.csv

In [None]:
import pandas as pd
import altair as alt
import numpy as np

# Define OECD country codes
oecd_country_codes = [
    'AUS', 'AUT', 'BEL', 'CAN', 'CHL', 'COL', 'CRI', 'CZE', 'DNK', 'EST',
    'FIN', 'FRA', 'DEU', 'GRC', 'HUN', 'ISL', 'IRL', 'ISR', 'ITA', 'JPN',
    'KOR', 'LVA', 'LTU', 'LUX', 'MEX', 'NLD', 'NZL', 'NOR', 'POL', 'PRT',
    'SVK', 'SVN', 'ESP', 'SWE', 'CHE', 'TUR', 'GBR', 'USA'
]

# Define years
START_YEAR = 2000
END_YEAR = 2022

# GDP data processing
df_gdp = pd.read_csv('a3bee049-01af-4985-b2d2-42694bee774c_Data.csv')
df_gdp_oecd = df_gdp[df_gdp['Country Code'].isin(oecd_country_codes)]
year_cols = [col for col in df_gdp_oecd.columns if '[YR' in col]

# Calculate average GDP for each year and take the log
gdp_data = []
for col in year_cols:
    year = int(col.split('[YR')[1].split(']')[0])
    avg_gdp = pd.to_numeric(df_gdp_oecd[col], errors='coerce').mean()
    if not pd.isna(avg_gdp) and avg_gdp > 0:  # Ensure we don't take log of negative or zero
        gdp_data.append({
            'year': year,
            'value': np.log10(avg_gdp),
            'metric': 'Log(GDP PPP)',
            'original_value': avg_gdp
        })

gdp_df = pd.DataFrame(gdp_data)

# Populism data processing
# Convert year columns to numeric
df['yearbegin'] = pd.to_numeric(df['yearbegin'], errors='coerce')
df['yearend'] = pd.to_numeric(df['yearend'], errors='coerce')

populism_data = []
for year in range(START_YEAR, END_YEAR + 1):
    year_speeches = df[
        (df['yearbegin'] <= year) &
        (df['yearend'] >= year)
    ]
    if not year_speeches.empty:
        avg_rubric = year_speeches['averagerubric'].mean()
        populism_data.append({
            'year': year,
            'value': avg_rubric,
            'metric': 'Populism Score',
            'speech_count': len(year_speeches)
        })

populism_df = pd.DataFrame(populism_data)

# Combine both datasets
plot_data = pd.concat([gdp_df, populism_df], ignore_index=True)

# Create tooltips for populism line
populism_tooltip = [
    alt.Tooltip('year:Q', title='Year'),
    alt.Tooltip('value:Q', title='Populism Score', format='.3f'),
    alt.Tooltip('speech_count:Q', title='Number of Speeches')
]

# Create tooltips for GDP line
gdp_tooltip = [
    alt.Tooltip('year:Q', title='Year'),
    alt.Tooltip('value:Q', title='Log(GDP PPP)', format='.3f'),
    alt.Tooltip('original_value:Q', title='GDP PPP', format=',.0f')
]

# Create the base chart
base = alt.Chart(plot_data)

# Create populism line (left y-axis)
populism_line = base.mark_line(strokeWidth=2).encode(
    x=alt.X('year:Q',
            scale=alt.Scale(domain=[START_YEAR, END_YEAR]),
            axis=alt.Axis(title='Year', format='d')),
    y=alt.Y('value:Q',
            title='Populism Score',
            scale=alt.Scale(domain=[0, 0.5])),
    color=alt.value('#E41A1C'),  # Red color for populism
    tooltip=populism_tooltip
).transform_filter(
    alt.datum.metric == 'Populism Score'
)

# Create GDP line (right y-axis)
gdp_line = base.mark_line(strokeWidth=2).encode(
    x='year:Q',
    y=alt.Y('value:Q',
            title='Log(GDP PPP)',
            scale=alt.Scale(domain=[11.8, 12.4])),
    color=alt.value('#2E86AB'),  # Blue color for GDP
    tooltip=gdp_tooltip
).transform_filter(
    alt.datum.metric == 'Log(GDP PPP)'
)

# Create points for both lines with tooltips
populism_points = populism_line.mark_point(size=60, opacity=0.7).encode(tooltip=populism_tooltip)
gdp_points = gdp_line.mark_point(size=60, opacity=0.7).encode(tooltip=gdp_tooltip)

# Add a legend
legend = alt.Chart(pd.DataFrame([
    {'metric': 'Populism Score', 'color': '#E41A1C'},
    {'metric': 'Log(GDP PPP)', 'color': '#2E86AB'}
])).mark_point(size=100, filled=True).encode(
    y=alt.Y('metric:N', axis=alt.Axis(title=None)),
    color=alt.Color('color:N', scale=None)
)

# Combine charts with independent y-axes
final_chart = alt.layer(
    populism_line, gdp_line, populism_points, gdp_points
).resolve_scale(
    y='independent'
).properties(
    width=800,
    height=400,
    title={
        'text': 'Evolution of Populist Discourse and GDP in OECD Countries',
        'subtitle': 'Based on speeches from national leaders and GDP PPP (2000 - 2022)',
        'color': '#333333',
        'subtitleColor': '#666666',
        'fontSize': 14,
        'subtitleFontSize': 11,
        'font': 'Helvetica',
        'fontWeight': 'bold',
        'subtitleFontWeight': 'normal',
        'anchor': 'start'
    }
).configure_axis(
    grid=True,
    gridColor='#E9ECEF',
    gridWidth=1,
    tickColor='#CCCCCC',
    domainColor='#CCCCCC',
    labelColor='#666666',
    titleColor='#666666'
).configure_view(
    strokeWidth=0
)

# Display the chart
final_chart

In [None]:
final_chart.save('project_chart4.json')

*CHART 6: how much is talked about illegal immigration in the news*

In [None]:
df_migrants = pd.read_csv("migrants_news.csv")

In [None]:
df_migrants.sort_values(by="Date", ascending=True)

Unnamed: 0,GOID,Title,Date,Source Type,Authors,Publication ID,Publication Title,Publisher City,Publisher Province
28636,91828019,It's the American Way,2000-01-02,Newspapers,"['CROSSETTE, BARBARA']",45545,New York Times (1923-),NEW YORK,NY
34133,431356964,Europe Stares at a Future Built by Immigrants,2000-01-02,Newspapers,"['Crossette, Barbara']",11561,New York Times,NEW YORK,NY
10722,418967492,400 PROTEST DETENTION OF SHIP'S ILLEGAL IMMIGR...,2000-01-03,Newspapers,,46852,Chicago Tribune,Chicago,IL
18536,398715943,World-Wide,2000-01-03,Newspapers,,10482,Wall Street Journal,South Brunswick,NJ
10731,419060559,HEALTH EXPERTS URGE SCREENING MIGRANTS FOR TB,2000-01-03,Newspapers,,46852,Chicago Tribune,Chicago,IL
...,...,...,...,...,...,...,...,...,...
6169,3148523763,Thailand: Public Health Ministry to continue p...,2024-12-24,Newspapers,,656306,Asia News Monitor,Bangkok,
6170,3148700744,THE WORLD; Destructive Cyclone Chido unearths ...,2024-12-24,Newspapers,"['Mednick, Sam', 'Adamson, Thomas']",46999,Los Angeles Times,Los Angeles,CA
6171,3148700757,PERSPECTIVES; Mexican officials are wary of Tr...,2024-12-24,Newspapers,"['Wilkinson, Tracy', 'Linthicum, Kate']",46999,Los Angeles Times,Los Angeles,CA
6174,3148748004,Joe Biden commutes sentences of 37 federal dea...,2024-12-24,Newspapers,,55189,The Financial Daily,Karachi,


In [None]:
import pandas as pd
import altair as alt

def plot_migrant_news_distribution(file_path):
   df = pd.read_csv(file_path)
   df['Date'] = pd.to_datetime(df['Date'])
   df['Year'] = df['Date'].dt.year

   yearly_counts = df['Year'].value_counts().reset_index()
   yearly_counts.columns = ['Year', 'Count']

   title = alt.TitleParams(
       text='Distribution of Migration News Articles (2000-2024)',
       subtitle='Data fetched with ProQuest TDM as of 24/12/2024 from Newspaper publications',
       anchor='middle'
   )

   chart = alt.Chart(yearly_counts).mark_bar(
       cornerRadiusTopLeft=3,
       cornerRadiusTopRight=3
   ).encode(
       x=alt.X('Year:O',
               axis=alt.Axis(labelAngle=-45, title='Year')),
       y=alt.Y('Count:Q',
               axis=alt.Axis(title='Number of Articles')),
       tooltip=['Year', 'Count']
   ).properties(
       width=800,
       height=400,
       title=title
   ).configure_axis(
       grid=True,
       gridColor='#f0f0f0'
   ).configure_view(
       stroke=None
   )

   return chart

chart = plot_migrant_news_distribution('migrants_news.csv')
chart

In [None]:
chart.save('project_chart6.json')

*CHART 7*

In [None]:
import pandas as pd
import altair as alt
import numpy as np

def plot_regression_analysis():
    # Load and preprocess data
    df_migrants = pd.read_csv('migrants_news.csv')
    df_migrants['Date'] = pd.to_datetime(df_migrants['Date'])
    yearly_migrants = df_migrants.groupby(df_migrants['Date'].dt.year).size().reset_index()
    yearly_migrants.columns = ['year', 'count']

    data = pd.DataFrame({
        'year': list(range(2000, 2023)),
        'news_count': yearly_migrants[yearly_migrants['year'].between(2000, 2022)]['count'],
        'gdp': [11.9, 12.0, 12.05, 12.1, 12.15, 12.2, 12.22, 12.25, 12.3, 12.28, 12.32, 12.35, 12.35, 12.4, 12.4, 12.4, 12.4, 12.4, 12.4, 12.4, 12.35, 12.35, 12.4],
        'populism_score': [0.15, 0.18, 0.17, 0.17, 0.19, 0.19, 0.18, 0.19, 0.19, 0.16, 0.17, 0.20, 0.22, 0.24, 0.25, 0.29, 0.29, 0.30, 0.32, 0.35, 0.37, 0.35, 0.39]
    })

    # Create smoother trend line using more points
    x_range = np.linspace(data['news_count'].min(), data['news_count'].max(), 100)
    y_pred = -1.5738 + (0.00005894 * x_range) + (0.1406 * data['gdp'].mean())
    trend_data = pd.DataFrame({'news_count': x_range, 'predicted_populism': y_pred})

    # Scatter plot
    scatter = alt.Chart(data).mark_circle(size=60).encode(
        x=alt.X('news_count:Q',
                title='Number of Migration News Articles',
                axis=alt.Axis(labelFontSize=12, titleFontSize=14)),
        y=alt.Y('populism_score:Q',
                title='Populism Score',
                axis=alt.Axis(labelFontSize=12, titleFontSize=14, grid=True, gridColor='#f0f0f0')),
        color=alt.Color('gdp:Q',
                        title='Log(GDP)',
                        scale=alt.Scale(scheme='viridis')),
        tooltip=[
            alt.Tooltip('year:Q', title='Year'),
            alt.Tooltip('news_count:Q', title='Migration News Articles'),
            alt.Tooltip('populism_score:Q', title='Populism Score', format='.2f'),
            alt.Tooltip('gdp:Q', title='GDP', format='.2f')
        ]
    )

    # Trend line
    line = alt.Chart(trend_data).mark_line(color='#35978f', strokeDash=[5, 5], strokeWidth=2).encode(
        x=alt.X('news_count:Q'),
        y=alt.Y('predicted_populism:Q')
    )

    # Combine scatter plot and trend line
    chart = (scatter + line).properties(
        width=800,
        height=500,
        title=alt.TitleParams(
            text='Relationship between Migration News Coverage and Populism',
            subtitle='R² = 0.741, controlling for GDP. Data from 2000-2022',
            anchor='start',
            fontSize=16,
            subtitleFontSize=14,
            subtitleColor='#666666'
        )
    ).configure_view(
        stroke=None
    ).configure_legend(
        labelFontSize=12,
        titleFontSize=14
    )

    return chart

chart = plot_regression_analysis()
chart


In [None]:
chart.save('project_chart7.json')

*CHART 8*

In [None]:
import pandas as pd
import altair as alt

# Disable max rows limit for Altair
alt.data_transformers.disable_max_rows()

# Process data for all years
yearly_scores = []
for country in df['country'].unique():
    country_data = df[df['country'] == country]

    # Replace 'present' with '2022' and ensure numeric years
    country_data = country_data.copy()
    country_data['yearend'] = country_data['yearend'].replace('present', '2022')
    country_data['yearend'] = pd.to_numeric(country_data['yearend'])
    country_data['yearbegin'] = pd.to_numeric(country_data['yearbegin'])

    # For each country-year combination
    for year in range(2000, 2023):  # Range from 2000 to 2022
        # Find relevant periods that include this year
        relevant_data = country_data[
            (country_data['yearbegin'] <= year) &
            (country_data['yearend'] >= year)
        ]

        if not relevant_data.empty:
            avg_score = relevant_data['averagerubric'].mean()
            if pd.notna(avg_score):
                yearly_scores.append({
                    'country': country,
                    'year': year,
                    'score': float(avg_score)  # Ensure float type
                })

# Create DataFrame
scores_df = pd.DataFrame(yearly_scores)

# Country codes mapping
country_to_id = {
    'Australia': '036',
    'Austria': '040',
    'Belgium': '056',
    'Canada': '124',
    'Chile': '152',
    'Colombia': '170',
    'Costa Rica': '188',
    'Czech Republic': '203',
    'Denmark': '208',
    'Estonia': '233',
    'Finland': '246',
    'France': '250',
    'Germany': '276',
    'Greece': '300',
    'Hungary': '348',
    'Iceland': '352',
    'Ireland': '372',
    'Israel': '376',
    'Italy': '380',
    'Japan': '392',
    'Korea': '410',
    'Latvia': '428',
    'Lithuania': '440',
    'Luxembourg': '442',
    'Mexico': '484',
    'Netherlands': '528',
    'New Zealand': '554',
    'Norway': '578',
    'Poland': '616',
    'Portugal': '620',
    'Slovak Republic': '703',
    'Slovenia': '705',
    'Spain': '724',
    'Sweden': '752',
    'Switzerland': '756',
    'Turkey': '792',
    'United States': '840'
}

# Convert IDs to integers and add to DataFrame
scores_df['id'] = scores_df['country'].map(country_to_id).astype(int)

# Create year selector
year_param = alt.param(
    name='year',
    value=2022,  # Default value
    bind=alt.binding_range(
        min=2000,
        max=2022,
        step=1,
        name='Select Year: '
    )
)

# Create base map
base = alt.Chart(alt.topo_feature('https://vega.github.io/vega-datasets/data/world-110m.json', 'countries')).mark_geoshape(
    stroke='white',
    strokeWidth=0.5
).encode(
    color=alt.value('#F0F0F0')
).properties(
    width=800,
    height=400
)

# Create choropleth layer
choropleth = alt.Chart(alt.topo_feature('https://vega.github.io/vega-datasets/data/world-110m.json', 'countries')).mark_geoshape(
    stroke='white',
    strokeWidth=0.5
).encode(
    color=alt.Color(
        'score:Q',
        scale=alt.Scale(
            scheme='viridis',
            domain=[0, 2]  # Fixed domain for populism scores
        ),
        title='Populism Score'
    ),
    tooltip=[
        alt.Tooltip('country:N', title='Country'),
        alt.Tooltip('score:Q', title='Populism Score', format='.2f'),
        alt.Tooltip('year:Q', title='Year', format='d')
    ]
).transform_lookup(
    lookup='id',
    from_=alt.LookupData(data=scores_df, key='id', fields=['country', 'score', 'year'])
).transform_filter(
    alt.datum.year == year_param
).add_params(
    year_param
)

# Combine layers
map_chart = (base + choropleth).properties(
    title={
        'text': 'Populist Discourse Scores in OECD Countries',
        'subtitle': ['Use the slider to explore different years (2000-2022)', 'Note: Some countries may have missing data for certain years'],
        'fontSize': 16,
        'subtitleFontSize': 14,
        'subtitleColor': '#666666'
    }
)

# Add source citation
source = alt.Chart(pd.DataFrame({
    'text': ['Source: Hawkins et al. (2019) Global Populism Database, Harvard Dataverse, V2']
})).mark_text(
    fontSize=11,
    color='#666666'
).encode(
    text='text:N'
)

# Create final visualization
final_chart = (
    alt.vconcat(map_chart, source, spacing=5)
    .configure_view(strokeWidth=0)
    .configure(background='#F8F9FA')
    .configure_axis(
        labelFontSize=11,
        titleFontSize=12
    )
    .configure_legend(
        titleFontSize=12,
        labelFontSize=11
    )
)

# Display the chart
final_chart

# final_chart.save('interactive_populism_map.html')

In [None]:
final_chart

In [None]:
final_chart.save('project_chart8.json')

*CHART 9*

In [None]:
import pandas as pd
import altair as alt
import numpy as np

def plot_regression_analysis():
    df_migrants = pd.read_csv('migrants_news.csv')
    df_migrants['Date'] = pd.to_datetime(df_migrants['Date'])
    yearly_migrants = df_migrants.groupby(df_migrants['Date'].dt.year).size().reset_index()
    yearly_migrants.columns = ['year', 'count']

    data = pd.DataFrame({
        'year': list(range(2000, 2023)),
        'news_count': yearly_migrants[yearly_migrants['year'].between(2000, 2022)]['count'],
        'gdp': [11.9, 12.0, 12.05, 12.1, 12.15, 12.2, 12.22, 12.25, 12.3, 12.28, 12.32, 12.35, 12.35, 12.4, 12.4, 12.4, 12.4, 12.4, 12.4, 12.4, 12.35, 12.35, 12.4],
        'populism_score': [0.15, 0.18, 0.17, 0.17, 0.19, 0.19, 0.18, 0.19, 0.19, 0.16, 0.17, 0.20, 0.22, 0.24, 0.25, 0.29, 0.29, 0.30, 0.32, 0.35, 0.37, 0.35, 0.39]
    })

    # Create smoother trend line using more points
    x_range = np.linspace(data['news_count'].min(), data['news_count'].max(), 100)
    y_pred = -1.5738 + (0.00005894 * x_range) + (0.1406 * data['gdp'].mean())
    trend_data = pd.DataFrame({'news_count': x_range, 'predicted_populism': y_pred})

    # Create checkbox parameter
    zero_param = alt.param(
        name='Start_at_zero',
        value=False,
        bind=alt.binding_checkbox(name='Start Y-axis at Zero ')
    )

    # Create base chart with parameter
    scatter = alt.Chart(data).mark_circle(size=60).encode(
        x=alt.X('news_count', title='Number of Migration News Articles', axis=alt.Axis(labelFontSize=12, titleFontSize=14)),
        y=alt.Y('populism_score', title='Populism Score', scale=alt.Scale(zero=zero_param), axis=alt.Axis(labelFontSize=12, titleFontSize=14, grid=True, gridColor='#f0f0f0')),
        color=alt.Color('gdp', title='Log(GDP)', scale=alt.Scale(scheme='viridis')),
        tooltip=['year', 'news_count', 'populism_score', 'gdp']
    ).add_params(zero_param)

    line = alt.Chart(trend_data).mark_line(color='#35978f', strokeDash=[5, 5], strokeWidth=2).encode(
        x='news_count',
        y=alt.Y('predicted_populism', scale=alt.Scale(zero=zero_param))
    )

    # Combine scatter plot and trend line
    return (scatter + line).properties(
        width=800,
        height=500,
        title=alt.TitleParams(
            text='Relationship between Migration News Coverage and Populism',
            subtitle='R² = 0.741, controlling for GDP. Data from 2000-2022',
            anchor='start',
            fontSize=16,
            subtitleFontSize=14,
            subtitleColor='#666666'
        )
    ).configure_view(
        stroke=None
    ).configure_legend(
        labelFontSize=12,
        titleFontSize=14
    )

# Create and display the chart
chart = plot_regression_analysis()
chart


In [None]:
chart.save('project_chart9.json')