In [1]:
import pandas as pd
import polars as pl
import numpy as np
import scipy.stats as ss
from matplotlib import pyplot as plt

# MSCI World - Analysis of returns distribution
Source: https://www.msci.com/end-of-day-history?chart=regional&priceLevel=0&scope=R&style=C&asOf=May%2003,%202024&currency=15&size=36&indexId=106

The MSCI World Index captures large and mid-cap representation across 23 Developed Markets (DM) countries. With 1,465 constituents, the index covers approximately 85% of the free float-adjusted market capitalization in each country.

In [2]:
# Load the cleaned MSCI World price history (Date, Year, Price columns) from the workbook
df = pl.read_excel(
    "MSCI_World_historyIndex.xls",
    sheet_name="History_Clean",
)
df.head()

Date,Year,Price
str,i64,f64
"""Dec 31, 1969""",1969,100.0
"""Dec 31, 1970""",1970,94.292
"""Dec 31, 1971""",1971,108.993
"""Dec 29, 1972""",1972,130.737
"""Dec 31, 1973""",1973,108.41


In [22]:
# Price variation of each year against the price `lag` years earlier.
# NOTE: with lag = 10 this is a 10-year variation, not a year-on-year one.
lag = 10

# Earlier prices, keyed by the column they are matched on. Projecting and renaming
# up front avoids depending on the join's "_right" suffixes and on dropping
# columns that may not exist (the original dropped a non-existent 'PY_right').
prev_prices = df.select(
    pl.col("Year").alias("PY"),
    pl.col("Price").alias("Prev. Price"),
)

df2 = (
    df.select("Year", "Price")
    .with_columns(PY=pl.col("Year") - lag)
    # inner join: years without a price `lag` years earlier are discarded
    .join(prev_prices, on="PY", how="inner")
    # keep chronological order explicit; the autocorrelation analysis relies on it
    .sort("Year")
    .with_columns(
        Price_var=(pl.col("Price") - pl.col("Prev. Price")) / pl.col("Prev. Price")
    )
)
df2.head()

Year,Price,PY,Prev. Price,Price_var
i64,f64,i64,f64,f64
1979,131.101,1969,100.0,0.31101
1980,159.228,1970,94.292,0.688669
1981,146.62,1971,108.993,0.345224
1982,155.156,1972,130.737,0.18678
1983,183.952,1973,108.41,0.696818


In [23]:
#plot price variation over the years
# The year range shown in the title is read from the data instead of being hard-coded,
# so it stays correct whatever lag was used to build df2.
year = df2.select("Year")
lb = year.to_series().min()  # first year with a computed price variation
ub = year.to_series().max()  # last year with a computed price variation
df2.plot.bar(y="Price_var",x="Year",title=f"MSCI World - Yearly Price variations {lb}-{ub}",ylabel='Price variation')

In [27]:
# Interactive exploration of the price variations with the hvPlot explorer.
# The data is first converted to pandas, and the explorer is opened on that copy.
import hvplot.pandas  # needed for the `.hvplot` accessor used on the pandas frame below

dfp = df2.to_pandas()
# start the explorer with Year on the x axis and the price variation on the y axis
explorer = dfp.hvplot.explorer(x="Year",y="Price_var")
explorer

In [25]:
#plot the distribution of the price variations
# The year range in the title is derived from df2 rather than hard-coded,
# so it always matches the observations actually plotted.
years = df2.get_column("Year")
df2.plot.hist("Price_var", title=f"MSCI World - Price variation distribution {years.min()}-{years.max()}",bins=15)

In [26]:
# Main distribution statistics of the price variation (count, mean, std, quantiles, min/max).
# NOTE(review): the figures depend on the `lag` used to build df2; a saved output with
# count 45 corresponds to lag = 10 (10-year variations), not to year-on-year returns.
descr=df2.select("Price_var").describe()
descr

statistic,Price_var
str,f64
"""count""",45.0
"""null_count""",0.0
"""mean""",1.13861
"""std""",0.866247
"""min""",-0.19977
"""25%""",0.593571
"""50%""",0.907916
"""75%""",1.646114
"""max""",3.327488


In [31]:
# Empirical probability of a loss: share of observations with a negative price variation.
# The denominator is the actual number of observations (it was hard-coded to 45,
# which is only correct for one specific lag / data length).
losses = df2.get_column("Price_var").lt(0)
prob_loss = losses.sum() / losses.len()
prob_loss

0.044444444444444446

In [19]:
# verify autocorrelation
# For each lag k (in rows of df2, i.e. years) correlate the series with itself shifted by k.
lags = range(1,11)
# 1-D array of price variations; also reused by the following cells
price_var = df2.get_column("Price_var").to_numpy()
corr=[]
for lag_years in lags:
    # pair every observation with the one `lag_years` positions later
    correl = np.corrcoef(price_var[:-lag_years], price_var[lag_years:])[0, 1]
    corr.append((lag_years, float(correl)))
# Explicit schema and row orientation: each tuple is one (lag, correlation) record.
# This avoids relying on orientation inference and on auto-generated column_N names.
corr_df=pl.DataFrame(corr, schema=["lag (years)", "correlation"], orient="row")
corr_df.plot.bar(x="lag (years)",y="correlation",title="Autocorrelation",ylim=[-1,1])

In [20]:
# Anderson-Darling normality test on the price variations.
# H0: the sample comes from a normal distribution. H0 is rejected at a given
# significance level when the test statistic exceeds the matching critical value.

norm_fit = ss.anderson(price_var,'norm')
test_stat = norm_fit.statistic
# Pick the 5% critical value by matching the significance level rather than by
# position, so the check does not depend on the ordering of the returned levels.
idx_5p = int(np.flatnonzero(np.isclose(norm_fit.significance_level, 5.0))[0])
critical_5p = norm_fit.critical_values[idx_5p]
print(test_stat)
print("critical values (respective significance levels here below):",norm_fit.critical_values)
print(norm_fit.significance_level)

if test_stat > critical_5p:
    print("Distribution is NOT normal with significance of 5%")
else:
    # Failing to reject H0 does not prove normality, so the message says only that.
    print("Normality cannot be rejected at the 5% significance level")

0.9954005711332243
critical values (respective significance levels here below): [0.527 0.6   0.719 0.839 0.998]
[15.  10.   5.   2.5  1. ]
Distribution is NOT normal with significance of 5%


# Monte Carlo simulation

In [145]:
# set distribution parameters - 1 variable
# The simulation compounds YEARLY yields, so the parameters must come from
# year-on-year price variations. They are derived here directly from `df` with a
# fixed 1-year lag, instead of reusing `price_var`, whose horizon depends on the
# `lag` chosen in the exploratory cells (10 years there).
prev_year_price = df.select(
    (pl.col("Year") + 1).alias("Year"),  # price of year Y-1, keyed by year Y
    pl.col("Price").alias("Prev. Price"),
)
yearly_yield = (
    df.select("Year", "Price")
    .join(prev_year_price, on="Year", how="inner")  # drops the first year (no predecessor)
    .sort("Year")
    .select((pl.col("Price") - pl.col("Prev. Price")) / pl.col("Prev. Price"))
    .to_series()
    .to_numpy()
)

yyield_mean = yearly_yield.mean() # yearly yield mean value
yyield_devstd = yearly_yield.std() # yearly yield std. deviation (numpy default, ddof=0)

yyield_stats = [yyield_mean, yyield_devstd]
print(yyield_stats)

[0.08089952419454931, 0.16909594555286153]


In [146]:
# Simulation set-up: horizon, number of runs and the reproducible pool of uniform draws
n_var = 1  # 1 variable only
n_years = 10  # years of investment horizon
n_runs = 1_000_000  # number of simulated investment paths
seed = 2304920942  # seed of random uniform distribution
tot = n_runs * n_years  # total number of random draws needed

# Draw everything in one go, then lay it out as one row per run and one column per year
random_space = ss.uniform.rvs(size=tot, random_state=seed).reshape(n_runs, n_years)
print(random_space)

[[0.08756659 0.03230368 0.47371698 ... 0.39309551 0.02743384 0.11076262]
 [0.60458343 0.14430441 0.24802156 ... 0.10527448 0.37260705 0.85045187]
 [0.44698196 0.40019284 0.24230134 ... 0.78826443 0.61792918 0.70708538]
 ...
 [0.31281314 0.66440861 0.6493689  ... 0.75861876 0.2542469  0.31707911]
 [0.26391941 0.92638218 0.53669957 ... 0.02018919 0.51382973 0.7676381 ]
 [0.00762797 0.81697598 0.23148259 ... 0.84246908 0.271065   0.68466528]]


In [147]:
# run the simulation
# Inverse-transform sampling: map the uniform draws to normally distributed yearly
# yields with the configured mean / std. deviation, then turn them into growth factors.
# Scalar broadcasting replaces the explicit arrays of ones (no large temporaries).
random_yield = 1 + ss.norm.ppf(random_space,loc=yyield_mean,scale=yyield_devstd)
# total growth factor of each run = product of its yearly growth factors
result = np.prod(random_yield,axis=1)
# annualized yield = geometric mean of the yearly growth factors, minus 1
result_yearly = np.power(result, 1/n_years)-1
result = pl.Series(result)
result_yearly = pl.Series(result_yearly)

In [155]:
#plot the annualized yields obtained over the investment horizon
# Uses the DataFrame `.plot` accessor already relied on by the other charts of this
# notebook. The `.hvplot` accessor on polars objects would need `import hvplot.polars`,
# which is not imported here (only `hvplot.pandas` is).
yield_col = "Annualized yield"
result_yearly.to_frame(yield_col).plot.hist(yield_col, title=f"Monte Carlo - annualized yield after {n_years} years")

In [156]:
# Summary statistics of the simulated annualized yields (`result_yearly`).
# NOTE(review): a saved output with a mean around 2.18 and a "Mean: / Std deviation:"
# layout looks like statistics of `result` (total growth factor) produced by different
# code - re-run this cell to confirm what it displays.
series = result_yearly
series.describe()

Mean:  2.176512174081688
Std deviation:  1.1374101100054184
Median:  1.9432904508686621
Min:  0.10013379506696174
Max:  17.660646272879575


In [157]:
#even more statistics - N.B.: Value of 2 ==> original investment doubles in X years
# `result` holds the total growth factor of each run: < 1 means a loss,
# > 2 / > 3 mean the investment at least doubled / tripled over the horizon.
prob_loss = result.lt(1).sum()/n_runs
prob_2x = result.gt(2).sum()/n_runs
prob_3x = result.gt(3).sum()/n_runs
# Scale to percent BEFORE rounding: round(p, 4) * 100 can print floating-point
# noise (e.g. 7.000000000000001), round(p * 100, 2) cannot.
print("Probability of loss after",n_years,"years:", round(prob_loss*100,2),"%")
print("Probability of 2x the original investment after",n_years,"years:", round(prob_2x*100,2),"%")
print("Probability of 3x the original investment after",n_years,"years:", round(prob_3x*100,2),"%")

Probability of loss after 10 years: 10.4 %
Probability of 2x the original investment after 10 years: 47.79 %
Probability of 3x the original investment after 10 years: 19.16 %
