In [7]:
import pandas as pd
import polars as pl
import numpy as np
import scipy.stats as ss
from matplotlib import pyplot as plt

# MSCI World - Analysis of returns distribution
Source: https://www.msci.com/end-of-day-history?chart=regional&priceLevel=0&scope=R&style=C&asOf=May%2003,%202024&currency=15&size=36&indexId=106

The MSCI World Index captures large and mid-cap representation across 23 Developed Markets (DM) countries. With 1,465 constituents, the index covers approximately 85% of the free float-adjusted market capitalization in each country.

In [8]:
# Load the cleaned yearly index history (one row per year-end close) from the
# "History_Clean" sheet of the MSCI workbook; the path is relative to the notebook.
df = pl.read_excel(
    "MSCI_World_historyIndex_Gross.xls",
    sheet_name="History_Clean",
)
# Quick visual check of the loaded columns (Date, Year, Value)
df.head()

Date,Year,Value
str,i64,f64
"""Dec 31, 1969""",1969,100.0
"""Dec 31, 1970""",1970,98.017
"""Dec 31, 1971""",1971,117.193
"""Dec 29, 1972""",1972,144.79
"""Dec 31, 1973""",1973,123.787


In [11]:
# Verify price variation against previous year
lag = 1  # distance, in years, between an observation and its reference value

# Lookup of the reference values, keyed by the same column name ("PY") that is
# added to the main frame, so the join needs no suffix handling afterwards.
_prev_values = df.select(
    pl.col("Year").alias("PY"),
    pl.col("Value").alias("Prev. Value"),
)

df2 = (
    df.with_columns(PY=pl.col("Year") - lag)
    # inner join: years without a reference value `lag` years earlier are dropped
    .join(_prev_values, on="PY", how="inner")
    # pin the output schema explicitly instead of dropping columns by name
    # (the original drop list named a 'PY_right' column the join never creates)
    .select("Year", "Value", "PY", "Prev. Value")
    # relative variation of the index against the reference year
    .with_columns(
        Price_var=(pl.col("Value") - pl.col("Prev. Value")) / pl.col("Prev. Value")
    )
    # later cells (autocorrelation) rely on chronological order; do not depend on
    # the join preserving the row order of the left frame
    .sort("Year")
)
df2.head()

Year,Value,PY,Prev. Value,Price_var
i64,f64,i64,f64,f64
1970,98.017,1969,100.0,-0.01983
1971,117.193,1970,98.017,0.19564
1972,144.79,1971,117.193,0.235483
1973,123.787,1972,144.79,-0.145058
1974,93.484,1973,123.787,-0.2448


In [14]:
# Plot price variation over the years
year = df2.select("Year")
# First and last year present in the data; used to label the chart so the
# title stays correct when the underlying history is extended.
lb = year.min().to_numpy()[0][0]
ub = year.max().to_numpy()[0][0]
df2.plot.bar(
    y="Price_var",
    x="Year",
    title=f"MSCI World - Yearly Value variations {lb}-{ub}",
    ylabel='Price variation',
)

In [13]:
# Interactive data explorer (exploratory only; the analysis does not depend on it)
# Importing hvplot.pandas is what makes the `.hvplot` accessor available on
# pandas objects; the pandas Series plotted further down rely on it as well.
import hvplot.pandas

# hvplot's explorer works on pandas objects, so convert the polars frame first
dfp = df2.to_pandas()
explorer = dfp.hvplot.explorer(
    x="Year",
    y="Price_var",
)
explorer

In [15]:
# Histogram of the yearly price variations, grouped into 15 bins
df2.plot.hist(
    "Price_var",
    title="MSCI World - Price variation distribution 1970-2023",
    bins=15,
)

In [16]:
# Summary statistics (count, mean, std, min, quartiles, max) of the yearly variation
price_var_only = df2.select(pl.col("Price_var"))
descr = price_var_only.describe()
descr

statistic,Price_var
str,f64
"""count""",54.0
"""null_count""",0.0
"""mean""",0.111799
"""std""",0.175241
"""min""",-0.40334
"""25%""",-0.003209
"""50%""",0.162271
"""75%""",0.232797
"""max""",0.427995


In [19]:
# Verify autocorrelation of the yearly variations for lags of 1..10 years
lags = range(1, 11)

# 1-D array of yearly variations, in the row order of df2
price_var = df2.select(pl.col("Price_var")).to_numpy().ravel()

# np.corrcoef centres each slice on its own mean, so no manual de-meaning is needed.
corr = []
for lag_years in lags:
    # pair every observation with the one `lag_years` rows later
    earlier = price_var[:-lag_years]
    later = price_var[lag_years:]
    correl = np.corrcoef(earlier, later)[0][1]
    corr.append([lag_years, float(correl)])

# Each element of `corr` is one ROW (lag, correlation). State that explicitly:
# without `orient`, polars has to guess whether the inner lists are rows or
# columns, and a column-wise reading would produce a 10-column, 2-row frame.
corr_df = pl.DataFrame(
    corr,
    schema=["lag (years)", "correlation"],
    orient="row",
)
corr_df.plot.bar(x="lag (years)", y="correlation", title="Autocorrelation", ylim=[-1, 1])

In [20]:
# Anderson-Darling normality test on the yearly variations.
# H0: the sample comes from a normal distribution. H0 is rejected at a given
# significance level when the statistic exceeds the matching critical value.

norm_fit = ss.anderson(price_var, 'norm')
test_stat = norm_fit.statistic

# Pick the critical value that belongs to the 5% level by looking it up in
# `significance_level` rather than relying on its position in the array.
idx_5p = np.flatnonzero(np.isclose(norm_fit.significance_level, 5.0))[0]
critical_5p = norm_fit.critical_values[idx_5p]

print(test_stat)
print("critical values (respective significance levels here below):",norm_fit.critical_values)
print(norm_fit.significance_level)

if test_stat > critical_5p:
    print("Distribution is NOT normal with significance of 5%")
else:
    # Failing to reject H0 is not evidence that the data ARE normal.
    print("Normality cannot be rejected at the 5% significance level")

1.0069479938535437
critical values (respective significance levels here below): [0.541 0.616 0.739 0.862 1.025]
[15.  10.   5.   2.5  1. ]
Distribution is NOT normal with significance of 5%


# Monte Carlo simulation

In [23]:
# Distribution parameters for the single simulated variable (yearly yield)
# NOTE: NumPy's std defaults to the population formula (ddof=0), which is why the
# value printed here is slightly below the "std" reported by describe() earlier.
yyield_mean = np.mean(price_var)    # mean yearly yield
yyield_devstd = np.std(price_var)   # std. deviation of the yearly yield

yyield_stats = [yyield_mean, yyield_devstd]
print(yyield_stats)

[0.1117993744626882, 0.1736111220272161]


In [24]:
# Simulation set-up
n_var = 1            # number of simulated variables (only one here)
n_years = 10         # investment horizon, in years
n_runs = 1_000_000   # number of simulated paths
seed = 23049         # seed for the uniform random draws (reproducibility)
tot = n_years * n_runs  # total number of uniform draws needed

# Draw every uniform sample in one flat call, then lay the draws out as one row
# per run and one column per year.
random_space = ss.uniform.rvs(size=tot, random_state=seed).reshape(n_runs, n_years)
print(random_space.shape)

(1000000, 10)


In [25]:
# Run the simulation
# Inverse-CDF sampling: map each uniform draw to a normally distributed yearly
# yield with the mean / std. deviation estimated from the historical data.
random_yield = ss.norm.ppf(random_space, loc=yyield_mean, scale=yyield_devstd)

# Turn yields into yearly growth factors (1 + yield). Scalar broadcasting gives
# the same values as adding a full matrix of ones, without allocating it.
random_yield = 1 + random_yield

# Compound the yearly growth factors along each run (row):
# result == final value of 1 unit invested at the start of the horizon.
result = np.prod(random_yield, axis=1)

# Annualised (geometric mean) yield of each run over the horizon.
# NOTE(review): a drawn yield below -100% would make a growth factor negative and
# this fractional power NaN; not guarded here -- confirm it is negligible for the
# parameters in use.
result_yearly = np.power(result, 1 / n_years) - 1

result = pd.Series(result)
result_yearly = pd.Series(result_yearly)

In [26]:
# Plot the distribution of the simulated end-of-horizon growth factors.
# `result` is the per-run product of (1 + yearly yield) over n_years, so a value
# of 1.0 means the investment ended where it started.
# The `.hvplot` accessor is available because hvplot.pandas was imported in the
# data-explorer cell.

result.hvplot.hist()

In [29]:
# Summary statistics of the annualised yield of each simulated run
# (result_yearly is the n_years-th root of the final growth factor, minus 1)
series = result_yearly
series.describe()

count    1000000.000000
mean           0.099246
std            0.055910
min           -0.183333
25%            0.061603
50%            0.099317
75%            0.136968
max            0.363479
dtype: float64

In [28]:
# Even more statistics - N.B.: a value of 2 ==> the original investment doubles
# over the n_years horizon
# Share of simulated runs that end below / above each threshold.
prob_loss = result.lt(1).sum()/n_runs
prob_2x = result.gt(2).sum()/n_runs
prob_3x = result.gt(3).sum()/n_runs

# Scale to percent BEFORE rounding: rounding first and multiplying by 100
# afterwards re-introduces binary floating-point noise (e.g. 68.89999999999999).
print("Probability of loss after",n_years,"years:", round(prob_loss*100,2),"%")
print("Probability of 2x the original investment after",n_years,"years:", round(prob_2x*100,2),"%")
print("Probability of 3x the original investment after",n_years,"years:", round(prob_3x*100,2),"%")

Probability of loss after 10 years: 3.84 %
Probability of 2x the original investment after 10 years: 68.89999999999999 %
Probability of 3x the original investment after 10 years: 38.190000000000005 %
