# Exercise 1
Assume that: 
- $\mu^{\text{blue}}_m$ = 25.0. 
- $\sigma^{\text{blue}}_m$ = 3.0.
- $\mu^{\text{blue}}_s$ = 5.0. 
- $\sigma^{\text{blue}}_s$ = 5.0. 
- $\mu^{\text{red}}_m$ = 30.0. 
- $\sigma^{\text{red}}_m$ = 3.0. 
- $\mu^{\text{red}}_s$ = 3.5. 
- $\sigma^{\text{red}}_s$ = 2.5. 
- $\sigma_u$ = 0.5.
- $\bar{p}$ = 155.
- $p_m$ = 13.5.
- $p_s$ = 6.5.

With these parameter values:
1. Create a DataFrame that contains all synthetic data.
2. Plot histograms for each variable/type.
3. Plot correlation between wages and its determinants.

In [None]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
# Modifies the default size of plots ([6.0,4.0] inches) so figures are larger
FIGURE_SIZE = [12.0, 6.0]  # [width, height] in inches
matplotlib.rcParams['figure.figsize'] = FIGURE_SIZE

In [None]:
# Define parameters (values taken from the exercise statement)
mu_m_b = 25.0     # mean of Memory, blue type
sigma_m_b = 3.0   # standard deviation of Memory, blue type
mu_s_b = 5.0      # mean of Speed, blue type
sigma_s_b = 5.0   # standard deviation of Speed, blue type
mu_m_r = 30.0     # mean of Memory, red type
sigma_m_r = 3.0   # standard deviation of Memory, red type
mu_s_r = 3.5      # mean of Speed, red type
sigma_s_r = 2.5   # standard deviation of Speed, red type
sigma_u = 0.5     # standard deviation of Match Quality (mean 0, both types)
p_bar = 155       # intercept of the wage equation
p_m = 13.5        # price of Memory in the wage equation
p_s = 6.5         # price of Speed in the wage equation
N = 5000          # number of simulated observations per type


def _simulate_type(mu_m, sigma_m, mu_s, sigma_s, type_code, type_name, n=N):
    """Return a DataFrame with n synthetic observations of one type.

    Memory and Speed are drawn from normal distributions with the given
    means and standard deviations; Match Quality is drawn from a normal
    distribution with mean 0 and standard deviation sigma_u. The columns
    'Type' and 'Type Name' hold type_code and type_name on every row.

    The draws use NumPy's global random state and are made in the order
    Memory, Speed, Match Quality.
    """
    return pd.DataFrame({
        'Memory': np.random.normal(mu_m, sigma_m, n),
        'Speed': np.random.normal(mu_s, sigma_s, n),
        'Match Quality': np.random.normal(0, sigma_u, n),
        'Type': type_code,
        'Type Name': type_name,
    })


# Create one DataFrame per type (blue is drawn first, then red)
dfblue = _simulate_type(mu_m_b, sigma_m_b, mu_s_b, sigma_s_b, 0, 'Blue')
dfred = _simulate_type(mu_m_r, sigma_m_r, mu_s_r, sigma_s_r, 1, 'Red')

# Concatenate DataFrames; ignore_index gives a single 0..2N-1 row index
df = pd.concat([dfblue, dfred], ignore_index=True)

# Wage equation: intercept + priced skills + match-quality shock
df['Wage'] = p_bar + p_m*df['Memory'] + p_s*df['Speed'] + df['Match Quality']
df

In [None]:
# Histogram of Memory, drawn separately for each value of 'Type Name'
memory_hist_options = dict(column='Memory', by='Type Name', bins=100)
df.hist(**memory_hist_options)
plt.show()

In [None]:
# Histogram of Speed, drawn separately for each value of 'Type Name'
speed_hist_options = dict(column='Speed', by='Type Name', bins=100)
df.hist(**speed_hist_options)
plt.show()

In [None]:
# Histogram of Wage, drawn separately for each value of 'Type Name'
wage_hist_options = dict(column='Wage', by='Type Name', bins=100)
df.hist(**wage_hist_options)
plt.show()

In [None]:
# Correlation between Memory and Wage: scatter plot with a fitted straight line
x = df['Memory']
y = df['Wage']
df.plot(x = 'Memory', y = 'Wage', kind = 'scatter')

# Least-squares fit of a degree-1 polynomial (slope and intercept)
z = np.polyfit(x, y, 1)
p = np.poly1d(z)

# A straight line is fully determined by two points, so evaluate the fit only
# at the extremes of x. Plotting it through every unsorted observation makes
# the path retrace itself, which smears the dashed "r--" pattern into a solid
# stroke and draws thousands of redundant segments.
x_line = np.array([x.min(), x.max()])
plt.plot(x_line, p(x_line), "r--")
plt.show()

In [None]:
# Correlation between Speed and Wage: scatter plot with a fitted straight line
x = df['Speed']
y = df['Wage']
df.plot(x = 'Speed', y = 'Wage', kind = 'scatter')

# Least-squares fit of a degree-1 polynomial (slope and intercept)
z = np.polyfit(x, y, 1)
p = np.poly1d(z)

# A straight line is fully determined by two points, so evaluate the fit only
# at the extremes of x. Plotting it through every unsorted observation makes
# the path retrace itself, which smears the dashed "r--" pattern into a solid
# stroke and draws thousands of redundant segments.
x_line = np.array([x.min(), x.max()])
plt.plot(x_line, p(x_line), "r--")
plt.show()

In [None]:
# Correlation between Match Quality and Wage: scatter plot with a fitted straight line
x = df['Match Quality']
y = df['Wage']
df.plot(x = 'Match Quality', y = 'Wage', kind = 'scatter')

# Least-squares fit of a degree-1 polynomial (slope and intercept)
z = np.polyfit(x, y, 1)
p = np.poly1d(z)

# A straight line is fully determined by two points, so evaluate the fit only
# at the extremes of x. Plotting it through every unsorted observation makes
# the path retrace itself, which smears the dashed "r--" pattern into a solid
# stroke and draws thousands of redundant segments.
x_line = np.array([x.min(), x.max()])
plt.plot(x_line, p(x_line), "r--")
plt.show()

# Exercise 2
Use the module [statsmodels](https://www.statsmodels.org/stable/index.html) to estimate the prices.

In [None]:
import statsmodels.api as sm

# sm.OLS does not include an intercept by default, so add a column of ones
df['constant'] = 1

# Regress Wage on the intercept, Memory and Speed
price_regressors = ['constant', 'Memory', 'Speed']
MyModel = sm.OLS(endog=df['Wage'], exog=df[price_regressors])

# Fit the model and print the regression table
results = MyModel.fit()
print(results.summary())

# Exercise 3
Unconditional differences in wages between groups.

In [None]:
# Summary statistics of Wage within each 'Type Name' group
wage_by_type = df[['Wage', 'Type Name']].groupby('Type Name')
wage_by_type.describe()

Model with omitted variables.

In [None]:
# Regression that leaves out Speed and includes the 0/1 Type indicator instead.
# Reuses the 'constant' column already present in df as the intercept.
omitted_regressors = ['constant', 'Memory', 'Type']
Omitted = sm.OLS(endog=df['Wage'], exog=df[omitted_regressors])

# Fit the model and print the regression table
results = Omitted.fit()
print(results.summary())

# Exercise 4
Load data from Acemoglu, Johnson, and Robinson (AER-2001) and:
1. Bar plot their GDP measure (`logpgp95`) and their measure of institutional quality (`avexpr`).
2. Illustrate the correlation between these two variables.

In [None]:
from google.colab import drive

# Mount Google Drive inside the Colab runtime so files under it can be read
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

In [None]:
# Folder on the mounted drive that holds the course material (ends with '/')
MyFolder = '/content/drive/My Drive/Colab Notebooks/2021_2022_Programming_for_Statistical_Analysis/'
# Full path of the Stata data file inside that folder
MyFile = ''.join((MyFolder, 'Data_AJR_2001.dta'))

In [None]:
# Load the Stata file and use the country abbreviation ('shortnam') as index.
# set_index moves the column into the index and removes it from the columns
# in a single step, replacing the manual index assignment plus drop.
dfAJR = pd.read_stata(MyFile).set_index('shortnam')
dfAJR

In [None]:
# Bar plot a sample of N countries
N = 25
# random_state is fixed so the same countries are sampled on every run
aux_dfAJR = dfAJR.sample(n = N, random_state = 1)


# Bar plot GDP, sorted from highest to lowest; countries without a GDP value
# are dropped and therefore get no bar
gdp_sample = aux_dfAJR.sort_values('logpgp95', ascending=False)['logpgp95'].dropna()
gdp_sample.plot(kind='bar')
plt.ylabel('Log of per capita GDP in 1995')
plt.xlabel('Countries')
# Report the number of bars actually drawn: dropna() can leave fewer than N
plt.title('Sample of {} Countries'.format(len(gdp_sample)))
plt.show()

In [None]:
# Bar plot Expropriation Index, sorted from highest to lowest; countries
# without an index value are dropped and therefore get no bar
avexpr_sample = aux_dfAJR.sort_values('avexpr', ascending=False)['avexpr'].dropna()
avexpr_sample.plot(kind='bar')
plt.ylabel('Index of protection against expropriation (Average 1985-95)')
plt.xlabel('Countries')
# Report the number of bars actually drawn: dropna() can leave fewer
# countries than were sampled
plt.title('Sample of {} Countries'.format(len(avexpr_sample)))
plt.show()

In [None]:
# Correlation between GDP and Expropriation Index: scatter plot with fitted line.
# Keep only countries where both variables are observed, since missing values
# would corrupt the least-squares fit. A dedicated name is used so that the
# sampled frame `aux_dfAJR` used by the bar plots is not overwritten;
# otherwise re-running those cells afterwards would plot every country.
complete_dfAJR = dfAJR.dropna(subset=['logpgp95', 'avexpr'])
x = complete_dfAJR['avexpr']
y = complete_dfAJR['logpgp95']
complete_dfAJR.plot(x = 'avexpr', y = 'logpgp95', kind = 'scatter')

# Least-squares fit of a degree-1 polynomial (slope and intercept)
z = np.polyfit(x, y, 1)
p = np.poly1d(z)

# Two points define the line; plotting through all unsorted observations
# retraces the path and smears the dashed "r--" pattern into a solid stroke.
x_line = np.array([x.min(), x.max()])
plt.plot(x_line, p(x_line), "r--")
plt.xlabel('Index of protection against expropriation (Average 1985-95)')
plt.ylabel('Log of per capita GDP in 1995')
plt.show()

In [None]:
# Column of ones that serves as the regression intercept
dfAJR['constant'] = 1

# Regress logpgp95 on the intercept and avexpr; missing='drop' discards
# observations that contain NaN before fitting
ajr_regressors = ['constant', 'avexpr']
MyModel = sm.OLS(endog=dfAJR['logpgp95'],
                 exog=dfAJR[ajr_regressors],
                 missing='drop')

# Fit the model and print the regression table
results = MyModel.fit()
print(results.summary())