# Analysis

## Data Processing

In [1]:
import pandas as pd

# Load the dataset (the path is relative to the notebook's working directory)
data = pd.read_csv('data.csv')

# Explanatory variable and response used by the cells below
x_col = 'GDP per capita'
y_col = 'Score'
X, Y = data[x_col], data[y_col]

FileNotFoundError: [Errno 2] No such file or directory: 'data.csv'

In [None]:
import matplotlib.pyplot as plt

# Scatter plot of the raw relationship between the two variables
plt.close()  # close the current figure, if any, before drawing a new one
plt.scatter(X, Y, s=3)
plt.xlabel('GDP per capita')
plt.ylabel('Score')

## Linear Regression

Using ordinary least squares (OLS) linear regression

In [None]:
# scikit-learn expects a 2-D feature matrix, so turn the 1-D series into a single column
X_2d = X.to_numpy()[:, None]

In [None]:
from sklearn.linear_model import LinearRegression

# Fit the linear model (fit() returns the estimator itself) and keep the in-sample predictions
lin_reg = LinearRegression().fit(X_2d, Y)
preds = lin_reg.predict(X_2d)

### Analysis Results

In [None]:
# Fitted coefficient(s); X_2d has a single column, so this is the slope for GDP per capita
lin_reg.coef_

In [None]:
from sklearn.metrics import mean_squared_error
import numpy as np

# Root-mean-squared error of the in-sample predictions
rmse = np.sqrt(mean_squared_error(Y, preds))
rmse

## Correlation

Using the Pearson correlation method

In [None]:
from scipy.stats import pearsonr 

# pearsonr returns the correlation coefficient and a p-value; only the coefficient is displayed
corr, a = pearsonr(X, Y)
corr

By Cohen's standard, the correlation shows a *large association*.

## Outlier Detection

In [None]:
import statsmodels.api as sm

# statsmodels' OLS does not add an intercept on its own; without a constant column
# the model is forced through the origin, which disagrees with the scikit-learn fit
# above and distorts the residuals and influence measures derived from `results`.
X_with_const = sm.add_constant(X)
model = sm.OLS(Y, X_with_const)
results = model.fit()
results.summary()

Measuring influence

In [None]:
from statsmodels.stats.outliers_influence import OLSInfluence

# Influence diagnostics computed from the fitted OLS results
influence = OLSInfluence(results)
# Internally studentized residuals, used by later cells to look for points far from the fitted line
sresiduals = influence.resid_studentized_internal

In [None]:
# Point farthest away from the line: the largest studentized residual in absolute value.
# (Taking the minimum alone would only find the point farthest *below* the line and
# would miss a larger deviation above it.)
outlier = data.loc[sresiduals.abs().idxmax(), :]
outlier

In [None]:
import plotly.express as px

# Build an augmented copy of the data so the original frame stays untouched
data_copy = data.assign(
    Influence=influence.hat_matrix_diag,             # hat-matrix diagonal
    Sresidual=influence.resid_studentized_internal,  # internally studentized residual
    Distance=influence.cooks_distance[0],            # first element of the cooks_distance result
)

# Hat-matrix diagonal against studentized residual, one point per row
px.scatter(
    data_copy,
    x='Influence',
    y='Sresidual',
    hover_data=['Country or region', 'GDP per capita', 'Score', 'Influence', 'Sresidual'],
)

In [None]:
# Original scatter, with marker size driven by the 'Distance' column
px.scatter(
    data_copy,
    x='GDP per capita',
    y='Score',
    size='Distance',
    hover_data=['Country or region', 'GDP per capita', 'Score'],
)

In [None]:
# 'Distance' plotted against the 'Overall rank' column
px.line(
    data_copy,
    x='Overall rank',
    y='Distance',
    title="Cook's Distance",
    hover_data=['Country or region', 'GDP per capita', 'Score'],
)

In [None]:
# Rows ordered from the largest to the smallest 'Distance' value
data_copy.sort_values('Distance', ascending=False)

In [None]:
# Candidate cutoff: three times the mean of the 'Distance' column
cookd_threshold = 3 * data_copy['Distance'].mean()

In [None]:
# Alternative cutoff of 4 / n. When the cells run in order, this replaces the
# threshold assigned in the previous cell.
n_obs = len(data_copy)
cookd_threshold = 4 / n_obs

In [None]:
# Observations whose 'Distance' exceeds the chosen cutoff
is_influential = data_copy['Distance'] > cookd_threshold
outliers = data_copy.loc[is_influential]
outliers

In [None]:
# Redraw the scatter and overlay the flagged observations as small red crosses
plt.close()  # close the current figure, if any, before drawing a new one
plt.scatter(X, Y, s=3)
plt.xlabel('GDP per capita')
plt.ylabel('Score')
plt.scatter(outliers['GDP per capita'], outliers['Score'], s=4, c='r', marker='x')

In [None]:
# Remove the flagged rows, matching them by index label
data_no_outliers = data_copy.drop(index=outliers.index)

In [None]:
# Refit the regression on the data with the flagged observations removed.
# Note: X, Y, X_2d, lin_reg and preds are rebound here, so cells that run
# afterwards see the filtered data rather than the full dataset.
X = data_no_outliers['GDP per capita']
Y = data_no_outliers['Score']
X_2d = X.to_numpy()[:, None]  # scikit-learn needs a 2-D feature matrix
lin_reg = LinearRegression().fit(X_2d, Y)
preds = lin_reg.predict(X_2d)

In [None]:
# Root-mean-squared error of the current predictions against the current Y
rmse = np.sqrt(mean_squared_error(Y, preds))
rmse

In [None]:
# Pearson correlation on the current X and Y; pearsonr also returns a p-value, which is not displayed
corr, a = pearsonr(X, Y)
corr

## Using studentized residuals for outliers

In [None]:
# Flag observations whose studentized residual exceeds 2 in absolute value,
# listed from the largest residual to the smallest. This rebinds `outliers`.
data_outliers = data_copy.sort_values('Sresidual', ascending=False)
outliers = data_outliers[data_outliers['Sresidual'].abs() > 2]
outliers

In [None]:
plt.close()  # close the current figure, if any, before drawing a new one

# Plot the full dataset: the studentized-residual outliers were selected from
# data_copy, whereas X and Y were rebound to the filtered data in an earlier cell,
# so plotting X/Y could leave labelled outliers without a matching point.
plt.scatter(x=data_copy['GDP per capita'], y=data_copy['Score'], s=3)
plt.xlabel('GDP per capita')
plt.ylabel('Score')

# Mark the outliers with red crosses. (plt.plot on a single point with no marker
# draws nothing, so the outliers are drawn with scatter instead.)
plt.scatter(x=outliers['GDP per capita'], y=outliers['Score'], s=4, c='r', marker='x')

# Label every outlier with its country/region name
for gdp, score, country in zip(
    outliers['GDP per capita'], outliers['Score'], outliers['Country or region']
):
    plt.text(x=gdp, y=score, s=country)