# Linear Regression

## Import libraries and data

In [98]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

import seaborn as sns
from seaborn import regplot

import statsmodels.formula.api as smf
import statsmodels.api as sm
import statsmodels.stats.multicomp as multi

import scipy
from scipy import stats
from scipy.stats import pearsonr

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

import researchpy as rp

import plotly.express as px

In [99]:
from gapminder import gapminder

In [103]:
# Bind the packaged gapminder DataFrame to a short working name.
df = gapminder

In [105]:
# Switch the camelCase column names to snake_case. rename() returns a new
# DataFrame, so `df` is rebound to the renamed copy.
df = df.rename(columns={'lifeExp':'life_exp', 'pop':'population', 'gdpPercap':'gdp_per_cap'})

In [107]:
df.head()

Unnamed: 0,country,continent,year,life_exp,population,gdp_per_cap
0,Afghanistan,Asia,1952,28.801,8425333,779.445314
1,Afghanistan,Asia,1957,30.332,9240934,820.85303
2,Afghanistan,Asia,1962,31.997,10267083,853.10071
3,Afghanistan,Asia,1967,34.02,11537966,836.197138
4,Afghanistan,Asia,1972,36.088,13079460,739.981106


## Implement linear regression model

In [110]:
# Predictors: every column except the two text labels and the target,
# which leaves year, life_exp and population.
X = df.drop(columns=['country', 'continent', 'gdp_per_cap'])

In [111]:
# Target: GDP per capita.
Y = df.gdp_per_cap

### statsmodels

#### With a constant (intercept) term

In [112]:
# sm.OLS does not add an intercept on its own; add_constant supplies the
# `const` column that shows up in the coefficient table of the summary.
X = sm.add_constant(X)

In [113]:
# Argument order is (dependent variable, design matrix).
model_constant = sm.OLS(Y,X)

In [114]:
results_constant = model_constant.fit()

In [115]:
results_constant.summary()

0,1,2,3
Dep. Variable:,gdp_per_cap,R-squared:,0.345
Model:,OLS,Adj. R-squared:,0.344
Method:,Least Squares,F-statistic:,299.1
Date:,"Sat, 25 Jun 2022",Prob (F-statistic):,6.91e-156
Time:,08:55:46,Log-Likelihood:,-17726.0
No. Observations:,1704,AIC:,35460.0
Df Residuals:,1700,BIC:,35480.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,1.304e+04,2.43e+04,0.537,0.591,-3.46e+04,6.06e+04
year,-16.6235,12.469,-1.333,0.183,-41.079,7.832
life_exp,458.2072,16.644,27.529,0.000,425.562,490.853
population,-5.776e-06,1.83e-06,-3.158,0.002,-9.36e-06,-2.19e-06

0,1,2,3
Omnibus:,1946.747,Durbin-Watson:,0.33
Prob(Omnibus):,0.0,Jarque-Bera (JB):,264041.825
Skew:,5.623,Prob(JB):,0.0
Kurtosis:,62.937,Cond. No.,13800000000.0


#### Without a constant (intercept) term

In [116]:
# Rebuild the predictors from df so X no longer carries the `const` column
# that was added for the previous model.
X = df.drop(columns=['country', 'continent', 'gdp_per_cap'])

In [117]:
Y = df.gdp_per_cap

In [118]:
# NOTE: with no constant in X, the summary reports the *uncentered*
# R-squared. It is not comparable with the centered R-squared of the model
# that includes a constant, so a higher value here does not mean a better fit.
model_no_constant = sm.OLS(Y,X)

In [119]:
results_no_constant = model_no_constant.fit()

In [120]:
results_no_constant.summary()

0,1,2,3
Dep. Variable:,gdp_per_cap,R-squared (uncentered):,0.574
Model:,OLS,Adj. R-squared (uncentered):,0.573
Method:,Least Squares,F-statistic:,763.4
Date:,"Sat, 25 Jun 2022",Prob (F-statistic):,2.36e-314
Time:,08:56:31,Log-Likelihood:,-17726.0
No. Observations:,1704,AIC:,35460.0
Df Residuals:,1701,BIC:,35480.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
year,-9.9277,0.468,-21.206,0.000,-10.846,-9.010
life_exp,454.6355,15.257,29.799,0.000,424.712,484.559
population,-5.835e-06,1.83e-06,-3.197,0.001,-9.42e-06,-2.25e-06

0,1,2,3
Omnibus:,1949.386,Durbin-Watson:,0.329
Prob(Omnibus):,0.0,Jarque-Bera (JB):,266428.797
Skew:,5.634,Prob(JB):,0.0
Kurtosis:,63.213,Cond. No.,8700000.0


### sklearn

In [121]:
# Instantiate the estimator with default settings. LinearRegression fits its
# own intercept by default, so X (currently without a `const` column) is
# passed as is.
model = LinearRegression()

In [122]:
# Fit on the full dataset; no train/test split has been made at this point.
# fit() returns the estimator itself, so `results` and `model` refer to the
# same object.
results = model.fit(X, Y)

In [123]:
# Slopes, in the column order of X: year, life_exp, population.
results.coef_

array([-1.66235077e+01,  4.58207190e+02, -5.77607022e-06])

In [124]:
# Fitted intercept; together with coef_ it reproduces the statsmodels
# fit that includes a constant.
results.intercept_

13040.923516607041

In [125]:
# Constructor settings (hyperparameters) of the estimator, not fitted values.
results.get_params()

{'copy_X': True,
 'fit_intercept': True,
 'n_jobs': None,
 'normalize': False,
 'positive': False}

In [None]:
# Hold out 20% of the rows so the metrics below are computed on data the
# estimator has not seen (x_test / y_test were never defined before).
# A separate estimator is fitted on the training part so that the full-data
# `model` / `results` objects shown above are left untouched.
# NOTE(review): a purely random split puts rows of the same country in both
# parts; split by country or by year if that leakage matters.
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)
model_holdout = LinearRegression().fit(x_train, y_train)
y_prediction = model_holdout.predict(x_test)

#### Evaluate the held-out predictions (R^2, MSE, RMSE)
score = r2_score(y_test, y_prediction)
mse = mean_squared_error(y_test, y_prediction)
rmse = np.sqrt(mse)

# Only the last bare expression of a notebook cell is displayed, so print
# all three metrics explicitly instead of leaving them as bare expressions.
print(f"R^2:  {score:.3f}")
print(f"MSE:  {mse:.3f}")
print(f"RMSE: {rmse:.3f}")

### Visualize correlation

In [126]:
# One scatter plot per numeric predictor against the target (gdp_per_cap),
# to eyeball how linear each relationship is.
fig = px.scatter(df, x='year', y='gdp_per_cap')
fig.show()

In [127]:
fig = px.scatter(df, x='life_exp', y='gdp_per_cap')
fig.show()

In [128]:
fig = px.scatter(df, x='population', y='gdp_per_cap')
fig.show()

### Include dummy variables for continent

In [135]:
# Counts are rows (one per country and year), not distinct countries.
df.continent.value_counts()

Africa      624
Asia        396
Europe      360
Americas    300
Oceania      24
Name: continent, dtype: int64

In [141]:
# Drop `country` so that `continent` is the only text column left to encode.
df_dummy = df.drop(columns=['country'])

In [142]:
# One-hot encode `continent`.
# NOTE(review): all five levels are kept, so the dummy columns sum to 1 in
# every row. If a constant is later added to the design matrix this causes
# perfect collinearity; consider pd.get_dummies(df_dummy, drop_first=True).
df_dummy = pd.get_dummies(df_dummy)

In [143]:
df_dummy.head()

Unnamed: 0,year,life_exp,population,gdp_per_cap,continent_Africa,continent_Americas,continent_Asia,continent_Europe,continent_Oceania
0,1952,28.801,8425333,779.445314,0,0,1,0,0
1,1957,30.332,9240934,820.85303,0,0,1,0,0
2,1962,31.997,10267083,853.10071,0,0,1,0,0
3,1967,34.02,11537966,836.197138,0,0,1,0,0
4,1972,36.088,13079460,739.981106,0,0,1,0,0


In [147]:
# Predictors: year, life_exp, population and the five continent dummies.
X_dummy = df_dummy.drop(columns=['gdp_per_cap'])

In [148]:
Y_dummy = df_dummy.gdp_per_cap

In [149]:
# NOTE(review): dummy-variable trap. The five continent dummies already sum
# to 1 in every row, so adding a constant makes the design matrix perfectly
# collinear. The summary shows it: Cond. No. is about 5.55e+23 and Df Model
# is 7 rather than 8. The individual continent coefficients are therefore
# not identifiable. Drop one continent level (e.g. drop_first=True in
# get_dummies) or leave out the constant.
X_dummy = sm.add_constant(X_dummy)

In [161]:
# Rebinds `model_constant` / `results_constant`; the objects from the first
# model (without dummies) are no longer reachable under these names.
model_constant = sm.OLS(Y_dummy,X_dummy)

In [151]:
results_constant = model_constant.fit()

In [152]:
results_constant.summary()

0,1,2,3
Dep. Variable:,gdp_per_cap,R-squared:,0.369
Model:,OLS,Adj. R-squared:,0.366
Method:,Least Squares,F-statistic:,141.4
Date:,"Sat, 25 Jun 2022",Prob (F-statistic):,2.66e-164
Time:,09:06:14,Log-Likelihood:,-17696.0
No. Observations:,1704,AIC:,35410.0
Df Residuals:,1696,BIC:,35450.0
Df Model:,7,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-2.292e+04,2.22e+04,-1.034,0.301,-6.64e+04,2.06e+04
year,6.5271,13.939,0.468,0.640,-20.812,33.866
life_exp,388.1655,26.037,14.908,0.000,337.097,439.234
population,-6.416e-06,1.86e-06,-3.454,0.001,-1.01e-05,-2.77e-06
continent_Africa,-6707.6449,4688.123,-1.431,0.153,-1.59e+04,2487.469
continent_Americas,-7802.1451,4450.063,-1.753,0.080,-1.65e+04,926.046
continent_Asia,-4915.8884,4509.084,-1.090,0.276,-1.38e+04,3928.066
continent_Europe,-3328.0795,4345.070,-0.766,0.444,-1.19e+04,5194.183
continent_Oceania,-169.5052,4494.343,-0.038,0.970,-8984.547,8645.536

0,1,2,3
Omnibus:,1996.307,Durbin-Watson:,0.332
Prob(Omnibus):,0.0,Jarque-Bera (JB):,299082.688
Skew:,5.848,Prob(JB):,0.0
Kurtosis:,66.841,Cond. No.,5.55e+23


### Include dummy variables for country

In [153]:
# One-hot encode every text column of df: this creates country_* AND
# continent_* indicator columns, one per distinct value.
df_dummy_country = pd.get_dummies(df)

In [156]:
df_dummy_country.head()

Unnamed: 0,year,life_exp,population,gdp_per_cap,country_Afghanistan,country_Albania,country_Algeria,country_Angola,country_Argentina,country_Australia,...,country_Vietnam,country_West Bank and Gaza,"country_Yemen, Rep.",country_Zambia,country_Zimbabwe,continent_Africa,continent_Americas,continent_Asia,continent_Europe,continent_Oceania
0,1952,28.801,8425333,779.445314,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,1957,30.332,9240934,820.85303,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,1962,31.997,10267083,853.10071,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,1967,34.02,11537966,836.197138,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,1972,36.088,13079460,739.981106,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [158]:
# Keep only the country indicators; presumably the continent dummies are
# dropped because continent is implied by country (TODO confirm intent).
df_dummy_country = df_dummy_country.drop(columns=['continent_Africa', 'continent_Americas', 'continent_Asia', 'continent_Europe', 'continent_Oceania'])

In [159]:
X_dummy_country = df_dummy_country.drop(columns=['gdp_per_cap'])

In [160]:
Y_dummy_country = df_dummy_country.gdp_per_cap

In [162]:
# NOTE: despite the reused name `model_constant`, no explicit constant is
# added here. The full set of country dummies sums to 1 in every row, so it
# already plays the role of the intercept (one level per country), which is
# consistent with the summary reporting a plain, not uncentered, R-squared.
# This also overwrites the continent-dummy model held under the same name.
model_constant = sm.OLS(Y_dummy_country, X_dummy_country)

In [163]:
results_constant = model_constant.fit()

In [164]:
results_constant.summary()

0,1,2,3
Dep. Variable:,gdp_per_cap,R-squared:,0.791
Model:,OLS,Adj. R-squared:,0.771
Method:,Least Squares,F-statistic:,40.89
Date:,"Sat, 25 Jun 2022",Prob (F-statistic):,0.0
Time:,09:11:52,Log-Likelihood:,-16755.0
No. Observations:,1704,AIC:,33800.0
Df Residuals:,1559,BIC:,34590.0
Df Model:,144,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
year,176.9106,12.709,13.921,0.000,151.983,201.838
life_exp,-137.6640,33.705,-4.084,0.000,-203.775,-71.553
population,-4.467e-06,3.87e-06,-1.153,0.249,-1.21e-05,3.13e-06
country_Afghanistan,-3.442e+05,2.41e+04,-14.256,0.000,-3.92e+05,-2.97e+05
country_Albania,-3.375e+05,2.33e+04,-14.490,0.000,-3.83e+05,-2.92e+05
country_Algeria,-3.376e+05,2.35e+04,-14.335,0.000,-3.84e+05,-2.91e+05
country_Angola,-3.413e+05,2.41e+04,-14.145,0.000,-3.89e+05,-2.94e+05
country_Argentina,-3.316e+05,2.33e+04,-14.249,0.000,-3.77e+05,-2.86e+05
country_Australia,-3.199e+05,2.31e+04,-13.835,0.000,-3.65e+05,-2.75e+05

0,1,2,3
Omnibus:,853.79,Durbin-Watson:,0.557
Prob(Omnibus):,0.0,Jarque-Bera (JB):,75193.195
Skew:,1.43,Prob(JB):,0.0
Kurtosis:,35.417,Cond. No.,270000000000.0
