In [64]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import f_regression
from sklearn.preprocessing import StandardScaler

In [65]:
df = pd.read_csv('Country_Dataset.csv')
df.head()

Unnamed: 0,Country,child_mort,exports,health,imports,income,inflation,life_expec,total_fer,gdpp
0,Afghanistan,90.2,10.0,7.58,44.9,1610,9.44,56.2,5.82,553
1,Albania,16.6,28.0,6.55,48.6,9930,4.49,76.3,1.65,4090
2,Algeria,27.3,38.4,4.17,31.4,12900,16.1,76.5,2.89,4460
3,Angola,119.0,62.3,2.85,42.9,5900,22.4,60.1,6.16,3530
4,Antigua and Barbuda,10.3,45.5,6.03,58.9,19100,1.44,76.8,2.13,12200


In [66]:
df.shape

(167, 10)

In [67]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 167 entries, 0 to 166
Data columns (total 10 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Country     167 non-null    object 
 1   child_mort  167 non-null    float64
 2   exports     167 non-null    float64
 3   health      167 non-null    float64
 4   imports     167 non-null    float64
 5   income      167 non-null    int64  
 6   inflation   167 non-null    float64
 7   life_expec  167 non-null    float64
 8   total_fer   167 non-null    float64
 9   gdpp        167 non-null    int64  
dtypes: float64(7), int64(2), object(1)
memory usage: 13.2+ KB


In [68]:
df.isnull().sum()

Country       0
child_mort    0
exports       0
health        0
imports       0
income        0
inflation     0
life_expec    0
total_fer     0
gdpp          0
dtype: int64

In [69]:
df.describe()

Unnamed: 0,child_mort,exports,health,imports,income,inflation,life_expec,total_fer,gdpp
count,167.0,167.0,167.0,167.0,167.0,167.0,167.0,167.0,167.0
mean,38.27006,41.108976,6.815689,46.890215,17144.688623,7.781832,70.555689,2.947964,12964.155689
std,40.328931,27.41201,2.746837,24.209589,19278.067698,10.570704,8.893172,1.513848,18328.704809
min,2.6,0.109,1.81,0.0659,609.0,-4.21,32.1,1.15,231.0
25%,8.25,23.8,4.92,30.2,3355.0,1.81,65.3,1.795,1330.0
50%,19.3,35.0,6.32,43.3,9960.0,5.39,73.1,2.41,4660.0
75%,62.1,51.35,8.6,58.75,22800.0,10.75,76.8,3.88,14050.0
max,208.0,200.0,17.9,174.0,125000.0,104.0,82.8,7.49,105000.0


In [70]:
df.duplicated().sum()

0

In [71]:
data = df.copy()

In [72]:
X = data.drop(columns = ['income', 'Country'], axis = 1)
y = data['income']

In [73]:
scaler = StandardScaler()

In [74]:
X_scaled = scaler.fit_transform(X)

In [75]:
model1 = LinearRegression()

In [76]:
model1.fit(X_scaled, y)

In [77]:
model1.coef_

array([-1310.66232692,  4484.84999533, -2448.5513176 , -2771.26703765,
         514.43735949,  -534.49853753, -1479.59125908, 15630.97948175])

In [78]:
model1.intercept_

17144.688622754493

In [79]:
model1.score(X_scaled, y)

0.8681707347392348

In [80]:
X.columns

Index(['child_mort', 'exports', 'health', 'imports', 'inflation', 'life_expec',
       'total_fer', 'gdpp'],
      dtype='object')

In [81]:
f_regression(X_scaled, y)

(array([ 62.55677722,  60.12237203,   2.81776344,   2.50984859,
          3.68265169,  98.78815168,  55.54207449, 668.53621   ]),
 array([3.53880138e-13, 8.71463303e-13, 9.51190927e-02, 1.15051564e-01,
        5.67073938e-02, 1.55038070e-18, 4.88587488e-12, 6.39671300e-60]))

In [82]:
f_regression(X_scaled, y)[1].round(3)

array([0.   , 0.   , 0.095, 0.115, 0.057, 0.   , 0.   , 0.   ])

In [83]:
p_values = f_regression(X_scaled, y)[1].round(3)
p_values

array([0.   , 0.   , 0.095, 0.115, 0.057, 0.   , 0.   , 0.   ])

In [84]:
data_inf = pd.DataFrame(data = X.columns, columns = ['columns'])
data_inf

Unnamed: 0,columns
0,child_mort
1,exports
2,health
3,imports
4,inflation
5,life_expec
6,total_fer
7,gdpp


In [85]:
data_inf['p_values'] = p_values
data_inf['coefficient'] = model1.coef_

In [86]:
data_inf

Unnamed: 0,columns,p_values,coefficient
0,child_mort,0.0,-1310.662327
1,exports,0.0,4484.849995
2,health,0.095,-2448.551318
3,imports,0.115,-2771.267038
4,inflation,0.057,514.437359
5,life_expec,0.0,-534.498538
6,total_fer,0.0,-1479.591259
7,gdpp,0.0,15630.979482


In [87]:
r2 = model1.score(X_scaled, y)
n = X.shape[0]
p = X.shape[1]

In [88]:
adjusted_r2 = 1 - (1 - r2) * (n - 1) / (n - p - 1)
adjusted_r2

0.8614958352323606

In [89]:
X2 = data[['child_mort', 'exports', 'life_expec', 'total_fer', 'gdpp']]

In [90]:
X2_scaled = scaler.fit_transform(X2)

In [91]:
model2 = LinearRegression()

In [92]:
model2.fit(X2_scaled, y)

In [93]:
model2.score(X2_scaled, y)

0.8342736952287163

In [94]:
f_regression(X2_scaled, y)

(array([ 62.55677722,  60.12237203,  98.78815168,  55.54207449,
        668.53621   ]),
 array([3.53880138e-13, 8.71463303e-13, 1.55038070e-18, 4.88587488e-12,
        6.39671300e-60]))

In [95]:
p_values2 = f_regression(X2_scaled, y)[1].round(3)
p_values2

array([0., 0., 0., 0., 0.])

In [96]:
model2.coef_

array([ -443.42357248,  3029.6908854 ,   786.73379884,  -958.32448814,
       14822.26135898])

In [97]:
model2.intercept_

17144.688622754493

In [98]:
r2_2 = model2.score(X2_scaled, y)
n2 = X2.shape[0]
p2 = X2.shape[1]

In [99]:
adjusted_r2_2 = 1 - (1 - r2_2) * (n2 - 1) / (n2 - p2 - 1)
adjusted_r2_2

0.8291269155774342

In [100]:
data_inf2 = pd.DataFrame(data = X2.columns, columns = ['columns'])
data_inf2

Unnamed: 0,columns
0,child_mort
1,exports
2,life_expec
3,total_fer
4,gdpp


In [101]:
data_inf2['coefficient'] = model2.coef_
data_inf2['p_values'] = p_values2

In [102]:
data_inf2

Unnamed: 0,columns,coefficient,p_values
0,child_mort,-443.423572,0.0
1,exports,3029.690885,0.0
2,life_expec,786.733799,0.0
3,total_fer,-958.324488,0.0
4,gdpp,14822.261359,0.0


In [109]:
X2.head()

Unnamed: 0,child_mort,exports,life_expec,total_fer,gdpp
0,90.2,10.0,56.2,5.82,553
1,16.6,28.0,76.3,1.65,4090
2,27.3,38.4,76.5,2.89,4460
3,119.0,62.3,60.1,6.16,3530
4,10.3,45.5,76.8,2.13,12200


In [114]:
y[:5]

0     1610
1     9930
2    12900
3     5900
4    19100
Name: income, dtype: int64

In [116]:
new_data = [[90.2, 10, 56.2, 5.82, 553]]

In [119]:
new_data_ndarray = np.array(new_data)

In [121]:
new_data_scaled = scaler.fit_transform(new_data_ndarray)

In [122]:
model2.predict(new_data_scaled)

array([17144.68862275])

In [123]:
predict = model2.predict(new_data_scaled)

In [126]:
print('The actual income for new data is:', y[0])
print('The predicted value for new data is:', predict.round(0))

The actual income for new data is: 1610
The predicted value for new data is: [17145.]
