## Step 0: Import the data and read it

In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pylab as plt
import seaborn as sns 
import statsmodels.api as sm


In [17]:
plt.style.use('ggplot')
# Show up to 200 columns so wide frames are not truncated with "..." when displayed.
# The full option key is required; the bare 'max_columns' is not a complete option name.
pd.set_option('display.max_columns', 200)

In [3]:
# List the public names exported by statsmodels' datasets package (bundled data sets plus helpers).
print(sm.datasets.__all__)


['anes96', 'cancer', 'committee', 'ccard', 'copper', 'cpunish', 'elnino', 'engel', 'grunfeld', 'interest_inflation', 'longley', 'macrodata', 'modechoice', 'nile', 'randhie', 'scotland', 'spector', 'stackloss', 'star98', 'strikes', 'sunspots', 'fair', 'heart', 'statecrime', 'co2', 'fertility', 'china_smoking', 'get_rdataset', 'get_data_home', 'clear_data_home', 'webuse', 'check_internet', 'test', 'danish_data']


In [10]:
# load_pandas() explicitly requests pandas objects, so .data is a DataFrame
# and the .head()/.columns/.describe() calls below work as expected.
macro_data = sm.datasets.macrodata.load_pandas().data

In [11]:
# Sanity-check the preview size: head() keeps the first 5 rows, so this is (5, n_columns).
# Uses macro_data, the frame actually loaded above (the previous name was never defined).
macro_data.head().shape

(5, 14)

In [12]:
# Print the column labels of the loaded macro data set
# (the previous name, interest_inflation_dt, was never defined in this notebook).
print(macro_data.columns)

Index(['year', 'quarter', 'realgdp', 'realcons', 'realinv', 'realgovt',
       'realdpi', 'cpi', 'm1', 'tbilrate', 'unemp', 'pop', 'infl', 'realint'],
      dtype='object')


# Explanation of Column Abbreviations in the `macrodata` Dataset

Here’s a breakdown of each column abbreviation in the statsmodels `macrodata` dataset loaded above:

1. **`year`**: The year in which the data is recorded.

2. **`quarter`**: The quarter of the year (1 for Q1, 2 for Q2, 3 for Q3, and 4 for Q4).

3. **`realgdp`**: **Real Gross Domestic Product (GDP)** - A measure of a country’s economic output adjusted for inflation.

4. **`realcons`**: **Real Consumption** - Represents the consumption of goods and services by households, adjusted for inflation.

5. **`realinv`**: **Real Investment** - Refers to business investments in capital (e.g., machinery, buildings) adjusted for inflation.

6. **`realgovt`**: **Real Government Expenditure** - Measures government spending on goods and services, adjusted for inflation.

7. **`realdpi`**: **Real Disposable Personal Income** - The income households have after taxes, adjusted for inflation.

8. **`cpi`**: **Consumer Price Index (CPI)** - Measures the average change over time in the prices paid by consumers for a basket of goods and services.

9. **`m1`**: **M1 Money Supply** - Represents the most liquid portions of the money supply, including physical currency and demand deposits.

10. **`tbilrate`**: **Treasury Bill Rate** - The interest rate on short-term U.S. government debt securities.

11. **`unemp`**: **Unemployment Rate** - The percentage of the labor force that is unemployed and actively seeking employment.

12. **`pop`**: **Population** - Indicates the total population of the country or region under study.

13. **`infl`**: **Inflation Rate** - The rate at which the general level of prices for goods and services rises.

14. **`realint`**: **Real Interest Rate** - The nominal interest rate adjusted for inflation.

### Summary
- `realgdp`, `realcons`, `realinv`, `realgovt`, `realdpi`: "Real" means these are inflation-adjusted values.
- `cpi`, `infl`: Both relate to inflation.
- `m1`: Refers to the liquid part of the money supply.
- `tbilrate`: Relates to interest rates on Treasury bills.
- `unemp`: The unemployment rate, a common economic indicator.


## Understand the data
- `DataFrame.shape`
- `head` and `tail`
- `dtypes`
- `describe`

In [13]:
# (number of rows, number of columns) of the full data set.
macro_data.shape

(203, 14)

In [15]:
# Preview the first five rows.
macro_data.head(5)

Unnamed: 0,year,quarter,realgdp,realcons,realinv,realgovt,realdpi,cpi,m1,tbilrate,unemp,pop,infl,realint
0,1959.0,1.0,2710.349,1707.4,286.898,470.045,1886.9,28.98,139.7,2.82,5.8,177.146,0.0,0.0
1,1959.0,2.0,2778.801,1733.7,310.859,481.301,1919.7,29.15,141.7,3.08,5.1,177.83,2.34,0.74
2,1959.0,3.0,2775.488,1751.8,289.226,491.26,1916.4,29.35,140.5,3.82,5.3,178.657,2.74,1.09
3,1959.0,4.0,2785.204,1753.7,299.356,484.052,1931.3,29.37,140.0,4.33,5.6,179.386,0.27,4.06
4,1960.0,1.0,2847.699,1770.5,331.722,462.199,1955.5,29.54,139.6,3.5,5.2,180.007,2.31,1.19


In [18]:
# List every column label in the frame.
macro_data.columns

Index(['year', 'quarter', 'realgdp', 'realcons', 'realinv', 'realgovt',
       'realdpi', 'cpi', 'm1', 'tbilrate', 'unemp', 'pop', 'infl', 'realint'],
      dtype='object')

In [20]:
# Each column is a Series with its own data type; .dtypes lists one dtype per column.
macro_data.dtypes

year        float64
quarter     float64
realgdp     float64
realcons    float64
realinv     float64
realgovt    float64
realdpi     float64
cpi         float64
m1          float64
tbilrate    float64
unemp       float64
pop         float64
infl        float64
realint     float64
dtype: object

In [22]:
# Summary statistics (count, mean, std, min, quartiles, max) for each numeric column.
macro_data.describe()

Unnamed: 0,year,quarter,realgdp,realcons,realinv,realgovt,realdpi,cpi,m1,tbilrate,unemp,pop,infl,realint
count,203.0,203.0,203.0,203.0,203.0,203.0,203.0,203.0,203.0,203.0,203.0,203.0,203.0,203.0
mean,1983.876847,2.492611,7221.171901,4825.293103,1012.863862,663.32864,5310.540887,105.075788,667.927586,5.311773,5.884729,239.724153,3.96133,1.336502
std,14.686817,1.118563,3214.956044,2313.346192,585.102267,140.863655,2423.515977,61.278878,455.346381,2.803071,1.458574,37.39045,3.253216,2.668799
min,1959.0,1.0,2710.349,1707.4,259.764,460.4,1886.9,28.98,139.6,0.12,3.4,177.146,-8.79,-6.79
25%,1971.0,1.5,4440.1035,2874.1,519.1475,527.9595,3276.95,41.05,228.65,3.515,4.9,208.631,2.27,-0.085
50%,1984.0,2.0,6559.594,4299.9,896.21,662.412,4959.4,104.1,540.9,5.01,5.7,236.348,3.24,1.34
75%,1996.5,3.0,9629.3465,6398.15,1436.6815,773.049,6977.85,159.65,1102.1,6.665,6.8,271.7215,4.975,2.63
max,2009.0,4.0,13415.266,9363.6,2264.721,1044.088,10077.5,218.61,1673.9,15.33,10.7,308.013,14.62,10.95


## Data preparation
- Dropping irrelevant columns and rows
- Identifying duplicate columns
- Renaming columns
- Feature creation

In [24]:
# Inspect the last five rows of the data set.
macro_data.tail()

Unnamed: 0,year,quarter,realgdp,realcons,realinv,realgovt,realdpi,cpi,m1,tbilrate,unemp,pop,infl,realint
198,2008.0,3.0,13324.6,9267.7,1990.693,991.551,9838.3,216.889,1474.7,1.17,6.0,305.27,-3.16,4.33
199,2008.0,4.0,13141.92,9195.3,1857.661,1007.273,9920.4,212.174,1576.5,0.12,6.9,305.952,-8.79,8.91
200,2009.0,1.0,12925.41,9209.2,1558.494,996.287,9926.4,212.671,1592.8,0.22,8.1,306.547,0.94,-0.71
201,2009.0,2.0,12901.504,9189.0,1456.678,1023.528,10077.5,214.469,1653.6,0.18,9.2,307.226,3.37,-3.19
202,2009.0,3.0,12990.341,9256.0,1486.398,1044.088,10040.6,216.385,1673.9,0.12,9.6,308.013,3.56,-3.44


## Drop columns
- Use the `.drop()` method to drop one or more columns
- Also call `.copy()` at the end so that the result is treated as an independent copy of the data set and not a reference to the original

In [26]:
# Show the frame without the m1 column. drop() returns a new DataFrame,
# so macro_data itself is left unchanged because the result is not assigned.
macro_data.drop(columns=['m1'])

Unnamed: 0,year,quarter,realgdp,realcons,realinv,realgovt,realdpi,cpi,tbilrate,unemp,pop,infl,realint
0,1959.0,1.0,2710.349,1707.4,286.898,470.045,1886.9,28.980,2.82,5.8,177.146,0.00,0.00
1,1959.0,2.0,2778.801,1733.7,310.859,481.301,1919.7,29.150,3.08,5.1,177.830,2.34,0.74
2,1959.0,3.0,2775.488,1751.8,289.226,491.260,1916.4,29.350,3.82,5.3,178.657,2.74,1.09
3,1959.0,4.0,2785.204,1753.7,299.356,484.052,1931.3,29.370,4.33,5.6,179.386,0.27,4.06
4,1960.0,1.0,2847.699,1770.5,331.722,462.199,1955.5,29.540,3.50,5.2,180.007,2.31,1.19
...,...,...,...,...,...,...,...,...,...,...,...,...,...
198,2008.0,3.0,13324.600,9267.7,1990.693,991.551,9838.3,216.889,1.17,6.0,305.270,-3.16,4.33
199,2008.0,4.0,13141.920,9195.3,1857.661,1007.273,9920.4,212.174,0.12,6.9,305.952,-8.79,8.91
200,2009.0,1.0,12925.410,9209.2,1558.494,996.287,9926.4,212.671,0.22,8.1,306.547,0.94,-0.71
201,2009.0,2.0,12901.504,9189.0,1456.678,1023.528,10077.5,214.469,0.18,9.2,307.226,3.37,-3.19


## Convert a specific column
- Use `pd.to_datetime(df['some_column'])` to convert a column to datetime

In [28]:
# Re-check the dtypes; macro_data still has every original column because the
# earlier drop() result was only displayed, not assigned.
macro_data.dtypes

year        float64
quarter     float64
realgdp     float64
realcons    float64
realinv     float64
realgovt    float64
realdpi     float64
cpi         float64
m1          float64
tbilrate    float64
unemp       float64
pop         float64
infl        float64
realint     float64
dtype: object

In [29]:
## Rename columns

In [30]:
# Preview the frame with its original column labels before renaming.
macro_data.head()

Unnamed: 0,year,quarter,realgdp,realcons,realinv,realgovt,realdpi,cpi,m1,tbilrate,unemp,pop,infl,realint
0,1959.0,1.0,2710.349,1707.4,286.898,470.045,1886.9,28.98,139.7,2.82,5.8,177.146,0.0,0.0
1,1959.0,2.0,2778.801,1733.7,310.859,481.301,1919.7,29.15,141.7,3.08,5.1,177.83,2.34,0.74
2,1959.0,3.0,2775.488,1751.8,289.226,491.26,1916.4,29.35,140.5,3.82,5.3,178.657,2.74,1.09
3,1959.0,4.0,2785.204,1753.7,299.356,484.052,1931.3,29.37,140.0,4.33,5.6,179.386,0.27,4.06
4,1960.0,1.0,2847.699,1770.5,331.722,462.199,1955.5,29.54,139.6,3.5,5.2,180.007,2.31,1.19


In [33]:
# Capitalise two column labels. rename() returns a new frame, so macro_data
# keeps its original labels and new_macro_dt carries the renamed ones.
new_macro_dt = macro_data.rename(columns=dict(unemp='Unemp', m1='M1'))

In [34]:
# Confirm the renamed labels appear in the new frame.
new_macro_dt.head()

Unnamed: 0,year,quarter,realgdp,realcons,realinv,realgovt,realdpi,cpi,M1,tbilrate,Unemp,pop,infl,realint
0,1959.0,1.0,2710.349,1707.4,286.898,470.045,1886.9,28.98,139.7,2.82,5.8,177.146,0.0,0.0
1,1959.0,2.0,2778.801,1733.7,310.859,481.301,1919.7,29.15,141.7,3.08,5.1,177.83,2.34,0.74
2,1959.0,3.0,2775.488,1751.8,289.226,491.26,1916.4,29.35,140.5,3.82,5.3,178.657,2.74,1.09
3,1959.0,4.0,2785.204,1753.7,299.356,484.052,1931.3,29.37,140.0,4.33,5.6,179.386,0.27,4.06
4,1960.0,1.0,2847.699,1770.5,331.722,462.199,1955.5,29.54,139.6,3.5,5.2,180.007,2.31,1.19


In [35]:
# identify missing or null value

In [37]:
# Count missing (NaN) values per column.
new_macro_dt.isna().sum()

year        0
quarter     0
realgdp     0
realcons    0
realinv     0
realgovt    0
realdpi     0
cpi         0
M1          0
tbilrate    0
Unemp       0
pop         0
infl        0
realint     0
dtype: int64

In [39]:
# Identify duplicate rows: True marks a row that repeats an earlier row.
new_macro_dt.duplicated()

0      False
1      False
2      False
3      False
4      False
       ...  
198    False
199    False
200    False
201    False
202    False
Length: 203, dtype: bool

## step 3: Features Understanding
(Univariate analysis)
- Plotting feature distributions
- histogram
- KDE
- Boxplot