## Investment

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the investment CSV (path is relative to the notebook's working directory),
# print its (rows, columns) shape, and display the first rows.
investment = pd.read_csv('../data/investment_gfcf.csv')
print(investment.shape)
investment.head()

(24513, 8)


Unnamed: 0,LOCATION,INDICATOR,SUBJECT,MEASURE,FREQUENCY,TIME,Value,Flag Codes
0,AUS,GFCF,TOT,AGRWTH,A,1960,5.675337,
1,AUS,GFCF,TOT,AGRWTH,A,1961,1.774823,
2,AUS,GFCF,TOT,AGRWTH,A,1962,6.253368,
3,AUS,GFCF,TOT,AGRWTH,A,1963,9.366632,
4,AUS,GFCF,TOT,AGRWTH,A,1964,7.482981,


In [3]:
# Distinct values of the MEASURE column.
investment['MEASURE'].unique()

array(['AGRWTH', 'MLN_USD', 'QGRWTH'], dtype=object)

In [4]:
# Distinct values of the FREQUENCY column.
investment['FREQUENCY'].unique()

array(['A', 'Q'], dtype=object)

In [5]:
# Distinct values of the SUBJECT column.
investment['SUBJECT'].unique()

array(['TOT'], dtype=object)

This dataset contains our data for the investment made by each country. To start with, we can drop the column `Flag Codes` since it has a lot of NaN values, and is not of any use to us. 

In [6]:
# Remove the 'Flag Codes' column; drop() returns a new frame, which is rebound to `investment`.
investment = investment.drop(columns=['Flag Codes'])

For `FREQUENCY`, we only need annual data, so we drop the rows with `FREQUENCY == 'Q'` and keep those with `FREQUENCY == 'A'`.

In [7]:
# Keep only the rows whose FREQUENCY is 'A'.
investment = investment.loc[investment['FREQUENCY'] == 'A']

For `MEASURE`, we are only interested in the values represented in Millions of USD, so we only keep the rows that have `MEASURE==MLN_USD`

In [8]:
# Keep only the rows whose MEASURE is 'MLN_USD'.
investment = investment.loc[investment['MEASURE'] == 'MLN_USD']

In [9]:
# Preview the first rows after the FREQUENCY and MEASURE filters.
investment.head()

Unnamed: 0,LOCATION,INDICATOR,SUBJECT,MEASURE,FREQUENCY,TIME,Value
62,AUS,GFCF,TOT,MLN_USD,A,1960,7594.023
63,AUS,GFCF,TOT,MLN_USD,A,1961,7555.709
64,AUS,GFCF,TOT,MLN_USD,A,1962,8263.204
65,AUS,GFCF,TOT,MLN_USD,A,1963,9144.787
66,AUS,GFCF,TOT,MLN_USD,A,1964,10182.874


In [10]:
# Number of distinct values in each column.
investment.nunique()

LOCATION       67
INDICATOR       1
SUBJECT         1
MEASURE         1
FREQUENCY       1
TIME           62
Value        2606
dtype: int64

Now, we can drop the columns `INDICATOR`, `SUBJECT` and `FREQUENCY`, since we don't need them anymore. We also reset our index, since we have deleted a lot of rows.

In [11]:
# Drop the three listed columns and renumber the rows from 0 in a single
# chained expression (the old index is discarded).
investment = (
    investment
    .drop(columns=['INDICATOR', 'SUBJECT', 'FREQUENCY'])
    .reset_index(drop=True)
)

In [12]:
# Preview the frame after dropping columns and resetting the index.
investment.head()

Unnamed: 0,LOCATION,MEASURE,TIME,Value
0,AUS,MLN_USD,1960,7594.023
1,AUS,MLN_USD,1961,7555.709
2,AUS,MLN_USD,1962,8263.204
3,AUS,MLN_USD,1963,9144.787
4,AUS,MLN_USD,1964,10182.874


In [13]:
# Inspect the dtype of each remaining column.
investment.dtypes

LOCATION     object
MEASURE      object
TIME         object
Value       float64
dtype: object

In [14]:
# Convert the three text columns to pandas' dedicated 'string' dtype in one call;
# Value is left untouched.
investment = investment.astype({
    'LOCATION': 'string',
    'MEASURE': 'string',
    'TIME': 'string',
})

In [15]:
# Re-check the dtypes after the string conversion.
investment.dtypes

LOCATION     string
MEASURE      string
TIME         string
Value       float64
dtype: object

In [16]:
# (rows, columns) of the frame at this point.
investment.shape

(2606, 4)

In [17]:
# For every country, report the first year, the last year, and any years absent
# in between. Also track two cross-country extremes: the latest first year
# (highestmin) and the earliest last year (lowestmax).
highestmin = 1          # sentinel below any real year, so the first country replaces it
highestmincountry = ''
lowestmax = 9999        # sentinel above any real year, so the first country replaces it
lowestmaxcountry = ''

for country in investment.LOCATION.unique():
    tempdf = investment[investment['LOCATION']==country]

    # TIME holds year labels as strings; min()/max() compare them as text
    # before the int() conversion.
    minyear = int(tempdf.TIME.min())
    if(highestmin<minyear):
        highestmin=minyear
        highestmincountry = country

    maxyear = int(tempdf.TIME.max())
    if(lowestmax>maxyear):
        lowestmax=maxyear
        lowestmaxcountry=country

    # Collect gaps in a plain list of year strings. pd.concat only accepts
    # Series/DataFrame objects, so seeding it with a Python list would raise
    # TypeError as soon as a country had a missing year.
    present = set(tempdf['TIME'])
    # minyear and maxyear are present by construction, so the half-open range
    # [minyear, maxyear) covers every year that could be missing.
    missingyear = [str(year) for year in range(minyear, maxyear) if str(year) not in present]

    print('Country = ', country, 'Minimum Year = ',minyear, 'Maximum Year = ', maxyear, 'Missing Years = ', missingyear)

print('Highest Min = ', highestmin, ' Country = ', highestmincountry)
print('Lowest Max = ', lowestmax, ' Country = ', lowestmaxcountry)

Country =  AUS Minimum Year =  1960 Maximum Year =  2021 Missing Years =  []
Country =  AUT Minimum Year =  1970 Maximum Year =  2021 Missing Years =  []
Country =  BEL Minimum Year =  1970 Maximum Year =  2021 Missing Years =  []
Country =  CAN Minimum Year =  1961 Maximum Year =  2021 Missing Years =  []
Country =  CZE Minimum Year =  1990 Maximum Year =  2021 Missing Years =  []
Country =  DNK Minimum Year =  1966 Maximum Year =  2021 Missing Years =  []
Country =  FIN Minimum Year =  1970 Maximum Year =  2021 Missing Years =  []
Country =  FRA Minimum Year =  1960 Maximum Year =  2021 Missing Years =  []
Country =  DEU Minimum Year =  1970 Maximum Year =  2021 Missing Years =  []
Country =  GRC Minimum Year =  1960 Maximum Year =  2021 Missing Years =  []
Country =  HUN Minimum Year =  1991 Maximum Year =  2021 Missing Years =  []
Country =  ISL Minimum Year =  1970 Maximum Year =  2021 Missing Years =  []
Country =  IRL Minimum Year =  1970 Maximum Year =  2021 Missing Years =  []