In [4]:
import pandas as pd
import numpy as np
import seaborn as sns

import matplotlib.pyplot as plt
import matplotlib.mlab as mlab
import matplotlib
# Use the ggplot style sheet for every figure drawn in this notebook.
plt.style.use('ggplot')
from matplotlib.pyplot import figure


# Render figures inline in the notebook output.
%matplotlib inline
# Default figure size applied to figures created after this point.
matplotlib.rcParams['figure.figsize'] = (12,8)

# Turn off pandas' chained-assignment warning for the whole session.
# NOTE(review): this hides the warning everywhere, including for genuine
# write-to-a-slice mistakes further down the notebook.
pd.options.mode.chained_assignment = None

In [229]:
# Check for missing values and drop rows that have missing data
def percentMissing(df):
    """Report missing data per column and drop every row with a missing value.

    Prints the names of the numeric columns, then one ``<column> - <pct>%``
    line per column of ``df``. For each column that contains missing values a
    boolean ``<column>_ismissing`` indicator column is added; ``num_missing``
    holds the per-row sum over all columns whose name contains ``ismissing``.

    The helper columns are added to a copy, so the caller's frame is not
    modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        A new frame containing the original columns, the indicator columns
        and ``num_missing``, restricted to the rows where ``num_missing``
        is zero.
    """
    # Work on a copy so the caller's frame does not silently gain helper columns.
    flagged = df.copy()

    numeric_cols = flagged.select_dtypes(include=[np.number]).columns.values
    print(numeric_cols)

    # % of missing data, computed from the untouched input columns
    for col in df.columns:
        missing = df[col].isnull()
        # An empty frame has nothing missing; this also avoids rounding NaN.
        pct_missing = np.mean(missing)*100 if len(missing) else 0.0
        print('{} - {}%'.format(col, round(pct_missing)))

        # create a missing indicator only for features with missing data
        if missing.any():
            flagged['{}_ismissing'.format(col)] = missing

    # str() keeps the substring check valid for non-string column labels.
    ismissing_cols = [col for col in flagged.columns if 'ismissing' in str(col)]
    flagged['num_missing'] = flagged[ismissing_cols].sum(axis=1)

    # Keep only complete rows. A boolean mask (rather than dropping by index
    # label) stays correct when the index contains duplicate labels.
    clean_df = flagged[flagged['num_missing'] == 0].copy()

    return clean_df



In [270]:
# Subset the data to only two countries
def subsetdata(df, locations=('USA', 'JPN'), start_year=1975, end_year=2017):
    """Reduce a raw indicator table to annual values for selected countries.

    Keeps the rows whose ``FREQUENCY`` is ``'A'``, converts ``TIME`` to a
    numeric type, keeps only the ``LOCATION``, ``TIME`` and ``Value`` columns,
    selects the requested locations and year range, passes the result through
    ``percentMissing`` (which prints a missing-data report and filters rows
    with missing data) and prints the final row count.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw table with at least the columns ``FREQUENCY``, ``LOCATION``,
        ``TIME`` and ``Value``.
    locations : sequence of str, optional
        ``LOCATION`` codes to keep. Rows are returned grouped by location in
        the order given here. Defaults to ``('USA', 'JPN')``.
    start_year : int, optional
        First year to keep (inclusive). Defaults to 1975.
    end_year : int, optional
        Last year to keep (inclusive). Defaults to 2017.

    Returns
    -------
    pandas.DataFrame
        Frame with the columns ``LOCATION``, ``TIME`` and ``Value``.
    """
    # filtering for yearly values; .copy() makes the TIME conversion below an
    # assignment on an independent frame instead of on a slice of ``df``
    annual = df[df['FREQUENCY'] == 'A'].copy()
    annual['TIME'] = pd.to_numeric(annual['TIME'])

    # selecting columns only for location, time and value
    series = annual[['LOCATION', 'TIME', 'Value']]

    # one slice per requested location, concatenated in the requested order
    from_start = series[series['TIME'] >= start_year]
    selected = pd.concat(
        [from_start[from_start['LOCATION'] == loc] for loc in locations]
    )

    checked = percentMissing(selected)

    # percentMissing adds helper columns; keep only the data columns
    result = checked[['LOCATION', 'TIME', 'Value']]

    # dropping values after the last requested year
    result = result[result['TIME'] <= end_year]

    number_of_rows = len(result.index)
    print("total_rows: {}". format(number_of_rows))

    return result

In [271]:
# checking data type
def Datatype(df):
    """Print a quick structural summary of ``df``.

    Prints, in this order: the frame's shape, the dtype of every column, the
    names of the numeric columns and the names of the non-numeric columns.
    Nothing is returned.
    """
    # overall dimensions followed by the per-column dtypes
    for summary in (df.shape, df.dtypes):
        print(summary)

    # numeric column names first, then everything that is not numeric
    for selector in ({'include': [np.number]}, {'exclude': [np.number]}):
        print(df.select_dtypes(**selector).columns.values)

In [316]:
def DataImporting(data_dir=None):
    """Load the six indicator CSV files and merge them into one table.

    Every file is read with ``pandas.read_csv``, reduced with ``subsetdata``
    and its ``Value`` column renamed to the indicator's name. The tables are
    then joined on ``LOCATION`` and ``TIME``: the second table is left-joined
    onto the first, the remaining ones are inner-joined. Missing values left
    in the merged table are replaced by 0.

    Parameters
    ----------
    data_dir : str or path-like, optional
        Directory containing the CSV files. When omitted, the bare file names
        are used, i.e. the files are looked up relative to the current
        working directory.

    Returns
    -------
    pandas.DataFrame
        One row per (``LOCATION``, ``TIME``) pair with the columns
        ``Corporate Tax``, ``Interest Rates``, ``Unemployment Rates``,
        ``Income Tax Rates``, ``GDP Growth`` and ``CPI``.
    """
    import os

    # (file name, output column) pairs, listed in merge order
    sources = [
        ('Corporate TAX.csv', 'Corporate Tax'),
        ('long-term interest rates.csv', 'Interest Rates'),
        ('UnemploymentRate.csv', 'Unemployment Rates'),
        ('Tax_Personal_Income.csv', 'Income Tax Rates'),
        ('GDP.csv', 'GDP Growth'),
        ('CPI.csv', 'CPI'),
    ]
    join_keys = ['LOCATION', 'TIME']

    # read the data; all files are read before any processing so that a
    # missing file fails before any report is printed
    raw_frames = []
    for file_name, column_name in sources:
        path = file_name if data_dir is None else os.path.join(data_dir, file_name)
        raw_frames.append((pd.read_csv(path), column_name))

    indicators = [
        subsetdata(frame).rename(columns={'Value': column_name})
        for frame, column_name in raw_frames
    ]

    # The second table is left-joined so rows of the first survive even when
    # it has fewer rows; the remaining tables are inner-joined on the same keys.
    merged_data = pd.merge(indicators[0], indicators[1], how='left', on=join_keys)
    for indicator in indicators[2:]:
        merged_data = merged_data.merge(indicator, on=join_keys)

    # NOTE(review): gaps introduced by the left join are filled with 0, which
    # is indistinguishable from a genuine 0 reading downstream.
    merged_data = merged_data.fillna(0)

    return merged_data


In [318]:
econ_df = DataImporting()
econ_df

['TIME' 'Value']
LOCATION - 0%
TIME - 0%
Value - 0%
total_rows: 86
['TIME' 'Value']
LOCATION - 0%
TIME - 0%
Value - 0%
total_rows: 72
['TIME' 'Value']
LOCATION - 0%
TIME - 0%
Value - 0%
total_rows: 86
['TIME' 'Value']
LOCATION - 0%
TIME - 0%
Value - 0%
total_rows: 86
['TIME' 'Value']
LOCATION - 0%
TIME - 0%
Value - 0%
total_rows: 86
['TIME' 'Value']
LOCATION - 0%
TIME - 0%
Value - 0%
total_rows: 86


Unnamed: 0,LOCATION,TIME,Corporate Tax,Interest Rates,Unemployment Rates,Income Tax Rates,GDP Growth,CPI
0,USA,1975,34.648,7.987500,8.466666,34.648,7801.167695,9.143147
1,USA,1976,34.727,7.611667,7.716667,34.727,8590.244216,5.744812
2,USA,1977,35.358,7.419167,7.066667,35.358,9450.431025,6.501684
3,USA,1978,36.406,8.410000,6.066667,36.406,10562.860180,7.630964
4,USA,1979,38.362,9.442500,5.833333,38.362,11671.541410,11.254470
...,...,...,...,...,...,...,...,...
81,JPN,2013,19.228,0.689667,4.025000,19.228,39008.360560,0.346440
82,JPN,2014,18.902,0.520333,3.591667,18.902,39183.465680,2.761954
83,JPN,2015,18.863,0.350000,3.375000,18.863,40406.095330,0.789518
84,JPN,2016,18.592,-0.066250,3.116667,18.592,39990.049450,-0.116667
