# 1. Reading multiple data files

In [1]:
from glob import glob
import os
import pandas as pd

In [2]:
def print_function(strFunction):
    '''
    Evaluate an expression given as text and print it next to its value

    The line written to standard output has the form
    "<expression>: <value>".

    Parameter
    ---------
    strFunction: str
        Python expression (typically a function call) as source text.
        It is executed with eval(), so only pass trusted, hard-coded
        strings.

    Return
    ------
    None; the formatted line is printed
    '''
    evaluated = eval(strFunction)
    print('{0}: {1}'.format(strFunction, evaluated))

In [3]:
# Show the working directory, its contents, and the contents of "data".
for expression in ('os.getcwd()', 'os.listdir()', 'os.listdir("data")'):
    print_function(expression)

os.getcwd(): /home/anonymous/Documents/github/data-science/pandas/merging-dataframes-with-pandas
os.listdir(): ['.ipynb_checkpoints', 'merging-dataframes-with-pandas.ipynb', 'data']
os.listdir("data"): ['Sales.zip', 'Summer Olympic medals', 'Sales', 'automobiles.csv', 'oil_price.csv', 'sp500.csv', 'pittsburgh2013.csv', 'exchange.csv', 'Baby names', 'GDP.zip', 'GDP', 'Baby names.zip', 'Summer Olympic medals.zip']


In [4]:
def getcwd_csv(cwd):
    '''
    Read every CSV file directly inside a directory as
    [pd.DataFrame, pd.DataFrame, ...]

    Files are visited in the order returned by os.listdir(cwd); only
    regular files whose name ends with ".csv" are read, and
    sub-directories are not searched. The process working directory is
    never changed.

    Parameter
    ---------
    cwd: str
        Path of the directory that contains the csv files

    Return
    ------
    Return [pd.DataFrame, pd.DataFrame, ...] (empty list if no csv found)
    '''
    result = []

    # Build full paths instead of os.chdir()-ing into the directory: a
    # failing read_csv() can then never leave the kernel stranded in `cwd`.
    for file in os.listdir(cwd):
        path = os.path.join(cwd, file)
        # Match on the extension (not the substring 'csv') and skip
        # directories, so names like 'csv_notes.txt' are not parsed.
        if file.endswith('.csv') and os.path.isfile(path):
            print('Appending: {0}'.format(file))
            result.append(pd.read_csv(path))

    print('Finished appending, the list length: {0}'.format(len(result)))

    if result:
        print('There is {0} pd.Dataframe(s) in the list'.format(len(result)))
    else:
        print('There is 0 pd.Dataframe in the list')

    return result

In [5]:
# getcwd_csv() already returns a list, so it is unpacked directly. The names
# are bound by position, i.e. they rely on the order in which the files are
# appended (see the "Appending:" lines printed by the call).
automobiles, oil_price, sp500, pittsburgh2013, exchange = getcwd_csv('data')

Appending: automobiles.csv
Appending: oil_price.csv
Appending: sp500.csv
Appending: pittsburgh2013.csv
Appending: exchange.csv
Finished appending, the list length: 5
There is 5 pd.Dataframe(s) in the list


In [6]:
# Print the same three listings again after the getcwd_csv() call, so the
# working directory can be compared with the earlier output.
for expression in ('os.getcwd()', 'os.listdir()', 'os.listdir("data")'):
    print_function(expression)

os.getcwd(): /home/anonymous/Documents/github/data-science/pandas/merging-dataframes-with-pandas
os.listdir(): ['.ipynb_checkpoints', 'merging-dataframes-with-pandas.ipynb', 'data']
os.listdir("data"): ['Sales.zip', 'Summer Olympic medals', 'Sales', 'automobiles.csv', 'oil_price.csv', 'sp500.csv', 'pittsburgh2013.csv', 'exchange.csv', 'Baby names', 'GDP.zip', 'GDP', 'Baby names.zip', 'Summer Olympic medals.zip']


In [7]:
# glob() returns paths in arbitrary (filesystem-dependent) order, so key each
# DataFrame by its file name and bind the variables by name rather than by
# their position in the glob result.
data_frames = {os.path.basename(path): pd.read_csv(path)
               for path in glob('data/*.csv')}
automobiles = data_frames['automobiles.csv']
oil_price = data_frames['oil_price.csv']
sp500 = data_frames['sp500.csv']
pittsburgh2013 = data_frames['pittsburgh2013.csv']
exchange = data_frames['exchange.csv']
automobiles.shape, oil_price.shape, sp500.shape, pittsburgh2013.shape, exchange.shape

((392, 9), (156, 2), (252, 7), (365, 23), (250, 2))

In [8]:
# NOTE(review): glob() does not guarantee any ordering, so this positional
# unpacking assumes the 1981 file is returned before the 1881 file - confirm
# against the actual file names in "data/Baby names".
baby_name_paths = glob('data/Baby names/*.csv')
names1981, names1881 = [pd.read_csv(csv_path) for csv_path in baby_name_paths]
names1881.shape, names1981.shape

((1934, 3), (19454, 3))

In [9]:
# NOTE(review): glob() does not guarantee any ordering, so this positional
# unpacking assumes the China file is returned before the USA file - confirm
# against the actual file names in "data/GDP".
gdp_paths = glob('data/GDP/*.csv')
gdp_china, gdp_usa = [pd.read_csv(csv_path) for csv_path in gdp_paths]
gdp_china.shape, gdp_usa.shape

((56, 2), (278, 2))

In [10]:
# NOTE(review): glob() does not guarantee any ordering, so the six names below
# are only correct if the files come back in exactly this order - confirm
# against the actual file names in "data/Sales".
sales_paths = glob('data/Sales/*.csv')
(sales_feb_2015,
 feb_sales_service,
 feb_sales_software,
 sales_jan_2015,
 sales_mar_2015,
 feb_sales_hardware) = [pd.read_csv(csv_path) for csv_path in sales_paths]
(sales_feb_2015.shape,
 feb_sales_service.shape,
 feb_sales_software.shape,
 sales_jan_2015.shape,
 sales_mar_2015.shape,
 feb_sales_hardware.shape)

((20, 4), (6, 4), (9, 4), (20, 4), (20, 4), (5, 4))

In [11]:
print(glob('data/Summer Olympic medals/*'))
# glob() returns paths in arbitrary (filesystem-dependent) order, so key each
# DataFrame by its file name and bind the variables by name rather than by
# their position in the glob result. Only the *.csv files are read; the .tsv
# files listed by the print above are not loaded here.
medal_frames = {os.path.basename(path): pd.read_csv(path)
                for path in glob('data/Summer Olympic medals/*.csv')}
gold = medal_frames['Gold.csv']
silver_top5 = medal_frames['silver_top5.csv']
gold_top5 = medal_frames['gold_top5.csv']
bronze = medal_frames['Bronze.csv']
silver = medal_frames['Silver.csv']
ioc_country_codes = medal_frames['Summer Olympic medalists 1896 to 2008 - IOC COUNTRY CODES.csv']
bronze_top5 = medal_frames['bronze_top5.csv']
gold.shape, silver_top5.shape, gold_top5.shape, bronze.shape, silver.shape, ioc_country_codes.shape, bronze_top5.shape

['data/Summer Olympic medals/Gold.csv', 'data/Summer Olympic medals/silver_top5.csv', 'data/Summer Olympic medals/gold_top5.csv', 'data/Summer Olympic medals/Summer Olympic medalists 1896 to 2008 - ALL MEDALISTS.tsv', 'data/Summer Olympic medals/Bronze.csv', 'data/Summer Olympic medals/Silver.csv', 'data/Summer Olympic medals/Summer Olympic medalists 1896 to 2008 - IOC COUNTRY CODES.csv', 'data/Summer Olympic medals/Summer Olympic medalists 1896 to 2008 - EDITIONS.tsv', 'data/Summer Olympic medals/bronze_top5.csv']


((138, 3), (5, 2), (5, 2), (138, 3), (138, 3), (201, 3), (5, 2))

## 1.1. Reindexing dataframes

In [12]:
def print_dataframes_statistics(dataframes):
    '''
    Print a short summary (info, shape, first rows) of each dataframe

    Parameter
    ---------
    dataframes: list
        list of Pandas dataframe

    Return
    ------
    None; the summaries are written to standard output, each followed by
    a dashed separator line
    '''
    for dataframe in dataframes:
        # DataFrame.info() writes its report itself and returns None, so it
        # must not be wrapped in print() (that emitted a stray "None" line).
        dataframe.info()
        print(dataframe.shape)
        print(dataframe.head())
        print('----------------------------------------\n')

In [13]:
# Gather the seven Olympic medal tables into one list and print a summary
# of each of them.
dfs = [
    gold,
    silver_top5,
    gold_top5,
    bronze,
    silver,
    ioc_country_codes,
    bronze_top5,
]
print_dataframes_statistics(dfs)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 138 entries, 0 to 137
Data columns (total 3 columns):
NOC        138 non-null object
Country    138 non-null object
Total      96 non-null float64
dtypes: float64(1), object(2)
memory usage: 3.3+ KB
None
(138, 3)
   NOC         Country   Total
0  USA   United States  2088.0
1  URS    Soviet Union   838.0
2  GBR  United Kingdom   498.0
3  FRA          France   378.0
4  GER         Germany   407.0
----------------------------------------

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 2 columns):
Country    5 non-null object
Total      5 non-null float64
dtypes: float64(1), object(1)
memory usage: 160.0+ bytes
None
(5, 2)
          Country   Total
0   United States  1195.0
1    Soviet Union   627.0
2  United Kingdom   591.0
3          France   461.0
4           Italy   394.0
----------------------------------------

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (t