# In[14]:
import ast
import csv
import fnmatch
import glob, os
import io
import re
import zipfile
from datetime import date

import pandas as pd
import requests

#@Param - downloadPath, Path to download raw data 
#@Param - targetPath, Path to write standardized data to be uploaded to S3 Bucket 

class FDIC():
    """Download quarterly report archives and standardize the extracted CSVs.

    Parameters
    ----------
    downloadPath : str
        Folder the downloaded .zip archives are unpacked into.
    targetPath : str
        Folder the standardized CSV files are written to.

    NOTE(review): ``prepare_files`` only picks up files matching ``*_fs220*``
    and requires ``CU_NUMBER`` / ``CYCLE_DATE`` columns. Confirm that the
    archives downloaded from ``sourceURL`` actually contain such files.
    """

    # Source parent URL; '<year><MMDD>.zip' is appended for every quarter.
    sourceURL = 'https://www5.fdic.gov/sdi/Resource/AllReps/All_Reports_'

    # Quarter-end suffixes (MMDD) used to build one URL per quarter.
    quarters = ['0331', '0630', '0930', '1231']
    startingYear = 2009

    # Seconds to wait on the server before a download is abandoned.
    requestTimeout = 60

    def __init__(self, downloadPath, targetPath):
        self.downloadTargetFolder = downloadPath
        self.targetPath = targetPath
        # Per-instance list so URLs are never shared between instances.
        self.urlList = []

    def createURLArray(self, year):
        """Build one source URL per quarter from ``year`` up to last year.

        The current calendar year is excluded (``year < today.year``). The
        list is rebuilt on every call, stored in ``self.urlList`` and
        returned.
        """
        current_year = date.today().year
        self.urlList = [
            self.sourceURL + str(y) + quarter + '.zip'
            for y in range(year, current_year)
            for quarter in self.quarters
        ]
        return self.urlList

    def GetFileFromURL(self, url):
        """Download the .zip at ``url`` and unpack it into the download folder.

        Download and archive errors are reported on stdout instead of being
        raised, so one missing quarter does not abort a bulk pull.

        Returns True when the archive was unpacked, False otherwise.
        """
        try:
            print("Downloading: " + url)
            response = requests.get(url, timeout=self.requestTimeout)
            # A 404 page is not a zip file; fail here with the HTTP status.
            response.raise_for_status()
            with zipfile.ZipFile(io.BytesIO(response.content)) as archive:
                print("Unpacking files for: " + url)
                # An empty folder name means "current working directory".
                archive.extractall(self.downloadTargetFolder or None)
        except (requests.RequestException, zipfile.BadZipFile) as err:
            print("Zip file not found at the following URL: {} ({})".format(url, err))
            return False
        return True

    def GetBulkFiles(self, startYear=None):
        """Download every quarterly archive from ``startYear`` onwards.

        When ``startYear`` is None the year is read interactively from stdin.
        """
        if startYear is None:
            startYear = int(input("Enter a starting year for the data pull: "))
        for url in self.createURLArray(startYear):
            self.GetFileFromURL(url)

    @staticmethod
    def _listFiles(folder, pattern):
        """Return the sorted base names in ``folder`` matching ``pattern``."""
        full_pattern = os.path.join(glob.escape(folder), pattern)
        return sorted(os.path.basename(p) for p in glob.glob(full_pattern))

    def prepare_files(self, path=None, download=True):
        """Load every ``*_<ending>.csv`` in ``path`` with a common column set.

        For each file ending found via ``*_fs220*`` the most recent file
        (by the ``YYYYMM`` file-name prefix) defines the reference columns.
        Every file of that ending is upper-cased, padded with the missing
        reference columns (filled with the string ``'nan'``) and prefixed
        with ``ORDERS``, ``CU_NUM_DATE``, ``QUARTER`` and ``YEAR`` columns.

        Parameters
        ----------
        path : str, optional
            Folder holding the extracted CSVs; defaults to the download
            folder given to the constructor.
        download : bool, optional
            When True (default) the archives are downloaded first via
            ``GetBulkFiles``; pass False to reuse files already on disk.

        Returns
        -------
        dict
            Maps each file name without ``.csv`` to its DataFrame.
        """
        if download:
            self.GetBulkFiles()
        folder = self.downloadTargetFolder if path is None else path

        frames = {}
        next_order = 1

        endings = sorted({
            name.split("_")[-1].split(".")[0]
            for name in self._listFiles(folder, "*_fs220*")
        })

        for ending in endings:
            candidates = self._listFiles(folder, "*_" + ending + ".csv")
            if not candidates:
                continue

            # File names start with YYYYMM, e.g. 200903_fs220.csv.
            latest_name = max(
                candidates, key=lambda n: (int(n[0:4]), int(n[4:6]))
            )
            latest_csv = pd.read_csv(
                os.path.join(folder, latest_name),
                encoding='latin1', dtype=object,
            )
            latest_vars = [col.upper() for col in latest_csv.columns]

            for name in candidates:
                key = name[:-4]
                frame = pd.read_csv(
                    os.path.join(folder, name),
                    encoding='latin1', dtype=object,
                )
                frame.columns = [col.upper() for col in frame.columns]
                existing = frame.columns.tolist()
                known = set(existing)
                to_add = [col for col in latest_vars if col not in known]
                frame = frame.reindex(
                    columns=existing + to_add, fill_value='nan'
                )
                # Running row counter across all files.
                frame.insert(
                    0, 'ORDERS',
                    range(next_order, next_order + len(frame)),
                )
                next_order += len(frame)
                frame.insert(
                    1, 'CU_NUM_DATE',
                    frame['CU_NUMBER'].astype(str) + '_'
                    + pd.to_datetime(frame['CYCLE_DATE']).dt.date.astype(str),
                )
                frame.insert(2, 'QUARTER', key[4:6])
                # Slice from the front: endings differ in length.
                frame.insert(3, 'YEAR', key[:4])
                frames[key] = frame
        return frames

    def standardizeData(self, download=True):
        """Standardize the downloaded data and write it to the target path.

        Each DataFrame from ``prepare_files`` is written as
        ``<ending>-csv.<key>.csv`` inside ``self.targetPath``.
        """
        frames = self.prepare_files(
            self.downloadTargetFolder, download=download
        )
        if self.targetPath:
            os.makedirs(self.targetPath, exist_ok=True)
        for key, frame in frames.items():
            out_name = '{}.csv'.format(key.split("_")[-1] + '-csv.' + key)
            frame.to_csv(os.path.join(self.targetPath, out_name), index=False)
    
    

# In[16]:
def dataType(val, current_type):
    """Classify a raw CSV cell as the SQL type 'decimal' or 'varchar'.

    ``val`` is parsed with ``ast.literal_eval``. Only a value that parses to
    a plain ``int`` is reported as 'decimal', and only while ``current_type``
    is not already 'varchar'. Everything else is 'varchar', including text
    that fails to parse.

    NOTE(review): values that parse as ``float`` are also reported as
    'varchar' -- confirm that this is intended.
    """
    try:
        # Numbers evaluate to int/float; ordinary text raises.
        parsed = ast.literal_eval(val)
    except (ValueError, SyntaxError):
        return 'varchar'

    column_still_numeric = current_type != 'varchar'
    if type(parsed) is int and column_still_numeric:
        return 'decimal'
    return 'varchar'

# In[17]:
def createTable(recentFile, ending):
    """Build a ``create table`` statement from the columns of a CSV file.

    The first non-empty row of ``recentFile`` supplies the column names.
    Every later cell is passed to ``dataType`` until its column has become
    'varchar'; cells equal to 'NA' are skipped. Columns typed 'varchar', or
    left without any inferred type, are emitted as ``varchar(255)``.

    Parameters
    ----------
    recentFile : str
        Path of the CSV file to inspect.
    ending : str
        Name used for the table in the generated statement.

    Returns
    -------
    str
        The statement, terminated with ``);``. The unterminated statement is
        also printed to stdout.
    """
    headers, type_list = [], []

    # newline='' is what the csv module expects for files it reads; the
    # context manager closes the handle even if parsing fails.
    with open(recentFile, 'r', newline='') as f:
        for row in csv.reader(f):
            if not headers:
                # First non-empty row names the columns; types start unknown.
                headers = row
                type_list = [''] * len(row)
                continue
            # Blank lines yield an empty row; cells beyond the header width
            # have no column to describe.
            for i, cell in enumerate(row[:len(headers)]):
                # NA is the csv null value
                if type_list[i] == 'varchar' or cell == 'NA':
                    continue
                type_list[i] = dataType(cell, type_list[i])

    header = 'create table {} ( \n'.format(ending)

    # Assemble by concatenation so braces in a column name are never
    # re-interpreted as format fields.
    column_defs = []
    for name, sql_type in zip(headers, type_list):
        if sql_type in ('', 'varchar'):
            column_defs.append(name.upper() + ' varchar(255)')
        else:
            column_defs.append(name.upper() + ' ' + sql_type)

    statement = header + ''.join(col + ', \n' for col in column_defs)
    print(statement)
    if not column_defs:
        return header + ');'
    # Drop the trailing ', \n' left after the last column.
    return statement[:-3] + ');'
    

# In[15]:
#Downloads and standardizes FDIC data
# Both paths are passed as empty strings here; standardizeData() is then
# called on the resulting instance.
fdic = FDIC(downloadPath='',targetPath='')
fdic.standardizeData()