In [40]:
import pandas as pd
import glob, os
import re
import fnmatch

## This will prepare the data for the Redshift database. 
    1) Goes to the directory where all the files are stored
        a. Searches through all the files that contain fs220
        b. Append the endings of these files to a list [fs220A, fs220B, fs220C, etc...]
        c. Get the unique endings from that list by taking the set()
        d. Now we have all the unique endings (we will make a separate table in our Redshift Database for each)
        
    2) For each ending, we find the most recent year and month of the reported data 
       This is needed to get the latest_vars variable
        a. First it finds most recent year reported for that ending (for loop with > comparisons)
        b. Then it finds most recent month reported in that year for that ending (for loop with > comparisons)
        c. It appends each year and month combination for each ending to lists
          (*NOTE: this is necessary as there is not a universal latest year and month for each ending. 
          For example, fs220E does not report in 201706, but instead, its last reporting is in 201606)
          
    3) Using the zip() function, we can iterate over three lists at a time (ending list, latest year list, latest month list) 
         a. We get the latest vars for that ending
         b. We then run through the main loop with that ending and complete the steps as before:
            i.    Reads in all fs220(ending) files 
            ii.   Makes all the column names uppercase
            iii.  Compares columns names to columns names of the latest variables using list comprehension
            iv.   If df is missing latest columns, includes them and fills them with a placeholder value (the code 
                  currently passes fill_value=-1, not NaN); this is to maintain column order for the COPY into Redshift
            v.    Adds an ORDERS column which is a sequential index
            vi.   Adds a unique identifier with CU_NUM_DATE 
            vii.  Adds two columns, QUARTER, and YEAR, in case we may use these to query later on
            viii. Final structure is all df's(all years, months, and endings) with all the latest columns in a dictionary 
                  The df's are all "standardized"
                  If a latest column doesn't exist in the original df, it is added and filled with a placeholder 
                  (the code currently uses fill_value=-1 rather than NaN). 
                  (*NOTE: we may need to change what it is filled in with here since NaN is a float and we really want   
                  NULL/None for Redshift Database to read correctly)
    
    4) Write all the dictionary files to a new folder to load into AWS S3 bucket 
       Add naming convention of same prefix for each file.
       Example: fs220A-csv. is prefix for all fs220A files (all years and months)

### Four Functions Below - Used to prepare files and write to folder to upload to S3 Bucket 

In [41]:
def setPath(path):
    """Switch the process working directory to `path` and return `path` unchanged.

    Every later relative glob / read in the notebook resolves against this
    directory, so it must be called before the files are searched.
    """
    target_dir = path
    os.chdir(target_dir)
    return target_dir
    
def getYearMonth(unique_endings):
    """Find the most recent reporting year and month for every file ending.

    Files are looked up in the current working directory and are expected to
    be named ``YYYYMM_<ending>.csv`` (the year is read from characters 0-3 and
    the month from characters 4-5 of the file name).

    Parameters
    ----------
    unique_endings : iterable of str
        File endings such as ``'fs220A'``; iterated exactly once.

    Returns
    -------
    (last_month_list, last_year_list) : tuple of two lists of str
        Both lists hold exactly one entry per ending, in iteration order, so
        they can be zipped with ``unique_endings``. Months are zero-padded to
        two characters (``'03'``), years are four-digit strings.
        If no file matches an ending, the fallback values are year ``'2000'``
        and month ``'03'``.
    """
    last_year_list = []
    last_month_list = []
    for ending in unique_endings:
        # Latest year for this ending; 2000 is kept as the lower bound / fallback.
        years_seen = [int(name[0:4]) for name in glob.glob("*" + ending + '.csv')]
        last_year_reported = str(max(years_seen + [2000]))
        last_year_list.append(last_year_reported)

        # Latest month within that year. Taking the max over all matches makes
        # the result independent of the order glob returns the files in, and
        # a latest month of 03 is now reported instead of being skipped.
        months_seen = [
            int(name[4:6])
            for name in glob.glob(last_year_reported + "*" + ending + '.csv')
        ]
        last_month_int = max(months_seen) if months_seen else 3
        # Append once per ending (not once per comparison) so that the month
        # list stays aligned with the year list.
        last_month_list.append(str(last_month_int).rjust(2, "0"))
    return last_month_list, last_year_list

def write(dict_, output_dir='C:/Users/Owner/Documents/Practicum Projects/Arkatechture/NCUS_csv/fs_files_for_database_load2'):
    """Write every DataFrame in `dict_` to a CSV file inside `output_dir`.

    Parameters
    ----------
    dict_ : dict
        Maps a name such as ``'200903_fs220B'`` to the DataFrame to write.
    output_dir : str, optional
        Folder that receives the files. Defaults to the folder the notebook
        originally hard-coded, so existing ``write(dict_)`` calls behave as before.

    Each file is named ``<ending>-csv.<key>.csv`` where ``<ending>`` is the part
    of the key after its last underscore (``fs220B-csv.200903_fs220B.csv``), so
    all files of one ending share a common prefix. The index is not written.
    """
    for key, frame in dict_.items():
        ending = key.split("_")[-1]
        file_name = '{}.csv'.format(ending + '-csv.' + key)
        frame.to_csv(os.path.join(output_dir, file_name), index=False)


In [42]:
def _standardize(frame, latest_vars, filename, startvalue):
    """Return `frame` in the common layout: upper-case columns, any missing latest columns added, four key columns in front."""
    frame.columns = [col.upper() for col in frame.columns]
    present = frame.columns.tolist()
    present_lookup = set(present)
    to_add = [col for col in latest_vars if col not in present_lookup]
    # One reindex keeps the existing column order and appends the missing
    # columns filled with -1. (The former follow-up reindex_axis call repeated
    # the same reordering and no longer exists in pandas >= 1.0.)
    frame = frame.reindex(columns=present + to_add, fill_value=-1)
    # Running row number that continues across files.
    frame.insert(0, 'ORDERS', range(startvalue, startvalue + len(frame)))
    # Identifier of the form "<CU_NUMBER>_<YYYY-MM-DD>".
    frame.insert(1, 'CU_NUM_DATE', frame['CU_NUMBER'].astype(str) + '_' + pd.to_datetime(frame['CYCLE_DATE']).dt.date.astype(str))
    # NOTE: QUARTER holds characters 4-5 of the file name, i.e. the two-digit
    # month part (03, 06, ...), and YEAR holds characters 0-3.
    frame.insert(2, 'QUARTER', filename[4:6])
    frame.insert(3, 'YEAR', filename[:4])
    return frame


def main(ending_filter='fs220B'):
    """Standardize the fs220* CSV files and write them out for loading.

    Steps:
      1. Change into the source-data folder and collect the unique file
         endings (the part between the last underscore and the extension).
      2. Ask getYearMonth for the latest year/month of each ending; the
         column list of that latest file defines the target layout.
      3. For every file of a selected ending, upper-case the columns, add any
         missing latest columns (filled with -1) and insert the ORDERS,
         CU_NUM_DATE, QUARTER and YEAR columns.
      4. Hand the frames of each ending to write() once that ending is done.

    Parameters
    ----------
    ending_filter : str or None, optional
        Only endings containing this substring are processed. The default
        ``'fs220B'`` keeps the behaviour of the previously hard-coded check;
        pass ``None`` to process every ending.
    """
    # Setting the directory
    path = setPath("C:/Users/Owner/Documents/Practicum Projects/Arkatechture/NCUA_csv_2009_2017/")
    startvalue = 1

    # Searching for fs220 files
    unique_endings = {file.split("_")[-1].split(".")[0] for file in glob.glob("*_fs220*")}
    last_month_list, last_year_list = getYearMonth(unique_endings)

    # The zip relies on getYearMonth returning one year and one month per
    # ending, in the iteration order of unique_endings.
    for ending, year, month in zip(unique_endings, last_year_list, last_month_list):
        # Check the filter once per ending, before any file is read.
        if ending_filter is not None and ending_filter not in ending:
            continue

        latest_csv = pd.read_csv(path + year + month + '_' + ending + '.csv', encoding='latin1')
        latest_vars = [col.upper() for col in latest_csv.columns]

        # key: file name without ".csv", value: standardized DataFrame
        ending_frames = {}
        for file in glob.glob("*_" + ending + ".csv"):  # e.g. 200903_fs220.csv
            filename = file[:-4]
            frame = pd.read_csv(file, encoding='latin1')
            frame = _standardize(frame, latest_vars, filename, startvalue)
            startvalue += len(frame)
            ending_frames[filename] = frame

        # Write this ending's frames exactly once; frames of earlier endings
        # are not kept around and rewritten on every pass.
        write(ending_frames)


In [43]:
main()

KeyboardInterrupt: 

## Connect to Redshift via Python

In [None]:
#!pip --upgrade pip

Install the following

In [None]:
#!pip install ipython-sql

Set up connection credentials

In [2]:
import sqlalchemy
import psycopg2

In [4]:
from sqlalchemy import create_engine
import pandas as pd
# NOTE(review): the create_engine call below is commented out, so this cell never
# assigns `engine`; the read_sql_query line only works if `engine` already exists
# in the kernel (no visible cell defines it) — TODO confirm / restore the assignment.
# NOTE(review): the commented-out URL contains a user name and password in plain
# text; prefer reading credentials from the environment before sharing the notebook.
#engine = create_engine('postgresql://arkauser:Password1@arkatestcluster.cpjywwj3yist.us-west-2.redshift.amazonaws.com:5439/awesomedatabase')
# Pull the ACCT_010 column of table fs220 into a DataFrame.
data_frame = pd.read_sql_query('SELECT ACCT_010 from fs220;', engine)

In [10]:
from pandas.io import sql
sql.execute('''create table fs220B ( 
ORDERS varchar(255), 
CU_NUM_DATE varchar(255), 
QUARTER varchar(255), 
YEAR varchar(255), 
CU_NUMBER varchar(255), 
CYCLE_DATE varchar(255), 
JOIN_NUMBER varchar(255), 
ACCT_606A decimal, 
ACCT_606B decimal, 
ACCT_607 decimal, 
ACCT_608 decimal, 
ACCT_609 decimal, 
ACCT_620 decimal, 
ACCT_621 decimal, 
ACCT_622 decimal, 
ACCT_623 decimal, 
ACCT_624 decimal, 
ACCT_625 decimal, 
ACCT_626 decimal, 
ACCT_627 decimal, 
ACCT_628 decimal, 
ACCT_630A decimal, 
ACCT_630B1 decimal, 
ACCT_630B2 decimal, 
ACCT_640 decimal, 
ACCT_652C decimal, 
ACCT_653A decimal, 
ACCT_653B1 decimal, 
ACCT_653B2 decimal, 
ACCT_653C decimal, 
ACCT_655C decimal, 
ACCT_656A decimal, 
ACCT_656B1 decimal, 
ACCT_656B2 decimal, 
ACCT_657A decimal, 
ACCT_657B1 decimal, 
ACCT_657B2 decimal, 
ACCT_659 decimal, 
ACCT_660 decimal, 
ACCT_661 decimal, 
ACCT_670A decimal, 
ACCT_670B decimal, 
ACCT_672C decimal, 
ACCT_698 decimal, 
ACCT_709 decimal, 
ACCT_725 decimal, 
ACCT_726 decimal, 
ACCT_731 decimal, 
ACCT_732 decimal, 
ACCT_733 decimal, 
ACCT_734 decimal, 
ACCT_735 decimal, 
ACCT_736 decimal, 
ACCT_737 decimal, 
ACCT_741C decimal, 
ACCT_742C decimal, 
ACCT_746C decimal, 
ACCT_747A decimal, 
ACCT_747B1 decimal, 
ACCT_747B2 decimal, 
ACCT_747C decimal, 
ACCT_750C decimal, 
ACCT_752 decimal, 
ACCT_753 decimal, 
ACCT_754 decimal, 
ACCT_756 decimal, 
ACCT_757 decimal, 
ACCT_758 decimal, 
ACCT_760A decimal, 
ACCT_760B1 decimal, 
ACCT_760B2 decimal, 
ACCT_761A decimal, 
ACCT_761B1 decimal, 
ACCT_761B2 decimal, 
ACCT_770 decimal, 
ACCT_772 decimal, 
ACCT_773 decimal, 
ACCT_774 decimal, 
ACCT_776 decimal, 
ACCT_777 decimal, 
ACCT_778 decimal, 
ACCT_780 decimal, 
ACCT_781 decimal, 
ACCT_795 decimal, 
ACCT_799 decimal, 
ACCT_811 decimal, 
ACCT_812 decimal, 
ACCT_813 decimal, 
ACCT_814 decimal, 
ACCT_815 decimal, 
ACCT_816 decimal, 
ACCT_817 decimal, 
ACCT_818 decimal, 
ACCT_820 decimal, 
ACCT_821 decimal, 
ACCT_850 decimal, 
ACCT_860A decimal, 
ACCT_860B1 decimal, 
ACCT_860B2 decimal, 
ACCT_865 decimal, 
ACCT_880A decimal, 
ACCT_880B1 decimal, 
ACCT_880B2 decimal, 
ACCT_900 decimal, 
ACCT_902A decimal, 
ACCT_902B1 decimal, 
ACCT_902B2 decimal, 
ACCT_906B1 decimal, 
ACCT_906B2 decimal, 
ACCT_906C decimal, 
ACCT_908B1 decimal, 
ACCT_908B2 decimal, 
ACCT_911A decimal, 
ACCT_911B1 decimal, 
ACCT_911B2 decimal, 
ACCT_913 decimal, 
ACCT_914 decimal, 
ACCT_955 decimal, 
ACCT_956 decimal, 
ACCT_957 decimal, 
ACCT_958 decimal, 
ACCT_961 decimal, 
ACCT_962 decimal, 
ACCT_963 decimal, 
ACCT_964 decimal, 
ACCT_965 decimal, 
ACCT_966 decimal, 
ACCT_970 decimal, 
ACCT_971 decimal, 
ACCT_972 decimal, 
ACCT_974 decimal, 
ACCT_977 decimal, 
ACCT_978 decimal, 
ACCT_982 decimal, 
ACCT_983 decimal, 
ACCT_984 decimal, 
ACCT_985 decimal, 
ACCT_986 decimal, 
ACCT_987 decimal, 
ACCT_988 decimal, 
ACCT_990 decimal, 
ACCT_992 decimal, 
ACCT_993 decimal, 
ACCT_994 decimal, 
ACCT_995 decimal, 
ACCT_603 decimal, 
ACCT_743C decimal, 
ACCT_744A decimal, 
ACCT_744B1 decimal, 
ACCT_744B2 decimal, 
ACCT_744C decimal, 
ACCT_085 decimal, 
ACCT_086 decimal, 
ACCT_087 decimal, 
ACCT_782 decimal, 
ACCT_783 decimal, 
ACCT_784 decimal, 
ACCT_796A decimal, 
ACCT_796B decimal, 
ACCT_796C decimal, 
ACCT_796D decimal, 
ACCT_796E decimal, 
ACCT_797A decimal, 
ACCT_797B decimal, 
ACCT_797C decimal, 
ACCT_797D decimal, 
ACCT_797E decimal, 
ACCT_799B decimal, 
ACCT_799C decimal, 
ACCT_799D decimal, 
ACCT_840 decimal, 
ACCT_875 decimal, 
ACCT_877 decimal, 
ACCT_945 decimal, 
ACCT_801 decimal, 
ACCT_614 decimal, 
ACCT_615 decimal, 
ACCT_745A decimal, 
ACCT_745B decimal, 
ACCT_745C decimal, 
ACCT_745D decimal, 
ACCT_745E decimal, 
ACCT_819 decimal, 
ACCT_867A decimal, 
ACCT_867B1 decimal, 
ACCT_867B2 decimal, 
ACCT_867C decimal, 
ACCT_925 decimal, 
ACCT_841 decimal, 
ACCT_465 decimal, 
ACCT_466 decimal, 
ACCT_767 decimal, 
ACCT_768 decimal, 
ACCT_918 decimal, 
ACCT_919 decimal, 
ACCT_026A decimal, 
ACCT_026B decimal, 
ACCT_027A decimal, 
ACCT_027B decimal, 
ACCT_028A decimal, 
ACCT_028B decimal, 
ACCT_045A decimal, 
ACCT_045B decimal, 
ACCT_585 decimal, 
ACCT_680 decimal, 
ACCT_681 decimal, 
ACCT_682 decimal, 
ACCT_769 decimal, 
ACCT_785 decimal, 
ACCT_786 decimal, 
ACCT_787 decimal, 
ACCT_892 decimal, 
ACCT_895 decimal, 
ACCT_896 decimal, 
ACCT_899 decimal, 
ACCT_020I decimal, 
ACCT_021I decimal, 
ACCT_022I decimal, 
ACCT_023I decimal, 
ACCT_041I decimal, 
ACCT_550I decimal, 
ACCT_551I decimal, 
ACCT_981 decimal, 
ACCT_981A decimal, 
ACCT_981B decimal, 
ACCT_065A1 decimal, 
ACCT_065A2 decimal, 
ACCT_065A3 decimal, 
ACCT_065A4 decimal, 
ACCT_065B1 decimal, 
ACCT_065C1 decimal, 
ACCT_065D1 decimal, 
ACCT_065E1 decimal, 
ACCT_067A1 decimal, 
ACCT_067A2 decimal, 
ACCT_067B1 decimal, 
ACCT_067C1 decimal, 
ACCT_068A decimal, 
ACCT_069A decimal, 
ACCT_033A decimal, 
ACCT_033B decimal, 
ACCT_033C decimal, 
ACCT_033D decimal, 
ACCT_033E decimal, 
ACCT_034A decimal, 
ACCT_034B decimal, 
ACCT_034C decimal, 
ACCT_034D decimal, 
ACCT_034E decimal, 
ACCT_035A decimal, 
ACCT_035B decimal, 
ACCT_035C decimal, 
ACCT_035D decimal, 
ACCT_035E decimal);''', engine)

<sqlalchemy.engine.result.ResultProxy at 0x252b80fbd68>

In [34]:
from pandas.io import sql
sql.execute("copy fs220B from 's3://arkatestbucket/load/fs220B-csv.' credentials 'aws_iam_role=arn:aws:iam::886264772629:role/Arkatest' csv IGNOREHEADER 1 null as '\\000';", engine)

InternalError: (psycopg2.InternalError) Load into table 'fs220b' failed.  Check 'stl_load_errors' system table for details.
 [SQL: "copy fs220B from 's3://arkatestbucket/load/fs220B-csv.' credentials 'aws_iam_role=arn:aws:iam::886264772629:role/Arkatest' csv IGNOREHEADER 1 null as '\\000';"]

In [20]:
test=sql.execute('''select d.query, substring(d.filename,14,20), 
d.line_number as line, 
substring(d.value,1,16) as value,
substring(le.err_reason,1,48) as err_reason
from stl_loaderror_detail d, stl_load_errors le
where d.query = le.query
and d.query = pg_last_copy_id(); ''', engine)

In [35]:
sql.execute('''SELECT err.userid,
       err.process,
       err.recordtime,
       err.pid,
       err.errcode,
       err.file,
       err.linenum,
       err.context,
       err.error
FROM stl_error err,
     stv_recents rec
WHERE rec.pid=err.pid
  AND rec.status='running'
  AND rec.query LIKE 'COPY%';''', engine)

ERROR:root:An unexpected error occurred while tokenizing input
The following traceback may be corrupted or invalid
The error message is: ('EOF in multi-line string', (1, 29))



TypeError: 'dict' object does not support indexing

In [None]:
SELECT err.userid,
       err.process,
       err.recordtime,
       err.pid,
       err.errcode,
       err.file,
       err.linenum,
       err.context,
       err.error
FROM stl_error err,
     stv_recents rec
WHERE rec.pid=err.pid
  AND rec.status='running'
  AND rec.query LIKE 'COPY%';

In [29]:
type(test)

sqlalchemy.engine.result.ResultProxy

In [33]:
test.first()

In [1]:
import sqlalchemy
import psycopg2
# (Re)load the `sql` IPython extension so the %sql / %%sql magics used below are registered.
%reload_ext sql

# Build the connection URL piece by piece: driver://user:password@host:port/database.
# NOTE(review): the user name and password are hardcoded in plain text here —
# consider reading them from environment variables or a prompt instead.
connect_to_db = 'postgresql+psycopg2://' + \
                'arkauser' + ':' + 'Password1' + '@' + \
                'arkatestcluster.cpjywwj3yist.us-west-2.redshift.amazonaws.com' + ':' + '5439' + '/' + 'awesomedatabase';
# Connect through the %sql line magic using the URL built above.
%sql $connect_to_db
# Sets SqlMagic's displaylimit option to 5 (presumably the number of result rows shown — verify).
%config SqlMagic.displaylimit = 5


In [None]:
#%load_ext sql

In [35]:
%%sql 
show table;

(psycopg2.ProgrammingError) syntax error at or near "table"
LINE 1: show table;
             ^
 [SQL: 'show table;']


In [6]:
%%sql 
drop table fs220B; 

ERROR:root:Cell magic `%%sql` not found.


SQL query

In [23]:
%%sql

create table fs220B ( 
ORDERS varchar(255), 
CU_NUM_DATE varchar(255), 
QUARTER varchar(255), 
YEAR varchar(255), 
CU_NUMBER varchar(255), 
CYCLE_DATE varchar(255), 
JOIN_NUMBER varchar(255), 
ACCT_606A decimal, 
ACCT_606B decimal, 
ACCT_607 decimal, 
ACCT_608 decimal, 
ACCT_609 decimal, 
ACCT_620 decimal, 
ACCT_621 decimal, 
ACCT_622 decimal, 
ACCT_623 decimal, 
ACCT_624 decimal, 
ACCT_625 decimal, 
ACCT_626 decimal, 
ACCT_627 decimal, 
ACCT_628 decimal, 
ACCT_630A decimal, 
ACCT_630B1 decimal, 
ACCT_630B2 decimal, 
ACCT_640 decimal, 
ACCT_652C decimal, 
ACCT_653A decimal, 
ACCT_653B1 decimal, 
ACCT_653B2 decimal, 
ACCT_653C decimal, 
ACCT_655C decimal, 
ACCT_656A decimal, 
ACCT_656B1 decimal, 
ACCT_656B2 decimal, 
ACCT_657A decimal, 
ACCT_657B1 decimal, 
ACCT_657B2 decimal, 
ACCT_659 decimal, 
ACCT_660 decimal, 
ACCT_661 decimal, 
ACCT_670A decimal, 
ACCT_670B decimal, 
ACCT_672C decimal, 
ACCT_698 decimal, 
ACCT_709 decimal, 
ACCT_725 decimal, 
ACCT_726 decimal, 
ACCT_731 decimal, 
ACCT_732 decimal, 
ACCT_733 decimal, 
ACCT_734 decimal, 
ACCT_735 decimal, 
ACCT_736 decimal, 
ACCT_737 decimal, 
ACCT_741C decimal, 
ACCT_742C decimal, 
ACCT_746C decimal, 
ACCT_747A decimal, 
ACCT_747B1 decimal, 
ACCT_747B2 decimal, 
ACCT_747C decimal, 
ACCT_750C decimal, 
ACCT_752 decimal, 
ACCT_753 decimal, 
ACCT_754 decimal, 
ACCT_756 decimal, 
ACCT_757 decimal, 
ACCT_758 decimal, 
ACCT_760A decimal, 
ACCT_760B1 decimal, 
ACCT_760B2 decimal, 
ACCT_761A decimal, 
ACCT_761B1 decimal, 
ACCT_761B2 decimal, 
ACCT_770 decimal, 
ACCT_772 decimal, 
ACCT_773 decimal, 
ACCT_774 decimal, 
ACCT_776 decimal, 
ACCT_777 decimal, 
ACCT_778 decimal, 
ACCT_780 decimal, 
ACCT_781 decimal, 
ACCT_795 decimal, 
ACCT_799 decimal, 
ACCT_811 decimal, 
ACCT_812 decimal, 
ACCT_813 decimal, 
ACCT_814 decimal, 
ACCT_815 decimal, 
ACCT_816 decimal, 
ACCT_817 decimal, 
ACCT_818 decimal, 
ACCT_820 decimal, 
ACCT_821 decimal, 
ACCT_850 decimal, 
ACCT_860A decimal, 
ACCT_860B1 decimal, 
ACCT_860B2 decimal, 
ACCT_865 decimal, 
ACCT_880A decimal, 
ACCT_880B1 decimal, 
ACCT_880B2 decimal, 
ACCT_900 decimal, 
ACCT_902A decimal, 
ACCT_902B1 decimal, 
ACCT_902B2 decimal, 
ACCT_906B1 decimal, 
ACCT_906B2 decimal, 
ACCT_906C decimal, 
ACCT_908B1 decimal, 
ACCT_908B2 decimal, 
ACCT_911A decimal, 
ACCT_911B1 decimal, 
ACCT_911B2 decimal, 
ACCT_913 decimal, 
ACCT_914 decimal, 
ACCT_955 decimal, 
ACCT_956 decimal, 
ACCT_957 decimal, 
ACCT_958 decimal, 
ACCT_961 decimal, 
ACCT_962 decimal, 
ACCT_963 decimal, 
ACCT_964 decimal, 
ACCT_965 decimal, 
ACCT_966 decimal, 
ACCT_970 decimal, 
ACCT_971 decimal, 
ACCT_972 decimal, 
ACCT_974 decimal, 
ACCT_977 decimal, 
ACCT_978 decimal, 
ACCT_982 decimal, 
ACCT_983 decimal, 
ACCT_984 decimal, 
ACCT_985 decimal, 
ACCT_986 decimal, 
ACCT_987 decimal, 
ACCT_988 decimal, 
ACCT_990 decimal, 
ACCT_992 decimal, 
ACCT_993 decimal, 
ACCT_994 decimal, 
ACCT_995 decimal, 
ACCT_603 decimal, 
ACCT_743C decimal, 
ACCT_744A decimal, 
ACCT_744B1 decimal, 
ACCT_744B2 decimal, 
ACCT_744C decimal, 
ACCT_085 decimal, 
ACCT_086 decimal, 
ACCT_087 decimal, 
ACCT_782 decimal, 
ACCT_783 decimal, 
ACCT_784 decimal, 
ACCT_796A decimal, 
ACCT_796B decimal, 
ACCT_796C decimal, 
ACCT_796D decimal, 
ACCT_796E decimal, 
ACCT_797A decimal, 
ACCT_797B decimal, 
ACCT_797C decimal, 
ACCT_797D decimal, 
ACCT_797E decimal, 
ACCT_799B decimal, 
ACCT_799C decimal, 
ACCT_799D decimal, 
ACCT_840 decimal, 
ACCT_875 decimal, 
ACCT_877 decimal, 
ACCT_945 decimal, 
ACCT_801 decimal, 
ACCT_614 decimal, 
ACCT_615 decimal, 
ACCT_745A decimal, 
ACCT_745B decimal, 
ACCT_745C decimal, 
ACCT_745D decimal, 
ACCT_745E decimal, 
ACCT_819 decimal, 
ACCT_867A decimal, 
ACCT_867B1 decimal, 
ACCT_867B2 decimal, 
ACCT_867C decimal, 
ACCT_925 decimal, 
ACCT_841 decimal, 
ACCT_465 decimal, 
ACCT_466 decimal, 
ACCT_767 decimal, 
ACCT_768 decimal, 
ACCT_918 decimal, 
ACCT_919 decimal, 
ACCT_026A decimal, 
ACCT_026B decimal, 
ACCT_027A decimal, 
ACCT_027B decimal, 
ACCT_028A decimal, 
ACCT_028B decimal, 
ACCT_045A decimal, 
ACCT_045B decimal, 
ACCT_585 decimal, 
ACCT_680 decimal, 
ACCT_681 decimal, 
ACCT_682 decimal, 
ACCT_769 decimal, 
ACCT_785 decimal, 
ACCT_786 decimal, 
ACCT_787 decimal, 
ACCT_892 decimal, 
ACCT_895 decimal, 
ACCT_896 decimal, 
ACCT_899 decimal, 
ACCT_020I decimal, 
ACCT_021I decimal, 
ACCT_022I decimal, 
ACCT_023I decimal, 
ACCT_041I decimal, 
ACCT_550I decimal, 
ACCT_551I decimal, 
ACCT_981 decimal, 
ACCT_981A decimal, 
ACCT_981B decimal, 
ACCT_065A1 decimal, 
ACCT_065A2 decimal, 
ACCT_065A3 decimal, 
ACCT_065A4 decimal, 
ACCT_065B1 decimal, 
ACCT_065C1 decimal, 
ACCT_065D1 decimal, 
ACCT_065E1 decimal, 
ACCT_067A1 decimal, 
ACCT_067A2 decimal, 
ACCT_067B1 decimal, 
ACCT_067C1 decimal, 
ACCT_068A decimal, 
ACCT_069A decimal, 
ACCT_033A decimal, 
ACCT_033B decimal, 
ACCT_033C decimal, 
ACCT_033D decimal, 
ACCT_033E decimal, 
ACCT_034A decimal, 
ACCT_034B decimal, 
ACCT_034C decimal, 
ACCT_034D decimal, 
ACCT_034E decimal, 
ACCT_035A decimal, 
ACCT_035B decimal, 
ACCT_035C decimal, 
ACCT_035D decimal, 
ACCT_035E decimal);

Done.


[]

In [43]:
%%sql

copy fs220B from 's3://arkatestbucket/load/fs220B-csv.' credentials 'aws_iam_role=arn:aws:iam::886264772629:role/Arkatest' csv IGNOREHEADER 1 null as '\000';



InternalError: (psycopg2.InternalError) Load into table 'fs220b' failed.  Check 'stl_load_errors' system table for details.
 [SQL: "copy fs220B from 's3://arkatestbucket/load/fs220B-csv.' credentials 'aws_iam_role=arn:aws:iam::886264772629:role/Arkatest' csv IGNOREHEADER 1 null as '\\000';"]

In [44]:
%%sql

select d.query, substring(d.filename,14,20), 
d.line_number as line, 
substring(d.value,1,16) as value,
substring(le.err_reason,1,48) as err_reason
from stl_loaderror_detail d, stl_load_errors le
where d.query = le.query
and d.query = pg_last_copy_id(); 

0 rows affected.


query,substring,line,value,err_reason


In [45]:
%%sql

create view loadview as
(select distinct tbl, trim(name) as table_name, query, starttime,
trim(filename) as input, line_number, colname, err_code,
trim(err_reason) as reason
from stl_load_errors sl, stv_tbl_perm sp
where sl.tbl = sp.id);

Done.


[]

In [47]:
%%sql

select table_name, query, line_number, colname, starttime, 
trim(reason) as error
from loadview
where table_name ='event'
order by line_number limit 1;

0 rows affected.


table_name,query,line_number,colname,starttime,error


In [None]:
#arn:aws:iam::886264772629:role/Arkatest

In [22]:
%%sql

select YEAR, QUARTER, CU_NUM_DATE, ACCT_010 from fs220;

233051 rows affected.


year,quarter,cu_num_date,acct_010
2009,6,5_2009-06-30,36356408
2009,6,12_2009-06-30,45288099
2009,6,16_2009-06-30,6974969
2009,6,22_2009-06-30,197614132
2009,6,28_2009-06-30,400954842


In [11]:
# `_` is IPython's "output of the previously executed cell" variable, so this line
# depends on which cell ran immediately before it and is not reproducible on a
# fresh Restart & Run All.
# NOTE(review): presumably `_` is the result set of a preceding %%sql cell, whose
# .DataFrame() method converts it to a pandas DataFrame — confirm and assign the
# query result to a named variable instead.
df = _.DataFrame()
df


Unnamed: 0,year,quarter,cu_num_date,acct_997
0,2009,03,5_2009-03-31,4186254
1,2009,03,12_2009-03-31,4313128
2,2009,03,16_2009-03-31,1206221
3,2009,03,22_2009-03-31,20810214
4,2009,03,28_2009-03-31,34761737
5,2009,03,37_2009-03-31,93513
6,2009,03,42_2009-03-31,5071208
7,2009,03,48_2009-03-31,10734444
8,2009,03,53_2009-03-31,8825219
9,2009,03,62_2009-03-31,3728928


In [7]:
df.head()

Unnamed: 0,year,quarter,cu_num_date,acct_997
0,20090,3,1_2009-03-31,987417
1,20090,3,6_2009-03-31,9495464
2,20090,3,13_2009-03-31,27142323
3,20090,3,19_2009-03-31,2715794
4,20090,3,26_2009-03-31,1775183


In [None]:
# TODO: fix the 2009 thing
# TODO: make the string print out well // pipe it
# TODO: write up functions and documentation for them
# TODO: make one function so we don't have to strip the string in different places every time

In [28]:
# import psycopg2
# import pandas as pd
# import numpy as np

# connect_to_db = 'postgresql+psycopg2://' + \
#                 'arkauser' + ':' + 'Password1' + '@' + \
#                 'arkatestcluster.cpjywwj3yist.us-west-2.redshift.amazonaws.com' + ':' + '5439' + '/' + 'awesomedatabase';

# query ='select CU_NUM_DATE, ACCT_997 from fs220A;'
# conn_string = connect_to_db
# conn = psycopg2.connect(conn_string)
# cursor = conn.cursor()
# cursor.execute(query)
# rows=pd.DataFrame(cursor.fetchall(),columns=['CuNumDate','997'])

# for row in rows:
#    print(row)

# conn.commit();
# conn.close();

NameError: name 'conn' is not defined

In [10]:
def readStatements(filename):
    """Read a text file and return its lines as a list of strings.

    Parameters
    ----------
    filename : str
        Path of the file to read (opened in text mode).

    Returns
    -------
    list of str
        One entry per line, in file order, each keeping its trailing newline.
    """
    # The context manager closes the handle even if reading fails; previously
    # the file was opened and never closed.
    with open(filename, 'r') as handle:
        return handle.readlines()

In [8]:
import os
os.getcwd()

'C:\\Users\\Owner\\Documents\\PRACTICUM PROJECTS\\Arkatechture'

In [11]:
myList = readStatements('copy.txt')

In [17]:
print(myList[2])

copy fs220 from's3://arkatestbucket/load/fs220-csv.' credentialsarn:aws:iam::886264772629:role/Arkatest csv IGNOREHEADER 1 null as '\000';

