# CSV 파일

+ Jupyter에서는 코드만 작성하고, 실행은 VS Code 터미널에서 한다. (각 스크립트가 `sys.argv`로 입력/출력 파일 경로를 받기 때문에 노트북 셀에서 직접 실행하면 오류가 난다.)

## CSV 파일 읽고 쓰기(파트1)

### CSV 모듈을 사용하지 않는 기본 파이썬 코드

In [None]:
#!/usr/bin/env python3
# Copy a comma-separated file line by line without using the csv module.
#
# Usage: <script> <input_file> <output_file>
#
# NOTE: every comma is treated as a field separator, so quoted fields that
# contain commas are not handled here; the csv module is needed for those.
import sys

input_file = sys.argv[1]
output_file = sys.argv[2]

with open(input_file, 'r', newline='') as filereader:
    with open(output_file, 'w', newline='') as filewriter:
        # The first line of the input is the header row.
        header = filereader.readline()
        header = header.strip()
        header_list = header.split(',')
        print(header_list)

        # separator.join(map(target_type, items)) converts every item to
        # target_type and joins them into a single string with the separator
        # placed between consecutive items.
        filewriter.write(','.join(map(str, header_list)) + '\n')

        for row in filereader:
            row = row.strip()
            # Split on commas, exactly like the header. Splitting on
            # whitespace would break values such as "Supplier X" into two
            # fields and corrupt the output.
            row_list = row.split(',')
            print(row_list)
            filewriter.write(','.join(map(str, row_list)) + '\n')
        

### 팬더스

In [None]:
#!/usr/bin/env python3
# Load a CSV file into a DataFrame, display it, and write it back out
# without the DataFrame index column.
#
# Usage: <script> <input_file> <output_file>
import sys
import pandas as pd

input_file, output_file = sys.argv[1], sys.argv[2]

frame = pd.read_csv(input_file)
print(frame)
frame.to_csv(output_file, index=False)

## CSV 파일 읽고 쓰기(파트2)

### 기본 파이썬(csv 모듈 사용)

In [None]:
#!/usr/bin/env python3
# Copy a CSV file row by row using the csv module.
#
# Usage: <script> <input_file> <output_file>
import sys
import csv

input_file = sys.argv[1]
output_file = sys.argv[2]

with open(input_file, 'r', newline='') as csv_in_file, \
        open(output_file, 'w', newline='') as csv_out_file:
    # `delimiter` is the character that separates fields within a row.
    # Comma is the default, so the argument could be omitted here.
    filereader = csv.reader(csv_in_file, delimiter=',')
    filewriter = csv.writer(csv_out_file, delimiter=',')
    # writerows() consumes the reader and writes each parsed row in turn.
    filewriter.writerows(filereader)

## 특정 행을 필터링하기

+ 기본 코드 구조

In [1]:
for row in filereader :
    if 특정 조건 :
        이러이러한 일을 한다.
    else : 
        아니면 다른 일을 한다.

SyntaxError: invalid syntax (<ipython-input-1-dce08934eefa>, line 2)

### 특정 조건을 충족하는 행의 필터링

+ 기본 파이썬

In [None]:
#!/usr/bin/env python3
# Keep only the rows whose supplier is 'Supplier Z' or whose cost is above
# 600.00; the header row is always copied.
#
# Usage: <script> <input_file> <output_file>
import sys
import csv

input_file = sys.argv[1]
output_file = sys.argv[2]

with open(input_file, 'r', newline='') as csv_in_file, \
        open(output_file, 'w', newline='') as csv_out_file:
    filereader = csv.reader(csv_in_file)
    filewriter = csv.writer(csv_out_file)

    # Copy the header row through unchanged.
    filewriter.writerow(next(filereader))

    for row_list in filereader:
        supplier = str(row_list[0]).strip()
        # Cost is text such as "$6,015.00": drop the dollar sign and the
        # thousands separators so it can be parsed as a float.
        cost = str(row_list[3]).strip('$').replace(',', '')
        # Skip rows that satisfy neither condition.
        if supplier != 'Supplier Z' and not (float(cost) > 600.00):
            continue
        filewriter.writerow(row_list)
        print(row_list)
        
        

+ 팬더스

In [None]:
#!/usr/bin/env python3
# Keep only the rows whose supplier name contains 'Z' or whose cost is above
# 600.00, and write them to a new CSV file.
#
# Usage: <script> <input_file> <output_file>
import sys
import pandas as pd

input_file = sys.argv[1]
output_file = sys.argv[2]

data_frame = pd.read_csv(input_file)

# Cost is text such as "$6,015.00". Use the .str accessor for BOTH steps:
# Series.replace(',', '') only replaces cells that are exactly ',', so the
# thousands separator would survive and astype(float) would fail.
data_frame['Cost'] = (
    data_frame['Cost']
    .str.strip('$')
    .str.replace(',', '', regex=False)
    .astype(float)
)

# One consistently spelled name for the filtered frame; the mismatched
# spellings used before raised NameError.
data_frame_value_meets_condition = data_frame.loc[
    (data_frame['Supplier Name'].str.contains('Z'))
    | (data_frame['Cost'] > 600.00),
    :,
]
# index=False keeps the DataFrame index out of the output file.
data_frame_value_meets_condition.to_csv(output_file, index=False)
print(data_frame_value_meets_condition)

### 특정 집합의 값을 포함하는 행의 필터링

+ 기본 파이썬

In [None]:
#!/usr/bin/env python3
# Keep only the rows whose fifth column (index 4) is one of the dates listed
# in important_dates; the header row is always copied.
#
# Usage: <script> <input_file> <output_file>
import sys
import csv

input_file = sys.argv[1]
output_file = sys.argv[2]

important_dates = ['1/20/14', '1/30/14']

with open(input_file, 'r', newline='') as csv_in_file, \
        open(output_file, 'w', newline='') as csv_out_file:
    filereader = csv.reader(csv_in_file)
    filewriter = csv.writer(csv_out_file)

    header = next(filereader)
    filewriter.writerow(header)
    print(header)

    # Lazily select the matching rows and hand them to the writer in one go.
    filewriter.writerows(
        row_list for row_list in filereader
        if row_list[4] in important_dates
    )

+ 팬더스

In [None]:
#!/usr/bin/env python3
# Keep only the rows whose 'Purchase Date' is one of the dates listed in
# important_dates, and write them to a new CSV file.
#
# Usage: <script> <input_file> <output_file>
import sys
import pandas as pd  # must be aliased: the code below refers to `pd`

input_file = sys.argv[1]
output_file = sys.argv[2]

# Same dates as the csv-module version of this example, so both scripts
# select the same rows.
important_dates = ['1/20/14', '1/30/14']

data_frame = pd.read_csv(input_file)
data_frame_value_in_set = data_frame.loc[
    data_frame['Purchase Date'].isin(important_dates), :
]
# index=False keeps the DataFrame index out of the output file.
data_frame_value_in_set.to_csv(output_file, index=False)
print(data_frame_value_in_set)


### 패턴 / 정규 표현식을 활용한 필터링

+ 기본 파이썬

In [None]:
#!/usr/bin/env python3
# Keep only the rows whose second column (index 1) matches a regular
# expression; the header row is always copied.
#
# Usage: <script> <input_file> <output_file>
import sys
import re
import csv

input_file = sys.argv[1]
output_file = sys.argv[2]

# Matches values that begin with "001-" (case-insensitive flag is set).
pattern = re.compile(r'(?P<my_pattern_group>^001-.*)', re.I)

with open(input_file, 'r', newline='') as csv_in_file, \
        open(output_file, 'w', newline='') as csv_out_file:
    filereader = csv.reader(csv_in_file)
    filewriter = csv.writer(csv_out_file)

    header = next(filereader)
    filewriter.writerow(header)
    print(header)

    for row in filereader:
        # Skip rows whose second column does not match the pattern.
        if not pattern.search(row[1]):
            continue
        filewriter.writerow(row)
        print(row)

+ 팬더스

In [None]:
#!/usr/bin/env python3
# Keep only the rows whose 'Invoice Number' starts with "001-" and write
# them to a new CSV file without the index column.
#
# Usage: <script> <input_file> <output_file>
import pandas as pd
import sys

input_file = sys.argv[1]
output_file = sys.argv[2]

data_frame = pd.read_csv(input_file)

# Boolean mask: True for every row whose invoice number has the prefix.
has_prefix = data_frame['Invoice Number'].str.startswith('001-')
data_frame_value_matches_pattern = data_frame.loc[has_prefix, :]

data_frame_value_matches_pattern.to_csv(output_file, index=False)
print(data_frame_value_matches_pattern)

## 특정 열 선택하기

### 열의 인덱스 값을 사용하여 특정 열을 선택하는 방법

+ 기본 파이썬

In [None]:
#!/usr/bin/env python3
# Write only the columns at the positions listed in important_col, for
# every row of the input (the header row included).
#
# Usage: <script> <input_file> <output_file>
import sys
import csv

input_file = sys.argv[1]
output_file = sys.argv[2]

important_col = [0, 3]

with open(input_file, 'r', newline='') as csv_in_file, \
        open(output_file, 'w', newline='') as csv_out_file:
    filereader = csv.reader(csv_in_file)
    filewriter = csv.writer(csv_out_file)
    for row_list in filereader:
        # Pick the wanted positions in the order they are listed.
        row_list_output = [row_list[position] for position in important_col]
        filewriter.writerow(row_list_output)
        print(row_list_output)

+ 팬더스

In [None]:
#!/usr/bin/env python3
# Write only the columns at the positions listed in important_col to a new
# CSV file without the index column.
#
# Usage: <script> <input_file> <output_file>
import pandas as pd
import sys

input_file = sys.argv[1]
output_file = sys.argv[2]

important_col = [0, 3]

data_frame = pd.read_csv(input_file)
# Use the declared list instead of repeating the literal, so the selection
# is controlled from a single place.
data_frame_column_by_index = data_frame.iloc[:, important_col]
data_frame_column_by_index.to_csv(output_file, index=False)
print(data_frame_column_by_index)

### 열의 헤더를 사용하여 특정 열을 선택하는 방법

+ 기본 파이썬

In [1]:
#!/usr/bin/env python3
# Write only the columns whose header name is listed in important_cols.
#
# Usage: <script> <input_file> <output_file>
import csv
import sys

input_file = sys.argv[1]
output_file = sys.argv[2]

important_cols = ['Invoice Number', 'Purchase Date']
my_col_index = []

with open(input_file, 'r', newline='') as csv_in_file:
    with open(output_file, 'w', newline='') as csv_out_file:
        filereader = csv.reader(csv_in_file)
        filewriter = csv.writer(csv_out_file)
        header = next(filereader)

        # Record the position of every wanted column, in file order.
        for index_value, column_name in enumerate(header):
            if column_name in important_cols:
                my_col_index.append(index_value)

        # Build the output header from the positions actually found, so the
        # header always lines up with the data columns even when
        # important_cols is listed in a different order than the file or
        # names a column the file does not have.
        output_header = [header[index_value] for index_value in my_col_index]
        filewriter.writerow(output_header)
        print(output_header)

        for row_list in filereader:
            # Rows shorter than the header simply contribute fewer values.
            row_list_output = [
                row_list[index_value]
                for index_value in my_col_index
                if index_value < len(row_list)
            ]
            filewriter.writerow(row_list_output)
            print(row_list_output)
                    

FileNotFoundError: [Errno 2] No such file or directory: '-f'

+ 팬더스

In [None]:
#!/usr/bin/env python3
# Write only the 'Invoice Number' and 'Purchase Date' columns to a new CSV
# file without the index column.
#
# Usage: <script> <input_file> <output_file>
import pandas as pd
import sys

input_file = sys.argv[1]
output_file = sys.argv[2]

wanted_columns = ['Invoice Number', 'Purchase Date']

data_frame = pd.read_csv(input_file)
# Label-based selection: all rows, only the named columns.
data_frame_column_by_name = data_frame.loc[:, wanted_columns]
data_frame_column_by_name.to_csv(output_file, index=False)
print(data_frame_column_by_name)

## 연속된 행 선택하기

### 기본 파이썬

In [None]:
#!/usr/bin/env python3
# Copy only the rows at zero-based positions 3 through 15 (inclusive) of
# the input file.
#
# Usage: <script> <input_file> <output_file>
import csv
import sys

input_file = sys.argv[1]
output_file = sys.argv[2]

with open(input_file, 'r', newline='') as csv_in_file, \
        open(output_file, 'w', newline='') as csv_out_file:
    filereader = csv.reader(csv_in_file)
    filewriter = csv.writer(csv_out_file)
    # enumerate() supplies the zero-based position of each row.
    for row_counter, row in enumerate(filereader):
        if 3 <= row_counter <= 15:
            filewriter.writerow(row)
            print(row)

### 팬더스

In [None]:
#!/usr/bin/env python3
# Drop fixed leading and trailing rows, promote the first remaining row to
# column names, and write the rest to a new CSV file.
#
# Usage: <script> <input_file> <output_file>
import pandas as pd
import sys

input_file = sys.argv[1]
output_file = sys.argv[2]

# Read without a header so every line, including the unwanted ones, is a
# data row labelled 0, 1, 2, ...
raw_frame = pd.read_csv(input_file, header=None)

# Row labels to discard.
unwanted_rows = [0, 1, 2, 16, 17, 18]
data_frame = raw_frame.drop(unwanted_rows)

# The first remaining row (label 3) holds the column names.
data_frame.columns = data_frame.iloc[0]
# Keep every remaining label except 3, i.e. remove the row just promoted.
remaining_labels = data_frame.index.drop(3)
data_frame = data_frame.reindex(remaining_labels)

data_frame.to_csv(output_file, index=False)
print(data_frame)

## 헤더 추가하기

### 기본 파이썬

In [None]:
#!/usr/bin/env python3
# Copy a header-less CSV file, writing a fixed header row first.
#
# Usage: <script> <input_file> <output_file>
import csv
import sys

input_file = sys.argv[1]
output_file = sys.argv[2]

with open(input_file, 'r', newline='') as csv_in_file:
    with open(output_file, 'w', newline='') as csv_out_file:
        # csv.reader / csv.writer are attributes of the csv module; the
        # underscore spellings csv_reader / csv_writer do not exist and
        # raised NameError.
        filereader = csv.reader(csv_in_file)
        filewriter = csv.writer(csv_out_file)
        header_list = ['Supplier Name', 'Invoice Number', 'Part Number', 'Cost', 'Purchase Date']
        filewriter.writerow(header_list)
        print(header_list)
        for row in filereader:
            filewriter.writerow(row)
            print(row)

### 팬더스

In [None]:
#!/usr/bin/env python3
# Read a header-less CSV file, attach fixed column names, and write the
# result to a new CSV file.
#
# Usage: <script> <input_file> <output_file>
import pandas as pd
import sys

input_file = sys.argv[1]
output_file = sys.argv[2]

header_list = ['Supplier Name', 'Invoice Number', 'Part Number', 'Cost', 'Purchase Date']

# header=None: the file has no header line; names= supplies the columns.
data_frame = pd.read_csv(input_file, header=None, names=header_list)
# index=False keeps the DataFrame index out of the file, so the output has
# exactly the five named columns, like the csv-module version.
data_frame.to_csv(output_file, index=False)
print(data_frame)

## 여러 개의 CSV 파일 읽기

### 전체 파일 개수 및 각 파일의 행 및 열 개수 계산

In [None]:
#!/usr/bin/env python3
# For every file in a folder whose name starts with "sales_", report its
# number of rows (header included) and columns, then the number of files.
#
# Usage: <script> <input_path>
import sys
import glob
import csv
import os

input_path = sys.argv[1]

matching_files = glob.glob(os.path.join(input_path, 'sales_*'))

file_counter = 0

for file_counter, input_file in enumerate(matching_files, start=1):
    with open(input_file, 'r', newline='') as csv_in_file:
        filereader = csv.reader(csv_in_file)
        header = next(filereader)
        col_count = len(header)
        # The header already counts as one row; add the remaining rows.
        row_count = 1 + sum(1 for _ in filereader)
        print("{0!s} : \t{1:d} rows \t{2:d} columns \n".format(os.path.basename(input_file), row_count, col_count))

print("file_count : {}".format(file_counter))

## 여러 파일의 데이터 합치기

### 기본 파이썬

In [None]:
#!/usr/bin/env python3
# Concatenate every "sales_*" file in a folder into one CSV file, keeping
# the header row of the first file only.
#
# Usage: <script> <input_path> <output_file>
import sys
import csv
import os
import glob

input_path = sys.argv[1]
output_file = sys.argv[2]

# Collect the inputs before the output is created. glob returns names in
# arbitrary order, so sort them to make the result (and the file that
# supplies the header) deterministic. The output itself is excluded in case
# it lives in the same folder and matches the pattern.
output_abspath = os.path.abspath(output_file)
input_files = sorted(
    path
    for path in glob.glob(os.path.join(input_path, 'sales_*'))
    if os.path.abspath(path) != output_abspath
)

first_file = True
# Open the output once, in write mode: append mode made every re-run add a
# second copy of the data, header included, to the existing file.
with open(output_file, 'w', newline='') as csv_out_file:
    filewriter = csv.writer(csv_out_file)
    for input_file in input_files:
        with open(input_file, 'r', newline='') as csv_in_file:
            filereader = csv.reader(csv_in_file)
            if not first_file:
                # A header has already been written; skip this file's own.
                header = next(filereader, None)
            for row in filereader:
                filewriter.writerow(row)
                print(row)
                # Only flips once something (the header) has been written.
                first_file = False

### 팬더스

In [None]:
#!/usr/bin/env python3
# Concatenate every "sales_*" file in a folder into one DataFrame and write
# it to a single CSV file.
#
# Usage: <script> <input_path> <output_file>
import sys
import pandas as pd
import glob
import os

input_path = sys.argv[1]
output_file = sys.argv[2]

all_df = []

for input_file in glob.glob(os.path.join(input_path, 'sales_*')):
    data_frame = pd.read_csv(input_file)
    all_df.append(data_frame)

# axis=0 stacks the frames vertically; ignore_index renumbers the rows.
# NOTE: pd.concat raises ValueError when no input file matched.
data_frame_concat = pd.concat(all_df, axis=0, ignore_index=True)
# The combined frame was previously only printed; write it to the output
# file the script is given.
data_frame_concat.to_csv(output_file, index=False)
print(data_frame_concat)

## 파일에서 데이터 값의 합계 및 평균 계산하기

### 기본 파이썬

In [None]:
#!/usr/bin/env python3
# For every "sales_*" file in a folder, compute the total and the mean of
# the fourth column (index 3) and write one summary row per file.
#
# Usage: <script> <input_path> <output_file>
import sys
import csv
import glob
import os

input_path = sys.argv[1]
output_file = sys.argv[2]

output_header_list = ['file_name', 'total', 'mean']

# `with` guarantees the output file is flushed and closed even when one of
# the input files cannot be read or parsed.
with open(output_file, 'w', newline='') as csv_out_file:
    filewriter = csv.writer(csv_out_file)
    filewriter.writerow(output_header_list)

    for input_file in glob.glob(os.path.join(input_path, 'sales_*')):
        with open(input_file, 'r', newline='') as csv_in_file:
            filereader = csv.reader(csv_in_file)
            output_list = [os.path.basename(input_file)]
            # Skip the header row; the default avoids StopIteration on a
            # completely empty file.
            header = next(filereader, None)
            total_sales = 0.0
            number_of_sales = 0.0
            for row in filereader:
                sale_amount = row[3]
                # Amounts look like "$1,234.00": strip the dollar sign and
                # the thousands separators before converting.
                total_sales += float(str(sale_amount).strip('$').replace(',', ''))
                number_of_sales += 1.0
            # A file without data rows reports a mean of 0.0 instead of
            # raising ZeroDivisionError.
            avg_sales = total_sales / number_of_sales if number_of_sales else 0.0
            output_list.append(total_sales)
            output_list.append(avg_sales)
            filewriter.writerow(output_list)
            print(output_list)

### 팬더스

In [None]:
#!/usr/bin/env python3
import pandas as pd
import sys
import glob
import os

input_path = sys.argv[1]
output_file = sys.argv[2]

all_files = glob.glob(os)
all_df = []

for input_file in glob.glob(os.path.join(input_path,'sales_*')):
    df = pd.read_csv(input_file, header = None, index_col = None)
    sales_amount = 