## {{cookiecutter.project_name}}

{{cookiecutter.description}}

### Data Sources
- file1 : Description of where this file came from

### Changes
- {% now 'utc', '%m-%d-%Y' %} : Started project

In [None]:
import glob
from datetime import datetime
from pathlib import Path

import numpy as np
import pandas as pd

In [None]:
# Widen pandas' display limits so large frames print without truncation
pd.options.display.max_rows = 500
pd.options.display.max_columns = 500
pd.options.display.width = 1000
# Render floats with a thousands separator and 2 decimal places
pd.set_option('display.float_format', '{:,.2f}'.format)

### (1) Set file Locations

In [None]:
# Current local date.
# `datetime` is the class brought in by `from datetime import datetime`, so
# the date must be derived from a datetime instance; `datetime.date.today()`
# raises AttributeError with that import style.
today = datetime.now().date()

# Paths of the raw input file and of the cleaned output file (fill these in)
in_file = ''
out_file = ''

### (2) Read files

#### (i) Read a CSV file

In [None]:
# Load the CSV located at `in_file` into the working dataframe
df = pd.read_csv(filepath_or_buffer=in_file)

#### (ii) Read an Excel file

In [None]:
# Names of the two worksheets to load from the Excel workbook
sheet1 = ''
sheet2 = ''

# Open the workbook once and parse both sheets from the same handle.
# header=None: no row of the sheet is used for the column labels.
with pd.ExcelFile(in_file) as reader:
    df1, df2 = (
        pd.read_excel(reader, sheet_name=name, header=None)
        for name in (sheet1, sheet2)
    )

### (3) Column Cleanup

- Get names & index of each column
- Drop/keep certain columns
- Add/insert new columns
- Remove all leading and trailing spaces
- Rename the columns for consistency
- Change order of columns

In [None]:
# Inspect the column names together with their positional index.
# LIST form: a plain list of names, or "index:name" strings for quick reference
cols = df.columns.tolist()
# or
col_mapping = [f"{position}:{label}" for position, label in enumerate(df.columns)]

# DICT form: {index: name}, a convenient starting point for a rename mapping
col_mapping_dict = dict(enumerate(df.columns))

In [None]:
# Insert a dataframe column at a specific location.
# Signature: df.insert(loc, column, value) -- `loc` is the 0-based position
df.insert(6, "new_col", np.nan)  # new column at position 6, filled with NaN

In [None]:
# Change order of columns
# (inspect the current order first with `df.columns.tolist()`)
cols = ['col3', 'col2', 'col1']  # desired order; only the listed columns are kept
df = df[cols]

In [None]:
# Drop specific columns by name
df = df.drop(columns=['col1', 'col2'])
# Keep all columns up to and including 'col' (drops every column to its right)
last_kept_pos = df.columns.get_loc('col')
df = df.iloc[:, :last_kept_pos + 1]

In [None]:
# Remove all leading and trailing spaces from the column names
# https://stackoverflow.com/questions/30763351/removing-space-in-dataframe-python
# Non-string labels (e.g. the integer labels produced by header=None) have no
# .strip(), so they are passed through unchanged instead of raising.
df.columns = [label.strip() if isinstance(label, str) else label for label in df.columns]

In [None]:
# Rename columns through an {old_name: new_name} mapping.
# Columns that do not appear in the mapping keep their current name.
cols_to_rename = dict(oldName1='newName1', oldName2='newName2')
df.rename(cols_to_rename, axis='columns', inplace=True)

In [None]:
# Reindex the dataframe's columns to the given labels, in this order
df = df.reindex(['col1', 'col3', 'col2'], axis='columns')

In [None]:
# Merge 2 or more dataframes
# https://stackoverflow.com/questions/53645882/pandas-merging-101
# https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.merge.html
# Left join on 'version': every row of df is kept
df = pd.merge(df, df_exrate, how='left', on='version')

In [None]:
# Concat df's
# https://stackoverflow.com/questions/49620538/what-are-the-levels-keys-and-names-arguments-for-in-pandas-concat-functio

### (4) Clean Up Data Types

- datetime column
- numbers: int or float
- boolean: yes/no, true/false
- categorical: as object
- others: object

In [None]:
# Cast the listed columns to 64-bit integers (use 'int32' for 32-bit)
cols_to_int = ['ID_PRODUCT']
as_int = df[cols_to_int].astype('int64')
df[cols_to_int] = as_int

In [None]:
# Convert several columns to the categorical dtype in one go
cols_to_category = ['BUYER_COUNTRY', 'SHIPMENT_TYPE', 'TRANSPORT_MODE']
as_category = df[cols_to_category].astype('category')
df[cols_to_category] = as_category

In [None]:
# Parse a single column into datetimes using an explicit format
df['col'] = pd.to_datetime(df['col'], format='%Y-%m-%d %H:%M:%S.%f')
# or parse multiple columns that share the same datetime format
cols_to_date = ['col1', 'col2']
# DataFrame.apply forwards the `format` keyword to pd.to_datetime for each column
df[cols_to_date] = df[cols_to_date].apply(pd.to_datetime, format='%Y-%m-%d %H:%M:%S.%f')

In [None]:
# Convert a column to a numeric dtype. pd.to_numeric accepts a whole Series,
# so pass the column directly rather than applying it element by element.
df['col'] = pd.to_numeric(df['col'])

In [None]:
# Store a single column as a pandas categorical
df["col"] = df["col"].astype(pd.CategoricalDtype())

In [None]:
# Get unique values
# Single column -- printed explicitly, because a bare expression that is not
# the last statement of a notebook cell is evaluated but never displayed
print(df['col'].unique())
# Every column
print(df.apply(lambda col: col.unique()))

In [None]:
# Display the dtype of each column
df.dtypes

### (5) Data Manipulation

- Check if there are any null values
- Drop the NaN columns / rows
- Fill the NaN columns with some values

Articles:
- https://pbpython.com/categorical-encoding.html

In [None]:
# Show every row that contains at least one NaN/null value
has_nan = df.isna().any(axis='columns')
print(df[has_nan])

In [None]:
# List each distinct value of a specific column with its frequency
value_frequencies = df['col'].value_counts()
print(value_frequencies)

In [None]:
# Fill missing values per column through a {column: fill_value} mapping
# (use the value counts above to choose a sensible fill value)
df = df.fillna({"col": "value"})
# To replace NaN in the whole dataframe with 0
df = df.fillna(0)

In [None]:
# Reset the index to the default integer index; drop=True discards the old
# index instead of inserting it into the dataframe as a column
df = df.reset_index(drop=True)

### (6) Save output file into processed directory

Save a file in the processed directory that is cleaned properly. It will be read in and used later for further analysis.

In [None]:
# Persist the cleaned dataframe to `out_file`, the output path set in section (1)
df.to_pickle(out_file)