## Cleaning data from a CSV file and inserting it into IBC

#### Read Data from CSV

In [None]:
import pandas as pd
import numpy as np
from pycelonis import get_celonis
# Read the raw sales CSV (path is relative to the notebook's working
# directory) into a DataFrame and preview the first ten rows.
CSV_PATH = 'SalesJan2009.csv'
df = pd.read_csv(CSV_PATH)
df.head(10)

#### Check if the datatypes of the columns are as expected

In [14]:
# Display the dtype pandas inferred for every column so that columns loaded
# with an unexpected type (e.g. numbers or dates read as `object`) stand out.
df.dtypes

Transaction_date     object
Product              object
Price                object
Payment_Type         object
Name                 object
City                 object
State                object
Country              object
Account_Created      object
Last_Login           object
Latitude            float64
Longitude           float64
dtype: object

#### Change the datatypes if required
Convert `Transaction_date` to datetime and `Price` to integer.

In [15]:
# Parse the timestamp strings with a single vectorized call rather than
# invoking pd.to_datetime once per element.
df["Transaction_date"] = pd.to_datetime(df["Transaction_date"])

# Price is loaded as text (object dtype). Strip any ',' thousands separators
# and convert the whole column at once. Unlike a row-wise int(...) call, this
# does not raise on missing entries: they stay NaN (making the column float64)
# so they can be handled downstream. With no missing values the result is int64.
df["Price"] = pd.to_numeric(df["Price"].str.replace(",", "", regex=False))

#### Drop unneeded columns

Dropping the `Product` column.

In [16]:
# Build a new frame without the Product column and rebind `df` to it, instead
# of mutating in place; `columns=` states the axis explicitly.
# A KeyError is still raised if the column is absent, so typos surface.
df = df.drop(columns=["Product"])
df.head()

Unnamed: 0,Transaction_date,Price,Payment_Type,Name,City,State,Country,Account_Created,Last_Login,Latitude,Longitude
0,2009-01-02 06:17:00,1200,Mastercard,carolina,Basildon,England,United Kingdom,1/2/09 6:00,1/2/09 6:08,51.5,-1.116667
1,2009-01-02 04:53:00,1200,Visa,Betina,Parkville,MO,United States,1/2/09 4:42,1/2/09 7:49,39.195,-94.68194
2,2009-01-02 13:08:00,1200,Mastercard,Federica e Andrea,Astoria,OR,United States,1/1/09 16:21,1/3/09 12:32,46.18806,-123.83
3,2009-01-03 14:44:00,1200,Visa,Gouya,Echuca,Victoria,Australia,9/25/05 21:13,1/3/09 14:22,-36.133333,144.75
4,2009-01-04 12:56:00,3600,Visa,Gerd W,Cahaba Heights,AL,United States,11/15/08 15:47,1/4/09 12:45,33.52056,-86.8025


#### Filling NaN values in Price column with the mean of the column

In [17]:
# Impute missing prices with the mean of the observed prices
# (Series.mean() skips NaN by default).
mean_price = df["Price"].mean()
df["Price"] = df["Price"].fillna(mean_price)

#### Dropping all the columns with more than 90% Null values

In [6]:
# Columns whose share of missing values exceeds this fraction are dropped.
MAX_NULL_FRACTION = 0.9

# isna().mean() yields the fraction of missing values per column. Comparing
# that fraction directly avoids dropna's `thresh`, which counts the required
# number of NON-null values and is therefore easy to get inverted.
too_sparse = df.isna().mean() > MAX_NULL_FRACTION

# Assign the result: DataFrame.drop returns a new frame and leaves the
# original untouched, so without the assignment nothing would be removed.
df = df.drop(columns=df.columns[too_sparse])
df.head(10)

Unnamed: 0,Transaction_date,Price,Payment_Type,Name,City,State,Country,Account_Created,Last_Login,Latitude,Longitude
0,2009-01-02 06:17:00,1200,Mastercard,carolina,Basildon,England,United Kingdom,1/2/09 6:00,1/2/09 6:08,51.500000,-1.116667
1,2009-01-02 04:53:00,1200,Visa,Betina,Parkville,MO,United States,1/2/09 4:42,1/2/09 7:49,39.195000,-94.681940
2,2009-01-02 13:08:00,1200,Mastercard,Federica e Andrea,Astoria,OR,United States,1/1/09 16:21,1/3/09 12:32,46.188060,-123.830000
3,2009-01-03 14:44:00,1200,Visa,Gouya,Echuca,Victoria,Australia,9/25/05 21:13,1/3/09 14:22,-36.133333,144.750000
4,2009-01-04 12:56:00,3600,Visa,Gerd W,Cahaba Heights,AL,United States,11/15/08 15:47,1/4/09 12:45,33.520560,-86.802500
5,2009-01-04 13:19:00,1200,Visa,LAURENCE,Mickleton,NJ,United States,9/24/08 15:19,1/4/09 13:04,39.790000,-75.238060
6,2009-01-04 20:11:00,1200,Mastercard,Fleur,Peoria,IL,United States,1/3/09 9:38,1/4/09 19:45,40.693610,-89.588890
7,2009-01-02 20:09:00,1200,Mastercard,adam,Martin,TN,United States,1/2/09 17:43,1/4/09 20:01,36.343330,-88.850280
8,2009-01-04 13:17:00,1200,Mastercard,Renee Elisabeth,Tel Aviv,Tel Aviv,Israel,1/4/09 13:03,1/4/09 22:10,32.066667,34.766667
9,2009-01-04 14:11:00,1200,Visa,Aidan,Chatou,Ile-de-France,France,6/3/08 4:22,1/5/09 1:17,48.883333,2.150000


#### Create a new categorical column labelled `Premium` if the transaction price is greater than 5000 and `Basic` otherwise

In [12]:
# Flag every sale whose price is strictly above 5000, then map the flag to a
# text label: 'Premium' for flagged rows and 'Basic' for all others.
is_premium = df["Price"].gt(5000)
df["Sale Category"] = np.where(is_premium, "Premium", "Basic")
df.head()

Unnamed: 0,Transaction_date,Product,Price,Payment_Type,Name,City,State,Country,Account_Created,Last_Login,Latitude,Longitude,Sale Category
0,2009-01-02 06:17:00,Product1,1200,Mastercard,carolina,Basildon,England,United Kingdom,1/2/09 6:00,1/2/09 6:08,51.5,-1.116667,Basic
1,2009-01-02 04:53:00,Product1,1200,Visa,Betina,Parkville,MO,United States,1/2/09 4:42,1/2/09 7:49,39.195,-94.68194,Basic
2,2009-01-02 13:08:00,Product1,1200,Mastercard,Federica e Andrea,Astoria,OR,United States,1/1/09 16:21,1/3/09 12:32,46.18806,-123.83,Basic
3,2009-01-03 14:44:00,Product1,1200,Visa,Gouya,Echuca,Victoria,Australia,9/25/05 21:13,1/3/09 14:22,-36.133333,144.75,Basic
4,2009-01-04 12:56:00,Product2,3600,Visa,Gerd W,Cahaba Heights,AL,United States,11/15/08 15:47,1/4/09 12:45,33.52056,-86.8025,Basic


#### Push the data back into IBC in a data pool

In [None]:
# Create the Celonis connection object. It was previously never defined in
# this notebook, so this cell raised NameError on a fresh kernel.
# NOTE(review): called without arguments, get_celonis() presumably relies on
# credentials supplied by the environment; pass the URL and API token
# explicitly if running elsewhere. Confirm against your pycelonis version.
celonis = get_celonis()

# Look up the target data pool (replace the placeholder with a real pool id).
data_pool = celonis.pools.find("enter your data pool id")

# Upload the cleaned frame; if_exists='replace' overwrites an existing table
# of the same name in the pool.
data_pool.push_table(df, "Name of Output Table in data pool", if_exists='replace')