# Introduction to Pandas library

In [0]:
import pandas as pd

## Series

### Creating Pandas series using `pd.Series`

In [0]:
# Build a Series from a plain Python list, then show its contents and its type
numbers = [2, 4, 5, 6, 9]
s = pd.Series(numbers)
print(s)
print(type(s))

In [0]:
# A Series is not limited to numbers: here every element is a string
letters = ['a', 'b', 'af']
char_series = pd.Series(letters)
print(char_series)

In [0]:
# Creating a sequence of consecutive daily timestamps.
# Note: pd.date_range returns a DatetimeIndex, not a Series (see the printed type);
# wrap it in pd.Series(...) when a Series is required.
# ISO 8601 strings (YYYY-MM-DD) are used so the day/month order cannot be misread:
# the range runs from 9 November 2017 to 12 December 2017, both included.
date_series = pd.date_range(start='2017-11-09', end='2017-12-12')
print(date_series)
print(type(date_series))

### Indexing Series

In [0]:
# The comments below talk about POSITIONS, so .iloc is used explicitly.
# Plain s[3] / s[[1, 3]] look up index LABELS; they only coincide with positions
# while the Series keeps its default 0..n-1 index.

# accessing 4th element
print(s.iloc[3])

# accessing elements starting index = 2 till end
print(s.iloc[2:])

# accessing 2nd and 4th element from Series
print(s.iloc[[1, 3]])

### Apply function

In [0]:
# apply() calls the function once per element and returns a new Series of the results
s.apply(lambda value: value ** 2)

In [0]:
# numpy array
import numpy as np

# ndarrays have no .apply() method, so np.arange(10).apply(lambda x: x**2) gives an error.
# No wrapper is needed anyway: numpy ufuncs such as np.square already work
# element-wise, whereas np.vectorize(lambda ...) merely loops over the elements in Python.
array_1 = np.arange(10)
fn = np.square
print(fn(array_1))

## DataFrames

### Mounting Google drive locally

In [0]:
# Colab-specific step: the google.colab module is provided by the Google Colab runtime.
# Mounting makes Google Drive visible under /content/gdrive, which is the root of
# every CSV path read later in this notebook.
from google.colab import drive
drive.mount('/content/gdrive')

### Creating dataframes from Dictionaries

In [0]:
# Each dictionary key becomes a column label; each list supplies that column's values
people = {
    'name': ['Vinay', 'Kushal', 'Aman', 'Saif'],
    'age': [22, 25, 24, 28],
    'occupation': ['engineer', 'doctor', 'data analyst', 'teacher'],
}
df = pd.DataFrame(people)
df

### Working with dataframe from csv file

In [0]:
# Load the market fact table from the mounted Drive folder into a DataFrame
market_fact_csv = '/content/gdrive/My Drive/Colab Notebooks/Datasets/global_sales_data/market_fact.csv'
market_df = pd.read_csv(market_fact_csv)

In [0]:
# Preview the first five rows (n=5 is also the default of head())
market_df.head(n=5)

In [0]:
# Preview the last five rows (n=5 is also the default of tail())
market_df.tail(n=5)

In [0]:
# Print a concise summary of the dataframe with DataFrame.info()
# (per the pandas documentation: index range, column names, non-null counts and dtypes)
market_df.info()

In [0]:
# describe(): summary statistics (count, mean, std, min, quartiles, max) of every numeric column

market_df.describe()

In [0]:
# Column names of the dataframe (returned as a pandas Index object)
market_df.columns

In [0]:
# The number of rows and columns, as a (rows, columns) tuple
market_df.shape

In [0]:
# You can extract the values of dataframe as a numpy array using df.values
# (the pandas documentation recommends DataFrame.to_numpy() for new code)
market_df.values

In [0]:
# Build a new frame whose row labels are the "Ord_id" values.
# set_index returns a new DataFrame by default, so market_df itself keeps its original index.
df2 = market_df.set_index('Ord_id')
df2.head()

In [0]:
# Sort the rows by index label, largest label first
# (the sorted frame is only displayed; market_df is not modified)
market_df.sort_index(ascending = False)

In [0]:
# Sort data frames by values
# sort_values is ascending by default, so head() shows the five smallest Sales
sales_ascending = market_df.sort_values(by='Sales')
sales_ascending.head()

In [0]:
# Sort in decreasing order of shipping cost and show the five rows with the highest cost
costliest_first = market_df.sort_values(by='Shipping_Cost', ascending=False)
costliest_first.head()

In [0]:
# Sort by more than one column

# A single ascending=False applies to every key: Prod_id in descending order and,
# within each product, Sales in descending order as well
market_df.sort_values(by=['Prod_id','Sales'],ascending=False)

In [0]:
# Sort by more than one column, giving each key its own direction

# Prod_id in descending order; within each product, Sales in ascending order
market_df.sort_values(by=['Prod_id','Sales'],ascending=[False,True])

### Indexing and Selection in dataframe

In [0]:
# Selecting the rows at positions 2 to 6 (the slice end, 7, is excluded)
market_df[2:7]

In [0]:
# Selecting every second row starting from position 5 (slice 5::2),
# then showing the first five of the selected rows
market_df[5::2].head()

In [0]:
# Select columns - using df['column']
# A single label inside [] returns that column as a Series (see the printed type)

sales = market_df['Sales']
print(sales.head())
print(type(sales))

In [0]:
# Select columns - using df.column (attribute access)
# This shortcut works only when the column name is a valid Python identifier
# and does not clash with an existing DataFrame attribute or method
sales = market_df.Sales
print(sales.head())
print(type(sales))

In [0]:
# Select multiple columns by passing a list of column labels inside []
market_df[['Cust_id','Sales','Profit']].head()

In [0]:
# A list of column labels inside [] yields a DataFrame; print its type to confirm
three_columns = market_df[['Cust_id', 'Sales', 'Profit']]
print(type(three_columns))

In [0]:
# Similarly, using double square brackets (a one-item list of labels),
# selecting a single column gives a DataFrame and not a Series
sales_frame = market_df[['Sales']]

# The type has to be printed: a notebook cell only displays its LAST expression,
# so a bare type(...) in the middle of the cell would be silently discarded
print(type(sales_frame))

sales_frame.head()

In [0]:
# market_df[2] does not return the third row: plain [] with a scalar is a COLUMN
# lookup, and since there is no column labelled 2 pandas raises KeyError.
# The error is caught and printed so the demonstration does not stop "Run All".
try:
    market_df[2]
except KeyError as err:
    print('KeyError:', err)
# To select by position use pd.DataFrame.iloc; to select by label use pd.DataFrame.loc

In [0]:
# Selecting a single element by position with iloc[row, column]
# [2, 4] is the third row and the fifth column
# (Sales, according to the original note - confirm with market_df.columns)
market_df.iloc[2,4]

In [0]:
# Selecting a single row and all columns
# Position 5 is the 6th row; with the default integer index its label is 5 too
sixth_row = market_df.iloc[5]
print(sixth_row)

# Spelling out "all columns" with ':' selects the same row
print(market_df.iloc[5, :])

# A trailing comma with nothing after it is, again, the same selection
print(market_df.iloc[5,])

In [0]:
# Selecting multiple rows by handing iloc a list of row positions
row_positions = [5, 7, 8]
market_df.iloc[row_positions]

# equivalent to market_df.iloc[[5,7,8],:]
# equivalent to market_df.iloc[[5,7,8],]

In [0]:
# Selecting a contiguous range of rows by position: 4, 5, 6 and 7 (8 is excluded)
market_df.iloc[4:8]
# equivalent to market_df.iloc[4:8,:]
# equivalent to market_df.iloc[4:8,]

In [0]:
# Selecting a single column by position: all rows (:) of the third column (position 2)
market_df.iloc[:,2].head()

In [0]:
# Selecting a rectangular block by position:
# rows 2-4 and columns 3-7 (the end of each slice is excluded)
market_df.iloc[2:5,3:8]

In [0]:
# using booleans
# iloc keeps the rows whose flag is True. The boolean list must contain exactly
# one flag per row - a shorter list raises IndexError ("Boolean index has wrong
# length") - so the short pattern is padded with False up to the frame's length.
leading_flags = [True, True, False, True]
n_rows = len(market_df)
row_mask = (leading_flags + [False] * n_rows)[:n_rows]
market_df.iloc[row_mask]

#### Label based indexing

In [0]:
# .loc selects by LABEL: these happen to match positions only because market_df
# still has its default integer index
print(market_df.loc[5]) # row whose index label is 5 (a label, not a position)
print(market_df.loc[[3,7,8],'Sales']) # Sales values of the rows labelled 3, 7 and 8

In [0]:
# df2 is indexed by Ord_id, so rows are addressed by their order id label
df2.loc['Ord_5406','Sales'] # 'Ord_5406' is a row label (an index value), 'Sales' a column label

In [0]:
# A list of row labels combined with a label slice of columns.
# Unlike positional slices, a label slice such as 'Sales':'Profit' includes both end points.
df2.loc[['Ord_5406','Ord_5446','Ord_5485'],'Sales':'Profit']

In [0]:
# using booleans
# .loc also accepts a boolean list and keeps the rows flagged True. The list needs
# one flag per row - a shorter list raises IndexError ("Boolean index has wrong
# length") - so the short pattern is padded with False up to the frame's length.
leading_flags = [True, True, False, True]
n_rows = len(market_df)
row_mask = (leading_flags + [False] * n_rows)[:n_rows]
market_df.loc[row_mask]

### Slicing and dicing on DataFrame

In [0]:
# Read the market fact table into a fresh frame for the filtering examples
market_fact_csv = '/content/gdrive/My Drive/Colab Notebooks/Datasets/global_sales_data/market_fact.csv'
df = pd.read_csv(market_fact_csv)
df.head()

In [0]:
# Select all rows with Sales > 3000
# The comparison produces a boolean Series (one flag per row) that .loc uses as a row filter
high_sales = df.Sales > 3000
df.loc[high_sales].head()
# Equivalent to df.loc[df['Sales'] > 3000, :]

In [0]:
# We combine multiple conditions using the & operator (element-wise AND).
# Each comparison is wrapped in parentheses because & binds more tightly than > and <.
sales_in_band = (df.Sales > 2000) & (df.Sales < 3000)
profit_over_100 = (df.Profit > 100)
df.loc[sales_in_band & profit_over_100, :].head()

In [0]:
# The OR operator is '|': a row is kept when at least one of the conditions holds
sales_over_2000 = (df.Sales > 2000)
profit_over_100 = (df.Profit > 100)
df.loc[sales_over_2000 | profit_over_100, :].head()

In [0]:
# Filter the rows on three conditions and return only three of the columns
row_filter = (df.Sales > 2000) & (df.Sales < 3000) & (df.Profit > 100)
wanted_columns = ['Cust_id', 'Sales', 'Profit']
df.loc[row_filter, wanted_columns].head()

In [0]:
# You may want to select rows whose column value is in an iterable

customers_in_bangalore = ['Cust_1798', 'Cust_1519', 'Cust_637', 'Cust_851']

# isin() flags the rows whose Cust_id appears in the list;
# .loc then keeps exactly the orders of these customers
from_listed_customers = df['Cust_id'].isin(customers_in_bangalore)
df.loc[from_listed_customers, :]

### Merging and Concatenation of DataFrames

#### Merging of DataFrames

In [0]:
# All five tables live in the same Drive folder. Keeping the folder in a single
# constant means a relocated dataset needs one edit instead of five.
DATA_DIR = '/content/gdrive/My Drive/Colab Notebooks/Datasets/global_sales_data'

# One fact table (market_fact) and four dimension tables, each read into its own frame
market_df = pd.read_csv(DATA_DIR + '/market_fact.csv')
customer_df = pd.read_csv(DATA_DIR + '/cust_dimen.csv')
product_df = pd.read_csv(DATA_DIR + '/prod_dimen.csv')
shipping_df = pd.read_csv(DATA_DIR + '/shipping_dimen.csv')
orders_df = pd.read_csv(DATA_DIR + '/orders_dimen.csv')

In [0]:
# Print the first five rows of every table, in the order they were loaded
for frame in (market_df, customer_df, product_df, shipping_df, orders_df):
    print(frame.head())

In [0]:
# Merging the dataframes
# 'on' names the key column shared by both frames (Cust_id)
# how='inner' keeps only the customer ids that occur in both frames

df_1 = pd.merge(left=market_df, right=customer_df, on='Cust_id', how='inner')
df_1.head()

In [0]:
# Rows of the merged frame that belong to the CORPORATE customer segment
is_corporate = df_1['Customer_Segment'] == 'CORPORATE'
df_1.loc[is_corporate, :].head()

In [0]:
# Select all orders from product category = office supplies and from the corporate segment
# The product table is joined on Prod_id before the two conditions are applied
df_2 = pd.merge(left=df_1, right=product_df, on="Prod_id", how="inner")
is_office_supplies = df_2['Product_Category'] == 'OFFICE SUPPLIES'
is_corporate = df_2['Customer_Segment'] == 'CORPORATE'
df_2.loc[is_office_supplies & is_corporate, :].head()

In [0]:
# Merge with shipping and orders df
# Shipping details are joined on Ship_id, then order details on Ord_id
df_3 = pd.merge(left=df_2, right=shipping_df, on="Ship_id", how="inner")

master_df = pd.merge(left=df_3, right=orders_df, on="Ord_id", how="inner")
print(master_df.shape)
master_df.head()

#### Concatenation of DataFrames

In [0]:
# DataFrames having same columns (note that the ages are stored as strings)
first_group = {
    'Name': ['Aman', 'Jai', 'Rashmi', 'Saif'],
    'Age': ['34', '31', '22', '33'],
    'Gender': ['M', 'M', 'F', 'M'],
}
second_group = {
    'Name': ['Akhil', 'Asha', 'Preeti'],
    'Age': ['31', '22', '23'],
    'Gender': ['M', 'F', 'F'],
}
df_1 = pd.DataFrame(first_group)
df_2 = pd.DataFrame(second_group)

# Show both frames, one after the other
for frame in (df_1, df_2):
    print(frame.head())

In [0]:
# To concatenate them, one on top of the other, you can use pd.concat
# Its first argument is a sequence (here a list) of dataframes
# axis = 0 stacks along the row axis, so the rows of df_2 follow the rows of df_1
frames_to_stack = [df_1, df_2]
pd.concat(frames_to_stack, axis=0)

In [0]:
# DataFrame.append() used to be an alternative way to stack rows, but it was
# deprecated in pandas 1.4 and removed in pandas 2.0, where df_1.append(df_2)
# raises AttributeError. pd.concat produces the same result; like append did,
# it keeps each frame's own index labels unless ignore_index=True is passed.
pd.concat([df_1, df_2])

### Grouping and Summarising

In [0]:
# Reload every table and rebuild master_df so this section runs on its own
# (df_1 and df_2 were reused for the concatenation examples).
# The shared folder is kept in one constant so a relocated dataset needs a single edit.
DATA_DIR = '/content/gdrive/My Drive/Colab Notebooks/Datasets/global_sales_data'

market_df = pd.read_csv(DATA_DIR + '/market_fact.csv')
customer_df = pd.read_csv(DATA_DIR + '/cust_dimen.csv')
product_df = pd.read_csv(DATA_DIR + '/prod_dimen.csv')
shipping_df = pd.read_csv(DATA_DIR + '/shipping_dimen.csv')
orders_df = pd.read_csv(DATA_DIR + '/orders_dimen.csv')

# Inner-join the fact table with each dimension table on its key column
df_1 = pd.merge(market_df, customer_df, how='inner', on='Cust_id')
df_2 = pd.merge(df_1, product_df, how="inner", on="Prod_id")
df_3 = pd.merge(df_2, shipping_df, how="inner", on="Ship_id")
master_df = pd.merge(df_3, orders_df, how="inner", on="Ord_id")
print(master_df.shape)
master_df.head()

In [0]:
# Which customer segments are the least profitable?

# Step1: Grouping: split the rows of master_df by customer segment.
# Displaying the result only shows a GroupBy object; the numbers appear once an
# aggregation is applied to it.
df_by_segment = master_df.groupby(by='Customer_Segment')
df_by_segment

In [0]:
# Step2: Applying a function
# We can choose aggregate functions such as sum, mean, median, etc.
profit_by_segment = df_by_segment['Profit'].sum()

# df_by_segment.Profit is equivalent to df_by_segment['Profit'];
# the totals are displayed from the largest to the smallest
profit_by_segment.sort_values(ascending=False)

In [0]:
# Wrap the per-segment totals (a Series) in a DataFrame for tabular display
segment_profit = df_by_segment['Profit'].sum()
pd.DataFrame(segment_profit)

In [0]:
# Ex: Which product categories are the least profitable?

# 1. Group the rows of master_df by product category
by_product_cat = master_df.groupby(by='Product_Category')

In [0]:
# 2. This time, let's compare average profits
# Mean Profit per category, listed from the lowest to the highest
mean_profit_by_cat = by_product_cat['Profit'].mean()
mean_profit_by_cat.sort_values()

In [0]:
# Which product categories and product subcategories are least profitable?
# Grouping on two keys gives one group per (category, sub-category) pair
group_keys = ['Product_Category', 'Product_Sub_Category']
by_product_cat_subcat = master_df.groupby(group_keys)
mean_profit_by_subcat = by_product_cat_subcat['Profit'].mean()
mean_profit_by_subcat.sort_values()

In [0]:
# Summary statistics of Profit computed separately for each product category
by_product_cat['Profit'].describe()

In [0]:
# Calculate sales in each region as a percentage of total sales
# Series.sum() is used for the total: it is vectorised and skips missing values
# just like the grouped sums do, whereas the built-in sum() iterates in Python
# and turns a single NaN into a NaN total (and therefore NaN percentages).
sales_by_region = master_df.groupby('Region').Sales.sum()
100 * (sales_by_region / master_df['Sales'].sum())

### Lambda functions

In [0]:
# Create a column 'is_profitable' which is True where Profit > 0 and False otherwise
# (booleans behave like 1 and 0 when they are summed or averaged)

def is_positive(x):
  """Return True when x is strictly greater than zero, otherwise False."""
  return 0 < x

# Evaluate is_positive on every Profit value and store the flags as a new column
profit_flags = master_df['Profit'].apply(is_positive)
master_df['is_profitable'] = profit_flags
master_df.head()



In [0]:
# Alternative using a lambda function: the same "greater than zero" test written inline
master_df['is_profitable'] = master_df['Profit'].apply(lambda profit: profit > 0)
master_df.head()

In [0]:
# Comparing the share of profitable rows across customer segments
# (the mean of the boolean is_profitable column is the fraction of True values),
# listed from the lowest share to the highest
master_df.groupby('Customer_Segment').is_profitable.mean().sort_values()

In [0]:
# Comparing the share of profitable rows across product categories
# (the mean of the boolean is_profitable column is the fraction of True values),
# listed from the lowest share to the highest
master_df.groupby('Product_Category').is_profitable.mean().sort_values()

In [0]:
# Create a column 'profit_per_qty' using Profit/Order_Quantity columns
# Dividing one column by another works row by row
per_unit_profit = master_df['Profit'] / master_df['Order_Quantity']
master_df['profit_per_qty'] = per_unit_profit
master_df.head()

### Pivot tables

In [0]:
# Print the built-in documentation of DataFrame.pivot_table
# (the cells below use its values, index, columns and aggfunc parameters)
help(pd.DataFrame.pivot_table)

In [0]:
# Average Sales for each customer segment (one row per segment)
master_df.pivot_table(
    index='Customer_Segment',
    values='Sales',
    aggfunc='mean',
)

In [0]:
# Number of profitable rows in each region: summing the boolean is_profitable
# column counts its True values
master_df.pivot_table(
    index='Region',
    values='is_profitable',
    aggfunc='sum',
)

In [0]:
# Grouping by both rows and columns
# Compare the total profit across product categories and customer segments:
# with two categorical variables, one goes on the rows (index) and the other
# on the columns, and every cell holds the summed Profit of that combination

master_df.pivot_table(
    index='Product_Category',
    columns='Customer_Segment',
    values='Profit',
    aggfunc='sum',
)

In [0]:
# Compute the mean of all numeric columns across Product categories
# The value columns are listed explicitly: since pandas 2.0 pivot_table no longer
# silently drops columns that cannot be averaged (such as text ids) and raises
# TypeError instead. Boolean columns are kept because their mean - the share of
# True values - is well defined.

numeric_cols = master_df.select_dtypes(include=['number', 'bool']).columns.tolist()
master_df.pivot_table(values=numeric_cols,
                     columns='Product_Category',
                     aggfunc='mean')