### Exploring a DataFrame
* df.info()
* df.describe()
* df.head()
* df.shape (an attribute, not a method)
* df.values: the data as a NumPy array
* df.columns
* df.index

In [None]:
# Remote data sets used in this notebook.
# Going by the file extensions, avocado_dataset is a pickle file (.pkl) and the other three are CSV files.
homelessness_url = "https://assets.datacamp.com/production/repositories/5386/datasets/1a0ab2e8557930ec06473c16521874e516a216ae/homelessness.csv"
avocado_dataset = "https://assets.datacamp.com/production/repositories/5386/datasets/5528f46cc712c9083a6881f787fc9b34ab53d5ea/avoplotto.pkl"
temperature = "https://assets.datacamp.com/production/repositories/5386/datasets/47f5fde162bae3549ca7d5c26fb4c4639f100f28/temperatures.csv"
walmart_sales = "https://assets.datacamp.com/production/repositories/5386/datasets/5110afec30fc30bc5f3cf67b188d1513c3d6d940/sales_subset.csv"

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Read the homelessness CSV directly from its URL into a DataFrame.
homeless_df = pd.read_csv(homelessness_url)

### sorting
We can use `df.sort_values(column_name)`, which sorts the DataFrame by the values of the given column. Check the [documentation](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.sort_values.html)

In [None]:
# Preview the first rows of the data exactly as it was loaded.
print(homeless_df.head())


In [None]:
# Keep every row but drop the first column (position 0).
# NOTE(review): presumably that column is a leftover index stored in the CSV — confirm.
homeless_df = homeless_df.iloc[:, 1:]


In [None]:
# Preview the first rows of homeless_df in its current state.
print(homeless_df.head())

In [None]:
# We can select several columns at once by passing a list of column names.
print(homeless_df[["state", "state_pop"]]) 
# This is equivalent to using .loc with all rows (:) and the same list of columns.
print(homeless_df.loc[:, ["state", "state_pop"]])


In [None]:
# Let's consider statistics. Pandas has functions for both summary and cumulative statistics:
# built-ins such as median, mean, max, min and a large number of routine statistical procedures.
# Sometimes we want more, so we can write our own aggregation function.
def iqr(col):
    """Return the interquartile range of *col*.

    The result is the 75th percentile minus the 25th percentile, both
    computed with ``col.quantile``.
    """
    upper_quartile = col.quantile(0.75)
    lower_quartile = col.quantile(0.25)
    return upper_quartile - lower_quartile

# Read the Walmart sales CSV from its URL and drop the first column (position 0).
# NOTE(review): presumably that column is a leftover index stored in the CSV — confirm.
sales = pd.read_csv(walmart_sales).iloc[:, 1:]
print(sales.head())

In [None]:
# Columns excluded from the column-wise statistics below (the non-numerical ones).
non_numeric_cols = ['is_holiday', 'type', 'date']
numeric_sales = sales.drop(columns=non_numeric_cols)

# Apply the custom function to a single column.
print(sales['weekly_sales'].agg(iqr))
# Apply the custom function to every column of the data frame except the non-numerical ones.
print(numeric_sales.apply(iqr, axis=0))
# agg is a great choice as well. The aggregations are passed by name because
# handing NumPy callables such as np.mean / np.median to agg is deprecated in recent pandas.
print(numeric_sales.agg(["mean", "median"]))

## Aggregation
There are different ways to aggregate a pandas table.

In [None]:
## the old way: group the rows by a column, then aggregate each group
print(sales.head())
# The aggregation is passed by name: handing NumPy callables such as np.mean to agg
# is deprecated in recent pandas.
print(sales.groupby(['type'])['weekly_sales'].agg("mean"))
print("#" * 100)
# Printing the groupby result without an aggregation only shows the GroupBy object itself.
print(sales.groupby(['type']))

In [None]:
# Another great way to aggregate is a pivot table: here one row per department
# and one column per store type, filled with weekly_sales aggregated by pivot_table's
# default aggregation (no aggfunc is passed).
weekly_sales_by_department_and_type = sales.pivot_table(
    values="weekly_sales",
    index="department",
    columns="type",
    fill_value=0.0,  # value used for department/type combinations without data
    margins=True,  # also include the aggregates over all rows / all columns
)
print(weekly_sales_by_department_and_type)

In [None]:
# Counting can be tricky when done manually. Pandas offers a number of built-in functions for this type of task.

## Dropping duplicates
# non_double = df.drop_duplicates(subset=["column to consider for uniqueness"]) 

In [None]:
# Indices are also a tricky topic, as things can get confusing really easily.
# Print the first 10 rows to look at the current index.

print(sales.head(10))

In [None]:
# Load the avocado data set from its pickle file.
# NOTE(review): the pickle is read from a remote URL; unpickling can execute arbitrary code,
# so only do this with a source you trust.
avocados = pd.read_pickle(avocado_dataset)

# Look at the first few rows of data
print(avocados.head())

In [None]:
# Import matplotlib.pyplot with alias plt
import matplotlib.pyplot as plt

# Get the total number of avocados sold of each size.
# .sum() is used directly on the group: passing the builtin sum to agg is deprecated in recent pandas.
nb_sold_by_size = avocados.groupby("size")['nb_sold'].sum()

# Create a bar plot of the number of avocados sold by size
nb_sold_by_size.plot(kind='bar')

# Show the plot
plt.show()

In [None]:
# Get the total number of avocados sold on each date.
# .sum() is used directly on the group: passing the builtin sum to agg is deprecated in recent pandas.
nb_sold_by_date = avocados.groupby('date')['nb_sold'].sum()

# Create a line plot of the number of avocados sold by date.
# nb_sold_by_date is a Series indexed by date, so the index is plotted on the x axis and the
# values on the y axis; the x= / y= column selectors only apply to DataFrame plots and are not passed.
nb_sold_by_date.plot(kind='line', rot=45)

# Show the plot
plt.show()

In [None]:
# Scatter plot with nb_sold on the x axis and avg_price on the y axis, with a title.
avocados.plot(
    x='nb_sold',
    y='avg_price',
    kind='scatter',
    title="Number of avocados sold vs. average price",
)
# Show the plot
plt.show()

In [None]:
# Overlay one semi-transparent histogram of avg_price per avocado type.
avocado_types = ["conventional", "organic"]
for avocado_type in avocado_types:
    is_current_type = avocados["type"] == avocado_type
    avocados.loc[is_current_type, "avg_price"].hist(alpha=0.5, bins=20)

# Add a legend (same order as the histograms were drawn)
plt.legend(avocado_types)

# Show the plot
plt.show()

In [None]:
# Print the avg_price values of the rows whose type is "conventional".
print(avocados[avocados["type"] == "conventional"]["avg_price"])

## Filtering joins:
Filtering joins are special kinds of joins that are not natively supported by pandas, yet they can be built with some additional manipulation.
### Semi joins
Semi joins are semantically quite similar to inner joins, with two main differences:
* returns only the columns from the left table
* no duplicates even with one to many relationship
### anti-joins
Anti joins return:
* the left table, excluding the intersection(inner join)
* returns only the columns from the left table.

In [None]:
## concatenating dataframes
# df1, df2, df3 (and the keys k1, k2, k3) are placeholders for existing dataframes / labels.
df1, df2, df3

# The pandas function is pd.concat (pd.concatenate does not exist).
pd.concat([df1, df2, df3], sort=True, ignore_index=True, join='inner')
# sort=True: sort the columns by name,
# ignore_index=True: set a uniform index from 0 to n-1,
# join='inner': use only the columns common to the passed dataframes.

pd.concat([df1, df2, df3], sort=True, join='inner', keys=[k1, k2, k3])
# keys: make the index composite; shown in a separate call because it
# cannot be used together with ignore_index.

In [None]:
# The pandas merge function offers additional functionality.
# The keyword argument validate can be set to 'one_to_one', 'one_to_many',
# 'many_to_one' or 'many_to_many'; if the dataframes do not follow this
# relationship, an error is raised.
pd.merge(df1, df2, validate='one_to_one')

# A similar check is provided by pd.concat (the function is pd.concat, not pd.concatenate):
# verify_integrity=True raises an error if the resulting dataframe contains duplicate indices.
pd.concat([df1, df2], verify_integrity=True)

In [None]:
# We can use merge_ordered when the order of the rows matters.
# Locations of the two data sets loaded for that purpose.
sp500 = "https://assets.datacamp.com/production/repositories/5486/datasets/6666955f71f936ab5fc3b0ee1eb595e19c126c01/S&P500.csv"
world_bank = "https://assets.datacamp.com/production/repositories/5486/datasets/6ef405912a3801f3ae59d2dd57573f80d598c1fb/WorldBank_GDP.csv"

# Read both CSV files into dataframes.
sp = pd.read_csv(sp500)
gdp = pd.read_csv(world_bank)

# Preview the first rows of each table.
for frame in (sp, gdp):
    print(frame.head())

In [None]:
# merge_asof functions almost exactly like the usual merge, but with a slight tweak:
# it matches each row to the closest value in the right table, depending on the direction.
pd.merge_asof(df1, df2)
pd.merge_asof(df1, df2, direction='forward') # match with the closest value that is greater or equal
pd.merge_asof(df1, df2, direction='backward') # match with the closest value that is smaller or equal

### Wide and Long formats: pd.melt()
Tabular data can generally be stored in 2 different ways:
1. The common, ***WIDE*** format:   
    * each column represents a feature, quality, or aspect of the subject in question
    * each row represents an instance of the subject in question.
2. The ***LONG*** format:
    * certain columns represent features, while the others are stored as a column denoted ***$variable_i$*** adjacent to another column denoted ***$value_i$***. An instance of a subject can appear in several rows.

Even though the wide format might be more understandable by humans, the Long format is easier to work with for computers.