In [None]:
import numpy as np
import pandas as pd

### Object Creation

In [None]:
# A Series built from a plain list of values; pandas supplies the default
# integer index.
s = pd.Series(data=[1, 3, 5, np.nan, 6, 8])
s

# A DataFrame built from a NumPy array of random values, indexed by six
# dates generated with date_range() and with four labeled columns.
dates = pd.date_range(start="20130101", periods=6)
dates
df = pd.DataFrame(
    data=np.random.randn(6, 4),
    index=dates,
    columns=["A", "B", "C", "D"],
)
df

# A DataFrame built from a dictionary whose values can each be converted
# into a series-like structure (scalars are repeated for every row).
df2 = pd.DataFrame(
    data={
        "A": 1.0,
        "B": pd.Timestamp("20130102"),
        "C": pd.Series(1, index=[0, 1, 2, 3], dtype="float32"),
        "D": np.full(4, 3, dtype="int32"),
        "E": pd.Categorical(["test", "train", "test", "train"]),
        "F": "foo",
    }
)
df2

# Each column of df2 ends up with its own dtype.
df2.dtypes

### Viewing Data

In [None]:
# DataFrame.head() shows the top rows of the frame and DataFrame.tail()
# the bottom rows (here: the last three).
df.head()
df.tail(n=3)

# The row labels and the column labels of the frame.
df.index
df.columns

# NumPy representation of the underlying data.
df.to_numpy()

# df2 holds columns of several dtypes, which makes DataFrame.to_numpy()
# relatively expensive for it.
df2.to_numpy()

# Quick statistical summary of the data.
df.describe()

# Transposed view of the data (rows and columns swapped).
df.transpose()

# Sort along an axis by its labels: columns in descending label order.
df.sort_index(axis="columns", ascending=False)

# Sort the rows by the values of column "B".
df.sort_values("B")

### Selection by Position

In [None]:
# A single integer selects the row at that position.
df.iloc[3]

# Integer slices behave like NumPy/Python slices (end position excluded).
df.iloc[3:5, :2]

# Lists of integer positions pick out specific rows and columns.
df.iloc[[1, 2, 4], [0, 2]]

# Slice rows explicitly, keeping every column.
df.iloc[1:3, :]

# Slice columns explicitly, keeping every row.
df.iloc[:, 1:3]

# Fetch a single value by (row position, column position).
df.iloc[1, 1]

# Fast scalar access; equivalent to the iloc lookup just above.
df.iat[1, 1]


### Boolean Indexing

In [None]:
# Keep only the rows where column "A" is positive.
df.loc[df["A"].gt(0)]

# Keep values where the element-wise condition holds; everything else
# becomes NaN.
df.where(df > 0)

# Filter with Series.isin: work on a copy of df extended with a label column "E".
df2 = df.assign(E=["one", "one", "two", "three", "four", "three"])
df2
df2.loc[df2["E"].isin(["two", "four"])]

### Setting

In [None]:

# Assigning a Series as a new column aligns its values on the frame's index.
s1 = pd.Series(range(1, 7), index=pd.date_range(start="20130102", periods=6))
s1
df["F"] = s1

# Set a single value by label.
df.at[dates[0], "A"] = 0

# Set a single value by position (first row, second column).
df.iat[0, 1] = 0

# Overwrite a whole column from a NumPy array with one entry per row.
df.loc[:, "D"] = np.full(len(df), 5)

# The frame after all of the assignments above.
df

# A "where" operation combined with setting: on a copy, every positive
# value is replaced by its negation.
df2 = df.copy()
df2[df2 > 0] = -df2
df2


### Missing Data

In [None]:
# Reindexing changes/adds/deletes the labels on a given axis and returns a
# copy of the data. Here: keep the first four dates and append a new,
# initially empty, column "E".
df1 = df.reindex(index=dates[:4], columns=[*df.columns, "E"])
df1.loc[dates[0]:dates[1], "E"] = 1
df1

# Drop every row that contains at least one missing value.
df1.dropna(axis="index", how="any")

# Replace missing values with a constant.
df1.fillna(5)

# Boolean mask marking where values are NaN.
df1.isna()



### Stats

In [None]:
# A descriptive statistic computed per column.
df.mean(axis="index")

# The same statistic along the other axis (one value per row).
df.mean(axis="columns")

# Operating with objects of different dimensionality requires alignment;
# pandas broadcasts the Series along the requested axis. The Series is
# shifted by two positions first, so its leading entries are NaN.
s = pd.Series([1, 3, 5, np.nan, 6, 8], index=dates)
s = s.shift(periods=2)
s
df.sub(s, axis=0)

### Apply

In [None]:
# DataFrame.apply runs a user-supplied function over the data, column by
# column: first a running sum, then the spread (max - min) of each column.
df.apply(func=np.cumsum)
df.apply(lambda column: column.max() - column.min())

### Histogramming

In [None]:
# Counting occurrences of each value (see "Histogramming and
# Discretization" in the pandas docs for more).

# Ten random integers drawn from 0..6 inclusive.
s = pd.Series(np.random.randint(low=0, high=7, size=10))
s
s.value_counts()

### String Methods

In [None]:
# Series.str exposes vectorized string methods; missing values pass
# through untouched.
s = pd.Series(
    data=["A", "B", "C", "Aaba", "Baca", np.nan, "CABA", "dog", "cat"]
)
s.str.lower()

### Merge - Concat

In [None]:

# pd.concat glues pandas objects together along an axis.
df = pd.DataFrame(data=np.random.randn(10, 4))
df

# Split the frame into three consecutive row chunks (3 + 4 + 3 rows) ...
pieces = [df.iloc[:3], df.iloc[3:7], df.iloc[7:]]

# ... and stitch them back together.
pd.concat(pieces)


### Merge - Join

In [None]:
# merge performs SQL-style joins on specific columns. With a repeated key
# on both sides, every left row pairs with every right row.
left = pd.DataFrame(data={"key": ["foo"] * 2, "lval": [1, 2]})
right = pd.DataFrame(data={"key": ["foo"] * 2, "rval": [4, 5]})
left
right
left.merge(right, on="key")

# A second example, this time with distinct keys: each left row matches
# exactly one right row.

left = pd.DataFrame(data={"key": ["foo", "bar"], "lval": [1, 2]})
right = pd.DataFrame(data={"key": ["foo", "bar"], "rval": [4, 5]})
left
right
left.merge(right, on="key")

### Grouping

In [None]:
# "Group by" refers to a process made of one or more of these steps:
#   - splitting the data into groups based on some criteria,
#   - applying a function to each group independently,
#   - combining the results into a data structure.

df = pd.DataFrame(
    data={
        "A": ["foo", "bar"] * 3 + ["foo", "foo"],
        "B": ["one", "one", "two", "three", "two", "two", "one", "three"],
        "C": np.random.randn(8),
        "D": np.random.randn(8),
    }
)
df

# Group on one column, then sum the numeric columns within each group.
df.groupby(by="A")[["C", "D"]].sum()

# Grouping on several columns yields a hierarchical index; the same sum is
# applied to each (A, B) group.
df.groupby(by=["A", "B"]).sum()

### Reshaping - Stacking

In [None]:
# Build a two-level row index: every "first" label paired with "one"/"two".
tuples = [
    (outer, inner)
    for outer in ("bar", "baz", "foo", "qux")
    for inner in ("one", "two")
]
index = pd.MultiIndex.from_tuples(tuples, names=["first", "second"])
df = pd.DataFrame(data=np.random.randn(8, 2), index=index, columns=["A", "B"])

# Work on the first four rows only.
df2 = df.iloc[:4]
df2

# DataFrame.stack "compresses" a level of the columns into the row index.
stacked = df2.stack()
stacked

# DataFrame.unstack is the inverse of stack for an object with a MultiIndex
# as its index; by default it unstacks the last level. A specific level can
# be chosen by position.
stacked.unstack(level=-1)
stacked.unstack(level=1)
stacked.unstack(level=0)

### Reshaping - Pivot tables

In [None]:
# Twelve rows of sample data: three label columns (A, B, C) and two
# columns of random values (D, E).
df = pd.DataFrame(
    data={
        "A": ["one", "one", "two", "three"] * 3,
        "B": ["A", "B", "C"] * 4,
        "C": (["foo"] * 3 + ["bar"] * 3) * 2,
        "D": np.random.randn(12),
        "E": np.random.randn(12),
    }
)
df

# pivot_table reshapes the frame given the values to aggregate, the columns
# that form the row index, and the column whose labels become the new
# column headers.
df.pivot_table(values="D", index=["A", "B"], columns=["C"])




### Time Series

In [None]:
# pandas has simple, powerful, and efficient functionality for resampling
# during frequency conversion (e.g., converting secondly data into
# 5-minutely data). This is extremely common in, but not limited to,
# financial applications.
#
# Frequency aliases are written in their lowercase forms ("s", "min", "h"):
# the uppercase spellings "S" and "H" are deprecated since pandas 2.2, while
# the lowercase ones are accepted by older releases as well.
rng = pd.date_range("1/1/2012", periods=100, freq="s")
ts = pd.Series(np.random.randint(0, 500, len(rng)), index=rng)
ts.resample("5min").sum()

# Series.tz_localize attaches a time zone to a naive time series:
rng = pd.date_range("3/6/2012 00:00", periods=5, freq="D")
ts = pd.Series(np.random.randn(len(rng)), rng)
ts
ts_utc = ts.tz_localize("UTC")
ts_utc


# Series.tz_convert converts a time-zone-aware time series to another time zone:
ts_utc.tz_convert("US/Eastern")

# Converting between time span representations.
# The month-end frequency is given as an offset object: the "M" alias is
# deprecated for timestamps since pandas 2.2 (renamed to "ME"), whereas
# pd.offsets.MonthEnd() means the same thing on every version.
rng = pd.date_range("1/1/2012", periods=5, freq=pd.offsets.MonthEnd())
ts = pd.Series(np.random.randn(len(rng)), index=rng)
ts
ps = ts.to_period()
ps
ps.to_timestamp()

# Converting between period and timestamp enables some convenient arithmetic.
# Here a quarterly frequency with the year ending in November is converted to
# 09:00 on the first day of the month following each quarter end:
#   1. asfreq("M", "end")   -> last month of the quarter ("M" is still the
#                              monthly alias for periods)
#   2. + 1                  -> the month after it
#   3. asfreq("h", "start") -> first hour of that month
#   4. + 9                  -> nine hours later, i.e. 09:00
prng = pd.period_range("1990Q1", "2000Q4", freq="Q-NOV")
ts = pd.Series(np.random.randn(len(prng)), prng)
ts.index = (prng.asfreq("M", "end") + 1).asfreq("h", "start") + 9
ts.head()



### Categoricals

In [None]:
# A DataFrame can hold categorical data. Start from plain string grades.
df = pd.DataFrame(
    data={
        "id": list(range(1, 7)),
        "raw_grade": ["a", "b", "b", "a", "a", "e"],
    }
)


# Convert the raw grades to a categorical dtype.
df["grade"] = df["raw_grade"].astype("category")
df["grade"]

# Give the categories more meaningful names (positional replacement of the
# existing categories).
new_categories = ["very good", "good", "very bad"]
df["grade"] = df["grade"].cat.rename_categories(new_categories)

# Reorder the categories and add the missing ones in the same step. Methods
# under Series.cat return a new Series by default, hence the re-assignment.
grade_scale = ["very bad", "bad", "medium", "good", "very good"]
df["grade"] = df["grade"].cat.set_categories(grade_scale)
df["grade"]

# Sorting follows the category order, not lexical order.
df.sort_values("grade")

# Grouping by a categorical column with observed=False also reports the
# categories that have no rows.
df.groupby(by="grade", observed=False).size()



### Plotting

In [None]:
# We use the standard convention for referencing the matplotlib API.
import matplotlib.pyplot as plt

# plt.close closes figure windows; "all" closes every open figure so the
# plots below start from a clean state.
plt.close("all")

# A cumulative sum of 1000 random daily values, drawn on an explicitly
# created Axes.
ts = pd.Series(np.random.randn(1000), index=pd.date_range("1/1/2000", periods=1000))
ts = ts.cumsum()
fig, ax = plt.subplots()
ts.plot(ax=ax);
plt.show()

# On a DataFrame, DataFrame.plot is a convenience that plots all of the
# columns with labels.
df = pd.DataFrame(
    np.random.randn(1000, 4), index=ts.index, columns=["A", "B", "C", "D"]
)
df = df.cumsum()

# Create the figure and axes once and hand the axes to pandas. Calling
# plt.figure() and then df.plot() without ax= would leave an extra, empty
# figure behind, because DataFrame.plot opens a figure of its own.
fig, ax = plt.subplots()
df.plot(ax=ax)
ax.legend(loc="best");

