<a href="https://colab.research.google.com/github/Zabiullahkhan/Data_Science/blob/main/Pandas_Cheat_Sheet.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Pandas Cheat Sheet
### Getting Started
Import Pandas

#### Create a series:


In [2]:
import pandas as pd
# One-dimensional labelled series: values 1..3 under labels A..C, named 'col1'.
s = pd.Series(data=[1, 2, 3], index=['A', 'B', 'C'], name='col1')

#### Create a dataframe:

In [3]:
# Row labels, then the table contents (one inner list per row).
index = ['A', 'B', 'C']
data = [[1, 4], [2, 5], [3, 6]]
# Two-column frame built from the nested list above.
df = pd.DataFrame(data=data, columns=['col1', 'col2'], index=index)

#### Load a dataframe:

In [None]:
df = pd.read_csv('filename.csv', sep=',',
names=['col1', 'col2'],
index_col=0,
encoding='utf-8',
nrows=3)

# Selecting rows and columns

### Select single column:

In [None]:
df['col1']

Select multiple columns:

In [None]:
df[['col1', 'col2']]

Show first n rows:

In [None]:
df.head(2)

Show last n rows:

In [None]:
df.tail(2)

Select rows by index values:

In [None]:
df.loc['A']         # one label: that row
df.loc[['A', 'B']]  # list of labels: those rows

Select rows by position:

In [None]:
# .iloc selects by integer position; .loc would look up the label 1 instead.
df.iloc[1]   # row at position 1
df.iloc[1:]  # rows from position 1 to the end

# Data wrangling

Filter by value:

In [None]:
df[df['col1'] > 1]

Sort by columns:

In [None]:
# Sort by col1 descending, breaking ties by col2 ascending.
df.sort_values(['col1', 'col2'],
               ascending=[False, True])

Identify duplicate rows:

In [None]:
df.duplicated()

Identify unique values in a column:

In [None]:
df['col1'].unique()

Swap rows and columns:

In [None]:
df = df.transpose()
df = df.T

Drop a column:

In [None]:
df = df.drop('col1', axis=1)

Clone a data frame:

In [None]:
clone = df.copy()

Connect multiple data frames vertically:

In [None]:
df2 = df + 5 #new dataframe
pd.concat([df,df2])

Merge multiple data frames horizontally:

In [None]:
# df3: new dataframe with rows 'B' and 'D' and columns 'col1' and 'col3'
df3 = pd.DataFrame(
    data=[[1, 7], [8, 9]],
    columns=['col1', 'col3'],
    index=['B', 'D'],
)

Only merge complete rows (INNER JOIN):

In [None]:
df.merge(df3)

Left column stays complete (LEFT OUTER JOIN):

In [None]:
df.merge(df3, how='left')

Right column stays complete (RIGHT OUTER JOIN):

In [None]:
df.merge(df3, how='right')

Merge rows by index:

In [None]:
df.merge(df3,left_index=True,
right_index=True)

Fill NaN values:

In [None]:
df.fillna(0)

Apply your own function:

In [None]:
def func(x):
    """Return 2 raised to the power ``x``."""
    base = 2
    return base ** x
df.apply(func)

# Arithmetics and statistics

Add to all values:

In [None]:
df + 10

Sum over columns:

In [None]:
df.sum()

Cumulative sum over columns:

In [None]:
df.cumsum()

Mean over columns:

In [None]:
df.mean()

Standard deviation over columns:

In [None]:
df.std()

Count unique values:

In [None]:
df['col1'].value_counts()

Summarize descriptive statistics:

In [None]:
df.describe()

# Hierarchical indexing

Create hierarchical index:

In [None]:
df.stack()

Dissolve hierarchical index:

In [None]:
df.unstack()

# Aggregation

Create group object:

In [None]:
g = df.groupby('col1')

Iterate over groups:

In [None]:
for i, group in g:
  print(i, group)

Aggregate groups:

In [None]:
g.sum()
g.prod()
g.mean()
g.std()
g.describe()

Select columns from groups:

In [None]:
g['col2'].sum()
g[['col2', 'col3']].sum()

Transform values:

In [None]:
import math
# math.log only accepts scalars, so apply it to each value of the column
# that transform passes in rather than to the column itself.
g.transform(lambda col: col.apply(math.log))

Apply a list function on each group:

In [None]:
def strsum(group):
    """Concatenate the string form of every value in ``group``.

    ``group`` is expected to expose its data through ``.values`` (as a
    pandas Series does); the result is a single string with no separator.
    """
    return ''.join(str(x) for x in group.values)


# Run strsum once per group of 'col2'.
g['col2'].apply(strsum)

# Data export

Data as NumPy array:

In [None]:
df.values

Save data as CSV file:

In [None]:
df.to_csv('output.csv', sep=",")

Format a dataframe as tabular string:

In [None]:
df.to_string()

Convert a dataframe to a dictionary:

In [None]:
df.to_dict()

Save a dataframe as an Excel table:

In [None]:
df.to_excel('output.xlsx')

# Visualization

Import matplotlib:

In [None]:
import matplotlib.pyplot as plt

Start a new diagram:

In [None]:
plt.figure()

Scatter plot:

In [None]:
df.plot.scatter('col1', 'col2',
  style='ro')

Bar plot:

In [None]:
df.plot.bar(x='col1', y='col2',
width=0.7)

Area plot:

In [None]:
df.plot.area(stacked=True,
alpha=1.0)

Box-and-whisker plot:

In [None]:
df.plot.box()

Histogram over one column:

In [None]:
df['col1'].plot.hist(bins=3)

Histogram over all columns:

In [None]:
df.plot.hist(bins=3, alpha=0.5)

Set tick marks:

In [None]:
labels = ['A', 'B', 'C', 'D']
positions = [1, 2, 3, 4]
plt.xticks(positions, labels)
plt.yticks(positions, labels)

Select area to plot:

In [None]:
plt.axis([0, 2.5, 0, 10]) # [from x, to x, from y, to y]

Label diagram and axes:

In [None]:
plt.title('Correlation')     # heading above the plot
plt.xlabel('Nunstück')       # x-axis caption
plt.ylabel('Slotermeyer')    # y-axis caption

Save most recent diagram:

In [None]:
plt.savefig('plot.png')
plt.savefig('plot.png',dpi=300)
plt.savefig('plot.svg')

# Thanks
      by
         Zabiullah Khan