# Python Data Analysis Cheat Sheet

## Common Libraries to Import

```py
# Core Libraries
import numpy as np
import pandas as pd

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

# Statistical Analysis
from scipy import stats
from scipy.stats import iqr, ttest_ind, pearsonr, trim_mean, chi2_contingency

# Machine Learning & Modeling
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, accuracy_score

# Data Preprocessing
from sklearn.preprocessing import StandardScaler, LabelEncoder
```

## Data Exploration

```py
df.head()               # First 5 rows
df.info()               # Data types & non-null counts
df.describe()           # Summary statistics
df.columns              # Column names
df.shape                # Rows and columns
df.isnull().sum()       # Missing values
df.column.value_counts()       # List the unique values in a column and the count of each value
df.column.value_counts(normalize=True)  # List the unique values in a column and the proportion of each value
pd.crosstab(df.column1, df.column2)     # Crosstab function from pandas to create a contingency table
crosstab_object / len(df)               # Converting the crosstab object to proportional values
crosstab_prop_object.sum(axis = 0)      # Marginal proportions of column2 (sums down the rows: one total per crosstab column)
crosstab_prop_object.sum(axis = 1)      # Marginal proportions of column1 (sums across the columns: one total per crosstab row)
```

## Data Cleaning

```py
df.dropna()                         # Drop missing values
df.fillna(value)                    # Fill missing values
df.duplicated().sum()               # Count duplicates
df.drop_duplicates(inplace=True)    # Remove duplicates

```

## Visualization

```py
sns.histplot(x = 'column', data = df)                   # Histogram
                                                        # Overlapping Histograms
plt.hist(df.column[df.category == 'category1'], color = 'blue', label = 'category1', density = True, alpha = 0.5)
plt.hist(df.column[df.category == 'category2'], color = 'red', label = 'category2', density = True, alpha = 0.5)
plt.legend()

sns.boxplot(x='column', data=df)                        # Single Boxplot
sns.boxplot(x = 'category', y = 'values', data = df)    # Side by Side Boxplots
sns.scatterplot(x='x', y='y', data=df)                  # Scatter plot
sns.heatmap(df.corr(numeric_only=True), annot=True)     # Correlation heatmap (numeric columns only; required in pandas >= 2.0 if df has non-numeric columns)
sns.countplot(x = 'column', data = df)                  # Bar chart
df.column.value_counts().plot.pie()                     # Pie chart
```

## Statistical Analysis

```py
df.column.mean()                    # Mean
df.column.median()                  # Median
df.column.mode()                    # Mode
trim_mean(df.column, proportiontocut=0.1)   # Trimmed mean
df.column.max() - df.column.min()   # Range
iqr(df['column'])                   # Interquartile range
df.column.var()                     # Variance
df.column.std()                     # Standard Deviation
(df.column - df.column.mean()).abs().mean()     # Mean Absolute Deviation (Series.mad() was removed in pandas 2.0)
np.cov(df.column1, df.column2)      # Covariance matrix (the covariance of the two columns is at [0][1])
corr, p = pearsonr(df.column1, df.column2)  # Pearson correlation coefficient and its p-value
chi2, pval, dof, expected = chi2_contingency(contingency_table)     # The Chi-Square Statistic
chi2 = chi2_contingency(contingency_table)[0]
pval = chi2_contingency(contingency_table)[1]
dof = chi2_contingency(contingency_table)[2]
expected = chi2_contingency(contingency_table)[3]

sample = np.random.choice(a, size = size, replace = True)           # Numpy function to draw a random sample of `size` values from `a` | replace = True samples with replacement, replace = False without

stats.binom.pmf(x, n, p)            # Calculate the PMF of the binomial distribution of any value | x = the value of interest | n = the num of trials | p = the probability of success
stats.binom.cdf(x, n, p)            # Calculate the CDF of the binomial distribution of any value | x = the value of interest (probability of this value or less) | n = the num of trials | p = the probability of success
stats.norm.cdf(x, loc, scale)       # Calculate the CDF of the normal distribution | x = the value of interest | loc = the mean of the probability distribution | scale = the standard deviation of the probability distribution
stats.poisson.pmf(k, mu)            # Calculate the PMF of the Poisson distribution | k = the number of events of interest | mu = the expected value (lambda; `lambda` itself is a reserved keyword in Python)
stats.poisson.cdf(k, mu)            # Calculate the CDF of the Poisson distribution | k = the number of events of interest (probability of this value or less) | mu = the expected value (lambda)
stats.poisson.rvs(mu, size = num_values)        # Generate random variates from the Poisson distribution | mu = the expected value (lambda)
np.var(distribution)                            # Calculate the variance using numpy (population variance, ddof=0, by default; pass ddof=1 for the sample variance)
min(distribution)                               # Smallest value in the distribution
max(distribution)                               # Largest value in the distribution
```

## Machine Learning Basics

```py
X = df.drop('target', axis=1)
y = df['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

model = LinearRegression()
model.fit(X_train, y_train)
predictions = model.predict(X_test)
```

## Preprocessing

```py
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

encoder = LabelEncoder()
df['encoded'] = encoder.fit_transform(df['categorical_column'])
```