# Python Data Analysis Cheat Sheet

## Common Libraries to Import

```py
# Core Libraries
import numpy as np
import pandas as pd

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import statsmodels.api as sm

# Statistical Analysis
# NOTE: binom_test was deprecated in SciPy 1.10 and removed in SciPy 1.12. On newer releases import
# binomtest instead (it returns a result object whose .pvalue attribute holds the p-value).
from scipy import stats
from scipy.stats import iqr, ttest_ind, pearsonr, trim_mean, chi2_contingency, ttest_1samp, binom_test

# Machine Learning & Modeling
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, accuracy_score

# Data Preprocessing
from sklearn.preprocessing import StandardScaler, LabelEncoder
```

## Data Exploration

```py
# Quick-look commands for a DataFrame named df
df.head()               # First 5 rows
df.info()               # Data types & non-null counts
df.describe()           # Summary statistics
df.columns              # Column names
df.shape                # Rows and columns
df.isnull().sum()       # Missing values
df.column.value_counts()       # List the unique values in a column and the count of each value
df.column.value_counts(normalize=True)  # List the unique values in a column and the proportion of each value
pd.crosstab(df.column1, df.column2)     # Crosstab function from pandas to create a contingency table (rows = column1 values, columns = column2 values)
crosstab_object / len(df)               # Converting the crosstab object to proportional values
crosstab_prop_object.sum(axis = 0)      # Sum down the rows: one marginal proportion per table column (assuming the table above, the values of column2)
crosstab_prop_object.sum(axis = 1)      # Sum across the columns: one marginal proportion per table row (assuming the table above, the values of column1)
```

## Data Cleaning

```py
df.dropna()                         # Drop missing values
df.fillna(value)                    # Fill missing values
df.duplicated().sum()               # Count duplicates
df.drop_duplicates(inplace=True)    # Remove duplicates

```

## Visualization

### Line Plot

#### Using `matplotlib`

```py
# Single line plot
plt.plot(x_axis_list, y_axis_list)
plt.show()

# Multi-line plot. Matplotlib will automatically place the two lines on the same axes and give them different colors if you call plt.plot() twice.
plt.plot(x_axis_list, y_axis_list)
plt.plot(x_axis_list, y_axis_list_2)
plt.show()

# Linestyles keywords
color = 'green' or '#AAAAAA'      # We can specify a different color for a line by using the keyword color with either an HTML color name or a Hex code
linestyle = '--' or ':' or ''       # Change the linestyle to dotted or dashed
marker = 'o' or 's' or '*'          # Add a line marker like a dot or a square

# Shaded error region
lower_bound_y = [i - 2 for i in y_axis_list]    # represents an error of 2 for each y value
upper_bound_y = [i + 2 for i in y_axis_list]
plt.fill_between(x_axis_list, lower_bound_y, upper_bound_y, alpha=0.2)      # this is the shaded region
plt.plot(x_axis_list, y_axis_list)      # this is the line itself
```

### Bar Chart

#### Using `matplotlib`

```py
# Bar Chart
plt.bar(x_axis_list, y_axis_list)

# Side by side bar chart
# The x positions of one dataset's bars are: [t*element + w*n for element in range(d)]
#   n = which dataset this is (1 for the first, 2 for the second, ...)
#   t = number of datasets
#   d = number of sets of bars
#   w = width of each bar

# China Data (blue bars)
n = 1  # This is our first dataset (out of 2)
t = 2  # Number of datasets
d = 7  # Number of sets of bars
w = 0.8  # Width of each bar
x_values1 = [t*element + w*n for element in range(d)]

# Second dataset (the bars drawn next to the first ones)
n = 2  # This is our second dataset (out of 2); t, d and w stay the same
x_values2 = [t*element + w*n for element in range(d)]

# Pass x_values1 and x_values2 as the x positions of two separate plt.bar() calls,
# one call per dataset, so the bars of each set sit next to each other.

# Stacked bar chart
plt.bar(x_axis_list, y_axis_list1)
plt.bar(x_axis_list, y_axis_list2, bottom=y_axis_list1)

# Error bars
plt.bar(x_axis_list, y_axis_list, yerr=error_value, capsize=10)     # yerr can be a single value (used on all bars) or a list of values, one for each bar; capsize sets the size of the caps drawn at the ends of each error bar
```

#### Using `Seaborn`

```py
# Using .countplot() - draws one bar per unique value, with the bar height equal to that value's count
sns.countplot(x=dataset)                    # pass a list/Series of values directly
sns.countplot(x='column', data=df)          # or name a column of a DataFrame

# Keywords (pass one of these to control the order of the bars)
order=df["victory_status"].value_counts(ascending=True).index       # for nominal data: order the bars by their counts
order=["First Year", "Second Year", "Third Year", "Fourth Year"]    # for ordinal data: list the categories in their natural order

# Using .barplot()
# x and y are passed by keyword: recent seaborn releases only accept `data` as a positional argument
sns.barplot(x=x_axis_list, y=y_axis_list)
```

### Pie Chart

#### Using `matplotlib`

```py
plt.pie(list_values)
plt.axis('equal')
plt.legend(list_names)

# Keywords
labels=list_names
autopct='%0.1f%%'       # set the "percentage" formatting
                        # '%0.2f' — 2 decimal places, like 4.08
                        # '%0.2f%%' — 2 decimal places, but with a percent sign at the end, like 4.08%. You need two consecutive percent signs because the first one acts as an escape character, so that the second one gets displayed on the chart.
                        # '%d%%' — rounded to the nearest int and with a percent sign at the end, like 4%.
```

### Histogram

#### Using `matplotlib`

```py
plt.hist(dataset)       # Creates histogram with 10 bins by default
plt.hist(dataset, range=(min, max), bins=num_bins)      # limits the histogram to a certain range of the dataset and show a certain num of bins

# Overlapping Histograms
# Plot a different dataset in each call; alpha makes the bars see-through so both stay visible,
# and density=True normalizes each histogram so datasets of different sizes are comparable.
plt.hist(dataset1, color = 'blue', label = 'category1', density = True, alpha = 0.5)
plt.hist(dataset2, color = 'red', label = 'category2', density = True, alpha = 0.5)
plt.legend()        # the label= values are only displayed once a legend is added

# Keywords
histtype='step'     # use this to generate an outline of the histogram
```

#### Using `Seaborn`

```py
sns.histplot(x = 'column', data = df)                   # Histogram
```

### Plot Formatting

#### Using `matplotlib`

```py
# Axis and labels
plt.axis([x_min, x_max, y_min, y_max])      # Change the scale of the axes
plt.xlabel('x_axis_label')          # Label the x axis
plt.ylabel('y_axis_label')          # Label the y axis
plt.title('plot_title')             # Set the title of the plot

# Subplots
# plt.subplot(num_rows, num_cols, index) selects one cell of a num_rows x num_cols grid (index starts at 1).
# Any plt.plot() which comes after plt.subplot() will create a line plot in the specified subplot.

# First Subplot
plt.subplot(1, 2, 1)
plt.plot(x, y, color='green')
plt.title('First Subplot')

# Second Subplot
plt.subplot(1, 2, 2)
plt.plot(x, y, color='steelblue')
plt.title('Second Subplot')

# Adjust the spacing before displaying the figure.
# Keywords: left, right, bottom, top (margins, as fractions of the figure size) and
# wspace, hspace (horizontal / vertical space between subplots).
# left must be smaller than right and bottom smaller than top, otherwise matplotlib raises an error.
plt.subplots_adjust(wspace=0.35, bottom=0.2)

# Display both subplots
plt.show()

# Legends
plt.legend(['plot 1', 'plot 2'], loc=0)     # Keyword 'loc' positions the legend on the figure. 0 = 'best'

# Axes
ax = plt.subplot()      # keep a reference to the axes object so its ticks can be customized
ax.set_xticks([1, 2, 4])
ax.set_yticks([0.1, 0.6, 0.8])
ax.set_yticklabels(['10%', '60%', '80%'], rotation=30)      # use the rotation keyword to rotate labels by a number of degrees

# Figures
plt.figure(figsize=(4, 10))                 # Set the size of the figure
plt.plot(x, parabola)
plt.savefig('tall_and_narrow.png')          # Save the figure to a specific format

```

#### Using `Seaborn`

```py
sns.boxplot(x='column', data=df)                        # Single Boxplot
sns.boxplot(x = 'category', y = 'values', data = df)    # Side by Side Boxplots
sns.scatterplot(x='x', y='y', data=df)                  # Scatter plot
sns.heatmap(df.corr(), annot=True)                      # Correlation heatmap

df.column.value_counts().plot.pie()                     # Pie chart
```

### Linear Regression Model

```py
# The formula reads "outcome ~ predictor": the variable left of `~` is modeled as a function of the one on the right
model = sm.OLS.from_formula('weight ~ height', data = body_measurements)        # Ordinary Least Squares (OLS) function
results = model.fit()                                   # Fit the model to the data
print(results.summary())                                # Full regression report
print(results.params)                                   # Just the fitted coefficients (intercept and slope)

newdata = {'height':[160]}                              # make a new set of data
print(results.predict(newdata))                         # Use the model to make a prediction given a new set of data

fitted_values = results.predict(body_measurements)      # We can calculate the fitted values using .predict() by passing in the original data.
residuals = body_measurements.weight - fitted_values    # Residuals are the differences between each fitted value and the true value
```

## Statistical Analysis

```py
df.column.mean()                    # Mean
df.column.median()                  # Median
df.column.mode()                    # Mode
trim_mean(df.column, proportiontocut=0.1)   # Trimmed mean (drops the lowest and highest 10% of values first)
df.column.max() - df.column.min()   # Range
iqr(df['column'])                   # Interquartile range
df.column.var()                     # Variance
df.column.std()                     # Standard Deviation
(df.column - df.column.mean()).abs().mean()     # Mean Absolute Deviation (Series.mad() was removed in pandas 2.0, so compute it directly)
np.cov(df.column1, df.column2)      # Covariance matrix; the covariance of the two columns is the off-diagonal entry [0][1]
corr, p = pearsonr(df.column1, df.column2)   # Pearson correlation coefficient and its p-value
chi2, pval, dof, expected = chi2_contingency(contingency_table)     # The Chi-Square Statistic
# The same four results can also be picked out by position:
chi2 = chi2_contingency(contingency_table)[0]       # the Chi-Square statistic
pval = chi2_contingency(contingency_table)[1]       # the p-value of the test
dof = chi2_contingency(contingency_table)[2]        # the degrees of freedom
expected = chi2_contingency(contingency_table)[3]   # the table of expected frequencies

random_values = np.random.choice(given_values, size = size, replace = True)     # Numpy function to draw `size` random values from given_values | replace = True samples with replacement, replace = False without

stats.binom.pmf(x, n, p)            # Calculate the PMF of the binomial distribution of any value | x = the value of interest | n = the num of trials | p = the probability of success
stats.binom.cdf(x, n, p)            # Calculate the CDF of the binomial distribution of any value | x = the value of interest (probability of this value or less) | n = the sample size | p = the probability of success
stats.norm.cdf(x, loc, scale)       # Calculate the CDF of the normal distribution | x = the value of interest | loc = the mean of the probability distribution | scale = the standard deviation of the probability distribution
# `lambda` is a reserved word in Python, so the expected value is named `lam` here
stats.poisson.pmf(k, lam)           # Calculate the PMF of the Poisson distribution | k = the value of interest (number of events) | lam = the expected value
stats.poisson.cdf(k, lam)           # Calculate the CDF of the Poisson distribution | k = the value of interest (probability of this value or less) | lam = the expected value
stats.poisson.rvs(lam, size = num_values)       # Generate random variates from the Poisson distribution
np.var(distribution)                            # Calculate the variance of a set of values using numpy (divides by n by default; pass ddof=1 to divide by n - 1)
min(distribution)                               # Smallest value
max(distribution)                               # Largest value

tstat, pval = ttest_1samp(sample_distribution, expected_mean)       # Implementing a one-sample T-Test
# Two-sided (2-sided) Binomial Testing with SciPy.
# binom_test() was removed in SciPy 1.12; binomtest() replaces it and returns a result object whose .pvalue holds the p-value.
p_value = stats.binomtest(num_observed_successes, n=num_of_trials, p=expected_probability_of_success).pvalue
# use `alternative='less'` (or `alternative='greater'`) to run a 1-sided test
```

### Central Limit Theorem (CLT)
The `population` object is a list containing all wages in the full population.

In each iteration of the loop, we do the following:
- take a random sample of 150 wages from the population
- store the sample mean in a list called `sample_means`

After collecting 10k sample means, we inspect them using a histogram.
```py
# Simulate the sampling distribution of the mean by repeatedly sampling from `population`
import numpy as np
import matplotlib.pyplot as plt
import random

sample_means = []

for i in range(10000):                          # the for loop which iterates 10k times to collect 10k sample means
    samp = random.sample(population, 150)       # using `random.sample()` to draw 150 individual wages from `population` (without replacement)
    sample_means.append(np.mean(samp))          # append the mean of each sample to the `sample_means` list object

plt.hist(sample_means, bins = 30)               # histogram of the 10k sample means
plt.vlines(np.mean(sample_means), 0, 1000, lw=3, linestyles='dashed')   # dashed vertical line from y=0 to y=1000 at the mean of the sample means
```

While a researcher or data scientist probably does not know the population standard deviation, they can use the standard deviation of their sample to estimate it.

Let’s return to the data scientist who collected a single sample of 150 wages and calculated an average wage of 17.74 dollars. To quantify the uncertainty around this sample mean, this data scientist can first estimate the standard error:
```py
std_error = np.std(my_sample)/(150 ** 0.5)      # the standard error of the sample mean is the population standard deviation divided by the square root of the sample size; here the standard deviation of the sample stands in for the unknown population value
                                                # n ** 0.5 is an expression for square root
print(std_error)
# output: 1.275
```

Then, leveraging the part of the CLT that says the sampling distribution is normally distributed, our data scientist can use a nifty property of normal distributions: 95% of normally distributed values are within about 1.96 standard deviations of the mean. This allows the data scientist to estimate the width of the sampling distribution above, without actually knowing the population distribution!

First, the data scientist needs to multiply 1.96 by the estimated standard error: 1.96 * 1.275 = 2.50. The interpretation of this number is as follows:
- Imagine taking a large number of samples of size 150 from a population with the same amount of variation as in the observed sample.
- The means of about 95% of those samples would be within about 2.50 dollars of the true population mean.
- Therefore, there is about a 95% probability that the observed sample mean of 17.74 is no more than 2.50 dollars away from the population mean. In other words, there is about a 95% probability that the population mean is between 15.24 and 20.24. This is referred to as a 95% confidence interval.

Note that the estimate of 2.50 is similar to the value of 2.87 that we calculated as all-knowing beings with access to the full population. With only a single sample in hand, the data scientist can express the uncertainty in their sample mean fairly accurately!

### Writing a Binomial Test Function
```py
import numpy as np
import pandas as pd
from scipy.stats import binomtest   # replaces binom_test, which was removed in SciPy 1.12


def simulation_binomial_test(observed_successes, n, p, num_simulations=10000):
    """Estimate a one-sided ("less") binomial-test p-value by simulation.

    Simulates many experiments under the null hypothesis and reports how
    often they produce a result at least as low as the observed one.

    Args:
        observed_successes: number of successes actually observed.
        n: number of trials in one experiment (e.g. visitors in a month).
        p: probability of success on a single trial under the null hypothesis.
        num_simulations: how many experiments to simulate; more simulations
            give a more stable estimate.

    Returns:
        The proportion of simulated experiments whose number of successes is
        less than or equal to observed_successes.
    """
    # generate the simulated null distribution: each entry is the number of
    # successes in one experiment of n trials. A single vectorised draw replaces
    # simulating every trial of every experiment one at a time.
    null_outcomes = np.random.binomial(n, p, size=num_simulations)

    # calculate a 1-sided p-value: the share of simulated outcomes at or below the observed value
    p_value = np.mean(null_outcomes <= observed_successes)

    # return the p-value
    return p_value


# Test the function: the simulation should give a very similar answer to the binomtest function from scipy

p_value1 = simulation_binomial_test(45, 500, .1)
print("simulation p-value: ", p_value1)

# binomtest returns a result object; its .pvalue attribute is the p-value
p_value2 = binomtest(45, n=500, p=.1, alternative = 'less').pvalue
print("binom_test p-value: ", p_value2)
```

## Machine Learning Basics

```py
X = df.drop('target', axis=1)
y = df['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

model = LinearRegression()
model.fit(X_train, y_train)
predictions = model.predict(X_test)
```

## Preprocessing

```py
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

encoder = LabelEncoder()
df['encoded'] = encoder.fit_transform(df['categorical_column'])
```