# Cheatsheet

## 0. Load Packages

In [1]:
# Importing some common libraries that are needed for most data science projects
import numpy as np
import pandas as pd
import math
import scipy


# Importing different modules from the sklearn library to build and evaluate the linear regression model
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score


# Importing matplotlib and seaborn libraries for data visualisation 
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline


# Switching off unnecessary warning messages 
import warnings
warnings.filterwarnings('ignore')

## 1. Import Data

In [2]:
# Load the house-price records from a CSV file into a DataFrame
house = pd.read_csv(filepath_or_buffer="HousePrices.csv")
house  # bare last expression -> rendered as the cell output

Unnamed: 0,HouseSqft,Taxes,Bedrooms,Bathrooms,LastSoldPrice
0,1240.0,1360,3,2.0,145000
1,370.0,1050,1,1.0,68000
2,,1010,3,1.5,115000
3,1120.0,830,3,2.0,69000
4,1710.0,2150,3,2.0,163000
...,...,...,...,...,...
95,1060.0,990,2,2.0,176000
96,1730.0,3030,3,2.0,196500
97,1370.0,1580,3,2.0,132200
98,1560.0,1770,3,2.0,88400


In [None]:
# Load one named worksheet of an Excel workbook into a DataFrame
churn = pd.read_excel(
    io="Telco Customer Churn - Training Dataset.xlsx",
    sheet_name="Telco Customer Churn",
)
churn

## 2. Data Quality

In [None]:
# Peek at the data. Run as a single cell, only the last expression is rendered;
# run the lines separately to see each result.
house.head(n=5)   # first five rows (n=5 is the default)
house.tail(n=5)   # last five rows (n=5 is the default)
house.head(n=20)  # a specific number of records

In [None]:
# Summary statistics

# Default call: numerical columns only
df.describe()

# include="all": every column, categorical ones included
df.describe(include="all")

In [1]:
# Print a concise summary of the DataFrame.
# NOTE(review): the saved output under this cell is a stale NameError about `churn`,
# which this cell never references — re-run the cell to refresh it.
df.info()

NameError: name 'churn' is not defined

In [None]:
# How many rows are full duplicates of an earlier row?
df.duplicated().sum()

# Rows whose "FirstName" repeats one seen earlier
dupes = df.duplicated(subset=["FirstName"])
df.loc[dupes]

In [None]:
# Missing values per column (isnull and isna are two spellings of the same check)
df.isnull().sum()
df.isna().sum()

# Rows where "Column" is missing
missing = df["Column"].isnull()
df.loc[missing]

In [None]:
# Slicing or filtering columns: index with a list of column names

churn[["Gender", "Tenure"]]

In [None]:
# Slicing or filtering columns combined with a row criterion

churn[churn["Tenure"] > 40][["Gender", "Tenure"]]

In [None]:
# Multiple conditions: wrap each comparison in parentheses, combine with & (and) / | (or)

# and
churn[(churn["Tenure"] > 40) & (churn["Gender"] == "Male")][["Gender", "Tenure"]]

# or
churn[(churn["Tenure"] > 40) | (churn["Gender"] == "Male")][["Gender", "Tenure"]]

In [None]:
# .loc slices by label and includes BOTH endpoints (0:4 -> labels 0 through 4)

churn.loc[0:4, ["Gender", "Tenure"]]

In [None]:
# .loc with a boolean row mask and a list of columns
churn.loc[churn["Tenure"] > 40, ["Gender", "Tenure"]]

## 3. Data Cleansing

In [None]:
# Remove duplicated rows, keeping the first occurrence of each
df = df.drop_duplicates(keep="first")

In [None]:
# Fix missing data by removal: drop every row that contains at least one null
df = df.dropna(axis=0, how="any")

In [None]:
# Fix Missing Data - Imputation / Interpolation
# Every call below returns a NEW DataFrame and leaves `churn` unchanged;
# assign the result (e.g. churn_filled = churn.fillna(0)) to keep it.

# CONSTANT FILL - NULLS DEFAULT TO ZERO
churn.fillna(0)


# FORWARD FILL - carry the last valid value forward
# (fillna(method='ffill') is deprecated since pandas 2.1; .ffill() is the replacement)
churn.ffill()


# BACKWARD FILL - pull the next valid value backward
# (fillna(method='bfill') is deprecated since pandas 2.1; .bfill() is the replacement)
churn.bfill()


# CONSTANT FILL - NULLS DEFAULT TO ZERO - ALTERNATIVE TO FILLNA
churn.replace(to_replace=np.nan, value=0)


# INTERPOLATION - LINEAR (estimate each gap from its neighbouring values)
# NOTE(review): recent pandas versions reject object-dtype columns here — if `churn`
# holds text columns, select the numeric columns first. TODO confirm against the data.
churn.interpolate(method='linear', limit_direction='forward')

In [None]:
# JOINS

# Left join: keep every row of CustomerInfo and attach matching OrderInfo rows.
# The key is named differently on each side: "CustomerID" (left) vs "CusID" (right).
# Set how= to "inner", "right" or "outer" for the other join types.

df = pd.merge(
    left=CustomerInfo,
    right=OrderInfo,
    how="left",
    left_on="CustomerID",
    right_on="CusID",
)

In [None]:
# UNIONS

# Read the 2020 workbook
FY2020 = pd.read_excel(io="2020.xlsx")


# Read the 2021 workbook
FY2021 = pd.read_excel(io="2021.xlsx")


# Stack the two frames row-wise; each frame keeps its own index labels
all_rows = pd.concat(objs=[FY2020, FY2021])

In [None]:
# CHANGE DATATYPES

# Parse to datetime with pd.to_datetime: astype("datetime64") without a unit
# raises TypeError on pandas >= 2.0.
df["timestamp"] = pd.to_datetime(df["timestamp"])
df["year"] = df["year"].astype("category")
df["t1"] = df["t1"].astype("float64")
df['DataFrame Column'] = df['DataFrame Column'].astype(str)

## 4. EDA

### A. Aggregation

In [None]:
# Averages across all numerical data
# numeric_only=True skips text/categorical columns; pandas >= 2.0 no longer drops
# them silently and raises TypeError instead.

df.mean(numeric_only=True)

In [None]:
# Total of one specific column ("column" is a placeholder name)

df["column"].sum()

In [None]:
# Several statistics for one column in a single call: agg() with a list of names

df["column"].agg(["sum", "mean"])

In [None]:
# Several statistics for several columns: map each column to its list of aggregations

df.agg(
    {
        "Tenure": ["mean", "count"],
        "MonthlyCharges": ["mean", "count"],
    }
)

### B. Distribution

In [None]:
# Distribution using groupby(): mean of each numeric column per Gender
# numeric_only=True is required on pandas >= 2.0, where averaging a text column
# raises TypeError instead of being skipped.

churn.groupby(by="Gender").mean(numeric_only=True)

In [None]:
# Grouping by multiple columns
# To extend groupby() to work with multiple grouping variables,
# pass a list of column names to groupby() instead of a single string value.
# numeric_only=True keeps the median to numeric columns; pandas >= 2.0 raises
# TypeError on text columns instead of skipping them.

churn.groupby(by=["Gender", "PaymentMethod"]).median(numeric_only=True)

In [None]:
# agg() on a grouped frame: mean / min / max of MonthlyCharges per Gender,
# then drop any group row that contains a null

(
    churn
    .groupby(by="Gender")
    .agg({"MonthlyCharges": ["mean", "min", "max"]})
    .dropna()
)

### C. Correlation

### D. Visualisation

In [None]:
# Box plot of the HouseSqft column (seaborn) to spot outliers visually

sns.boxplot(data=df.loc[:, ["HouseSqft"]])
plt.show()