# Continuous_probabilistic_methods

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as stats




Define a function named get_lower_and_upper_bounds that has two arguments. The first argument is a pandas Series. The second argument is the multiplier, which should have a default argument of 1.5.

In [None]:
def get_lower_and_upper_bounds(series, multiplier=1.5):
    """
    Compute the IQR-rule fences for a pandas Series.

    Parameters
    ----------
    series : pandas.Series
        Values to evaluate.
    multiplier : float, default 1.5
        How many IQRs below Q1 / above Q3 the fences are placed.

    Returns
    -------
    tuple
        (lower_bound, upper_bound)
    """
    # first and third quartiles of the data
    first_quartile = series.quantile(0.25)
    third_quartile = series.quantile(0.75)

    # distance from each quartile to its fence
    fence_offset = multiplier * (third_quartile - first_quartile)

    return first_quartile - fence_offset, third_quartile + fence_offset

In [None]:
def get_iqr(series, multiplier=1.5):
    """
    Return the interquartile range (Q3 - Q1) of a pandas Series.

    `multiplier` is accepted but not used in the calculation.
    """
    lower_quartile, upper_quartile = series.quantile(0.25), series.quantile(0.75)
    return upper_quartile - lower_quartile

# 1. Using lemonade.csv dataset and focusing on continuous variables:

In [None]:
# Read the lemonade data from a CSV in the working directory and preview the first rows.
df = pd.read_csv('lemonade.csv')
df.head()

### Set the Date column to be the datetime index

In [None]:
# Parse the Date column into datetimes, then promote it to the index.
df['Date'] = pd.to_datetime(df['Date'])
df = df.set_index('Date')

In [None]:
# Add a 'month' column holding the month name derived from the datetime index.
df['month'] = df.index.month_name()

In [None]:
# Preview the frame after re-indexing and adding the month column.
df.head()

In [None]:
# Count missing values per column.
df.isna().sum()

In [None]:
# Summary statistics for the numeric columns.
df.describe()

In [None]:
# Scatter every continuous variable against month to eyeball outliers.
# Each figure is titled and labelled so it can be read on its own.
columns = ['Temperature', 'Rainfall', 'Flyers', 'Price', 'Sales']
for col in columns:
    fig, ax = plt.subplots()
    ax.scatter(df['month'], df[col])
    ax.set(title=f'{col} by month', xlabel='Month', ylabel=col)
    plt.show()

### * Use the IQR Range Rule and the upper and lower bounds to identify the lower outliers of each column of lemonade.csv, using the multiplier of 1.5. Do these lower outliers make sense? Which outliers should be kept?

In [None]:
# Print the fences (default multiplier) for each continuous column.
columns = ['Temperature', 'Rainfall', 'Flyers', 'Price', 'Sales']
for col, values in df[columns].items():
    lower_bound, upper_bound = get_lower_and_upper_bounds(values)
    print(f'{col}--lower bound: {lower_bound}, upper bound: {upper_bound}')
    

In [None]:
    
# For each continuous column, print its fences and the rows that fall
# below the lower fence (default multiplier).
columns = ['Temperature', 'Rainfall', 'Flyers', 'Price', 'Sales']
for col, values in df[columns].items():
    lower_bound, upper_bound = get_lower_and_upper_bounds(values)
    print(f'{col}--lower bound: {lower_bound}, upper bound: {upper_bound}')

    is_below = values < lower_bound
    lower = df[is_below]
    print(lower)
    print('____________________')

In [None]:
# Rows with fewer than 4 flyers.
# NOTE(review): 4 is presumably the Flyers lower bound printed above — confirm.
df.loc[df['Flyers'] < 4]

**A negative number of Flyers does not make sense. Any negative number of flyers should not be kept.**

In [None]:
# Rows with a temperature under 16.7.
# NOTE(review): 16.7 is presumably the Temperature lower bound printed above — confirm.
df.loc[df['Temperature'] < 16.7]

**Temperature that is 15.1 makes sense and can be kept as an outlier**

The lower outlier that should be taken out is a negative number of flyers. All other lower bound outliers check out at this moment and should be kept.

### * Use the IQR Range Rule and the upper and lower bounds to identify the upper outliers of each column of lemonade.csv, using the multiplier of 1.5. Do these upper outliers make sense? Which outliers should be kept?

In [None]:
    
# For each continuous column, print the upper fence (default multiplier)
# and the rows that exceed it.
columns = ['Temperature', 'Rainfall', 'Flyers', 'Price', 'Sales']
for col, values in df[columns].items():
    lower_bound, upper_bound = get_lower_and_upper_bounds(values)
    print(f'{col} upper bound: {upper_bound}')

    is_above = values > upper_bound
    upper = df[is_above]
    print(upper)
    print('____________________')

In [None]:
# Rows with a temperature over 104.7.
# NOTE(review): 104.7 is presumably the Temperature upper bound printed above — confirm.
df.loc[df['Temperature'] > 104.7]

**Temperature of 212 seems like a mistake and should not be kept as an outlier.**

In [None]:
# Rows with rainfall over 1.3.
# NOTE(review): 1.3 is presumably the Rainfall upper bound printed above — confirm.
df.loc[df['Rainfall'] > 1.3]

**Rainfall greater than 1.3 can be a normal occurrence, so these outliers should be kept.**

In [None]:
# Rows with more than 76 flyers.
# NOTE(review): 76 is presumably the Flyers upper bound printed above — confirm.
df.loc[df['Flyers'] > 76]

**Distributing more than 76 flyers can be a normal occurrence, so these outliers should be kept.**

In [None]:
# Rows with sales over 45.
# NOTE(review): 45 is presumably the Sales upper bound printed above — confirm.
df.loc[df['Sales'] > 45]

**Sales greater than 45 can occur. A sale of 524 seems unlikely, but it is still possible. These outliers should be kept.**

The only upper outlier that should be removed is the temperature of 212. All other outliers check out at this moment in time.

### * Using the multiplier of 3, IQR Range Rule, and the lower bounds, identify the outliers below the lower bound in each column of lemonade.csv. Do these lower outliers make sense? Which outliers should be kept?

In [None]:
# Same lower-fence check as before, but with the multiplier widened to 3.
columns = ['Temperature', 'Rainfall', 'Flyers', 'Price', 'Sales']
for col, values in df[columns].items():
    lower_bound, upper_bound = get_lower_and_upper_bounds(values, multiplier=3)
    print(f'{col}--lower bound: {lower_bound}')

    is_below = values < lower_bound
    lower = df[is_below]
    print(lower)
    print('____________________')

**Using a multiplier of 3, a negative number of Flyers still does not make sense in the real world and should be removed from the dataset.**

### * Using the multiplier of 3, IQR Range Rule, and the upper bounds, identify the outliers above the upper_bound in each column of lemonade.csv. Do these upper outliers make sense? Which outliers should be kept?

In [None]:
# Upper-fence check with the multiplier widened to 3: for each continuous
# column, print the upper bound and the rows that exceed it.
columns = ['Temperature', 'Rainfall', 'Flyers', 'Price', 'Sales']
for col in columns:
    lower_bound, upper_bound = get_lower_and_upper_bounds(df[col], 3)
    print(f'{col}--upper bound: {upper_bound}')

    # Named `upper` (not `lower`) because these rows lie above the upper bound.
    upper = df[df[col] > upper_bound]
    print(upper)
    print('____________________')

**When 3 is used as the multiplier for the upper bound, the only outlier that does not check out is the temperature of 212; all other outliers are plausible in the real world. The temperature outlier should be removed and all others can be kept at this moment.**

## 2. Identify if any columns in lemonade.csv are normally distributed. For normally distributed columns:

* Use a 2 sigma decision rule to isolate the outliers.

    * Do these make sense?
    * Should certain outliers be kept or removed?

In [None]:
# Z-score outlier check for a single column.
# NOTE(review): this cell previously used `x` without defining it, which fails
# on a fresh kernel. Temperature is used here as the example column; swap in
# any numeric column of df.
x = df['Temperature']

# Calculate the z-score
zscores = (x - x.mean()) / x.std()

# Observations two standard deviations or more from the mean.
# display() is used so this result is shown even though it is not the last expression.
display(x[zscores.abs() >= 2])

# Observations three standard deviations or more from the mean.
display(x[zscores.abs() >= 3])

In [None]:
# Z scores greater than 2
for col in columns:
    print(col)
    z_scores = (df[col] - df[col].mean()) / df[col].std()
    display(pd.DataFrame(z_scores))
    df['zscores'] = z_scores
    display(pd.DataFrame(df[df['zscores'].abs() >=2][col]))
    sns.histplot(df[col])
    plt.title(f'{col} Histogram' )
    plt.show();

**With Z-score set to 2 some values that are outside of this bound seem likely and a few seem unlikely.**

# 3. Now use a 3 sigma decision rule to isolate the outliers in the normally distributed columns from lemonade.csv

In [None]:
# Z scores greater than 3
for col in columns:
    print(col)
    z_scores = (df[col] - df[col].mean()) / df[col].std()
    display(pd.DataFrame(z_scores))
    df['zscores'] = z_scores
    display(pd.DataFrame(df[df['zscores'].abs() >=3][col]))
    sns.histplot(df[col])
    plt.show();

**With the z-score threshold set at 3, the outliers of a temperature of 212 and a negative number of flyers should be removed. There are other outliers in sales and rainfall beyond a z-score of 3, but it is plausible that these occurred.**

### Look into outliers

Temperature of 212 is above the normal, likely a typo. Will be corrected by replacing value with previous day

In [None]:
# Temperature by month, to locate the suspicious reading.
plt.scatter(df['month'], df['Temperature'])

In [None]:
# Inspect the rows from 2017-02-09 through 2017-02-13 before the temperature correction.
df.loc['2017-02-09':'2017-02-13']

In [None]:
# Overwrite every Temperature reading equal to 212 with the Temperature
# recorded on 2017-02-10.
# NOTE(review): the notes call this the "previous day", which presumes the
# 212 reading falls on 2017-02-11 — confirm against the rows displayed above.
df.loc[df.Temperature ==212, 'Temperature']=df.loc['2017-02-10'].Temperature

In [None]:
# Re-display 2017-02-09 through 2017-02-13 to check the corrected temperature.
df.loc['2017-02-09':'2017-02-13']

Rainfall: it seems possible that rainfall would reach 2.5 around that time of the month.

In [None]:
# Rainfall by month on a wider canvas.
plt.figure(figsize=(10, 6))
plt.scatter(df['month'], df['Rainfall'])

In [None]:
# Rows with rainfall over 2.0.
df.loc[df['Rainfall'] > 2.0]

In [None]:
# Inspect the rows from 2017-12-28 through 2017-12-31 around the high-rainfall reading.
df.loc['2017-12-28':'2017-12-31']

Sales: the large sales values seem to be attributable to July 4th.

In [None]:
# Sales by month on a wider canvas.
plt.figure(figsize=(10, 6))
plt.scatter(df['month'], df['Sales'])

In [None]:
# Rows with sales over 50.
df.loc[df['Sales'] > 50]

In [None]:
# Inspect the rows from 2017-07-01 through 2017-07-05 around the sales spike.
df.loc['2017-07-01':'2017-07-05']

Flyers with a negative number are probably a typo

In [None]:
# Flyers by month on a wider canvas.
plt.figure(figsize=(10, 6))
plt.scatter(df['month'], df['Flyers'])

In [None]:
# Inspect the rows from 2017-11-18 through 2017-11-25.
# NOTE(review): presumably the window containing the negative Flyers value — confirm.
df.loc['2017-11-18':'2017-11-25']

In [None]:
df.Flyer = df.Flyers.abs()

In [None]:
df.Flyer.describe()

### create dict for outliers

In [None]:
# Build a lookup keyed by numeric column name:
# {'bounds': {'upper': ..., 'lower': ...}, 'df': rows outside those bounds}
outliers = {}
for col in df.columns:
    # skip object/string columns
    if not np.issubdtype(df[col].dtype, np.number):
        continue

    lower_bound, upper_bound = get_lower_and_upper_bounds(df[col])
    print(f'Lower Bounds for {col} : {lower_bound}\nUpper bounds for {col}: {upper_bound}\n')
    print('____________________')

    outside_bounds = (df[col] > upper_bound) | (df[col] < lower_bound)
    outliers[col] = {
        'bounds': {'upper': upper_bound, 'lower': lower_bound},
        'df': df[outside_bounds],
    }

In [None]:
# Rows stored as Temperature outliers in the dict built above.
outliers['Temperature']['df']

In [None]:
# Rows stored as Flyers outliers in the dict built above.
outliers['Flyers']['df']