In [1]:
# for vectorized operations
import numpy as np

# for dataframe manipulation
import pandas as pd

# for vizualizations
import matplotlib.pyplot as plt
import seaborn as sns

# for statistical calculations
import scipy.stats as stats

# for obtaining stock datasets
from pydataset import data

# for manipulation of time data
from datetime import date

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
import sklearn as sk

# filter out warnings
# NOTE(review): 'ignore' with no category silences every warning for the rest of
# the session, so deprecation and chained-assignment warnings raised by later
# cells will not be displayed.
import warnings
warnings.filterwarnings('ignore')

# our own functions for accessing our sql database
from env import get_db_url, user, password, host

# our own scripts
import acquire
import prepare
import explore

# pandas display preferences: never truncate the column list, and show floats
# with 3 digits after the decimal point
pd.set_option('display.max_columns', None)
pd.set_option('display.precision', 3)
# To lift the row limit for one display only, use the context manager form:
#   with pd.option_context('display.max_rows', None): display(frame)


#### 4. Use a python module (pydataset or seaborn datasets) containing datasets as the source for the iris data. Create a pandas dataframe, df_iris, from this data.

In [None]:
# Load the iris dataset through pydataset's `data` helper (imported in the first cell).
# NOTE(review): the exercise text asks for the name `df_iris`; the cells below read `df`.
df = data('iris')

#### 4a.     print the first 3 rows


In [None]:
# Preview the first three rows of the iris frame.
df.head(3)

#### 4b.     print the number of rows and columns (shape)


In [None]:
# (rows, columns) of the iris frame.
df.shape

#### 4c.     print the column names


In [None]:
# Emit every column label of the iris frame, one per line.
for name in df.columns.tolist():
    print(name)

#### 4d. print the data type of each column

In [None]:
# Data type of each iris column.
df.dtypes

#### 4e.     print the summary statistics for each of the numeric variables


In [None]:
# Summary statistics; describe() reports on the numeric columns by default.
df.describe()

#### 5. Read the Table1_CustDetails table from your spreadsheet exercises google sheet into a dataframe named df_google_sheets.

In [None]:
# CSV export endpoint for the spreadsheet-exercises workbook.
# The worksheet id has to travel as a query parameter (`&gid=`): everything after
# a `#` is a URL fragment, which the client never sends to the server, so with
# the previous `#gid=` form the server could not see which worksheet was wanted.
url = 'https://docs.google.com/spreadsheets/d/1gb4xDK4WmoM0kBTOiurSNZzz3cQE1IVnRQ59YsUmTmw/export?format=csv&gid=1023018493'
df_google_sheets = pd.read_csv(url)

In [None]:
# Preview the first three rows read from the Google Sheets CSV export.
df_google_sheets.head(3)

#### 5a. assign the first 100 rows to a new dataframe, df_google_sheets_sample

In [None]:
# Keep the first 100 rows (positionally) as a smaller working sample.
df_google_sheets_sample = df_google_sheets.iloc[:100]

#### 5b. print the number of rows of your original dataframe

In [None]:
# Number of rows in the full (unsampled) frame.
len(df_google_sheets)

#### 5c. print the first five column names

In [None]:
# Emit the first five column labels, one per line.
for name in df_google_sheets.columns.tolist()[:5]:
    print(name)

#### 5d. print the column names that have a data type of object

In [None]:
# Emit the label of every column whose dtype compares equal to 'object'.
object_columns = [name
                  for name, dtype in df_google_sheets.dtypes.items()
                  if dtype == 'object']
for name in object_columns:
    print(name)

#### 5e. compute the range for each of the numeric variables

In [None]:
# Range (max - min) of every numeric column.
numeric_columns = df_google_sheets.select_dtypes('number')
ranges = numeric_columns.max() - numeric_columns.min()
ranges

#### 6. Download your spreadsheet exercises google sheet as an excel file (File → Download → Microsoft Excel). Read the Table1_CustDetails worksheet into a dataframe named df_excel.

In [None]:
# Read the Table1_CustDetails worksheet from the local Excel export, then preview three rows.
# NOTE(review): the path is relative, so the notebook presumably has to run from
# the folder that contains df_excel.xlsx — confirm.
df_excel = pd.read_excel('df_excel.xlsx', sheet_name='Table1_CustDetails')
df_excel.head(3)

#### 6a. assign the first 100 rows to a new dataframe, df_excel_sample

In [None]:
# Keep the first 100 rows (positionally) as a smaller working sample.
df_excel_sample = df_excel.iloc[:100]

#### 6b. print the number of rows of your original dataframe

In [None]:
# Number of rows in the full Excel frame.
len(df_excel)

#### 6c. print the first 5 column names

In [None]:
# Emit the first five column labels of the Excel frame, one per line.
for name in df_excel.columns.tolist()[:5]:
    print(name)

#### 6d. print the column names that have a data type of object

In [None]:
# Section 6 is about the Excel frame, so inspect df_excel (this cell previously
# re-inspected df_google_sheets from section 5).
for column in df_excel.select_dtypes(include=[object]).columns:
    print(column)

#### 6e. compute the range for each of the numeric variables.

In [None]:
# Range (max - min) of every numeric column of the Excel frame.
# Selecting 'number' (rather than excluding object) keeps bool and datetime
# columns out of the subtraction, and the frame is df_excel because this is
# section 6 (the cell previously reused df_google_sheets).
numeric_columns = df_excel.select_dtypes('number')
ranges = numeric_columns.max() - numeric_columns.min()
ranges

#### 7. read the data from the given google sheet into a dataframe, df_google

In [None]:
# Spreadsheet key and worksheet name used to build the download URL.
sheet_id = '1Uhtml8KY19LILuZsrDtlsHHDC9wuDGUSe8LTEwvdI5g'
sheet_name = 'train'

# Build the gviz CSV URL for that worksheet.
# NOTE(review): sheet_name is interpolated without URL-encoding; fine for 'train',
# but a name with spaces would presumably need quoting — confirm before reuse.
url = f"https://docs.google.com/spreadsheets/d/{sheet_id}/gviz/tq?tqx=out:csv&sheet={sheet_name}"

# Read the CSV response straight into a DataFrame.
df_google = pd.read_csv(url)

#### 7a. Print the first 3 rows

In [None]:
# Preview the first three rows of df_google.
df_google.head(3)

#### 7b. print the number of rows and columns 

In [None]:
# (rows, columns) of df_google.
df_google.shape

#### 7c. print the column names

In [None]:
# Emit every column label of df_google, one per line.
for name in df_google.columns.tolist():
    print(name)

#### 7d. print the data type of each column

In [None]:
# Data type of each df_google column.
df_google.dtypes

#### 7e. print the summary statistics for each of the numeric variables

In [None]:
# Summary statistics; describe() reports on the numeric columns by default.
df_google.describe()

#### 7f. Print the unique values for each of your categorical variables

In [None]:
# Treat every column with fewer than 10 distinct values as categorical and
# print its label followed by its distinct values.
distinct_counts = df_google.nunique()
categorical_columns = distinct_counts[distinct_counts < 10].index.tolist()
for name in categorical_columns:
    print(f'{name}: {(df_google[name].unique())}')

In [None]:
#simpler way:
# boolean mask over the columns: True where a column has fewer than 10 distinct values
low_cardinality = df_google.nunique() < 10
categorical_df = df_google.loc[:, low_cardinality]
for col, values in categorical_df.items():
    print(f'{col}: {(values.unique())}')

### 1. Make a function named get_titanic_data that returns the titanic data from the codeup data science database as a pandas data frame. Obtain your data from the Codeup Data Science Database. 

In [None]:
import os
from env import get_db_url, user, password, host

def get_titanic_data():
    """Return the titanic `passengers` table as a DataFrame, cached in titanic.csv.

    When titanic.csv exists in the working directory it is read and returned.
    Otherwise the table is queried from the `titanic_db` connection produced by
    get_db_url, written to titanic.csv (without the index) and returned.
    """
    filename = 'titanic.csv'
    sql = '''
    SELECT * FROM passengers
    '''

    if not os.path.exists(filename):
        # cache miss: query the database, then persist the result for next time
        url = get_db_url('titanic_db')
        print('No local file exists\nReading from SQL database...')
        passengers = pd.read_sql(sql, url)

        print('Saving to local CSV... ')
        passengers.to_csv(filename, index=False)
        return passengers

    # cache hit
    print('Reading from local CSV...')
    return pd.read_csv(filename)

### 2. Make a function named get_iris_data that returns the data from the iris_db on the codeup data science database as a pandas data frame. The returned data frame should include the actual name of the species in addition to the species_ids. Obtain your data from the Codeup Data Science Database.


In [None]:
def get_iris_data():
    """Return the iris measurements joined to their species as a DataFrame, cached in iris.csv.

    When iris.csv exists in the working directory it is read and returned.
    Otherwise `species` is joined to `measurements` on species_id through the
    `iris_db` connection produced by get_db_url, the result is written to
    iris.csv (without the index) and returned.
    """
    filename = 'iris.csv'
    sql = '''
    SELECT *
      FROM species
      JOIN measurements USING(species_id);
    '''

    if not os.path.exists(filename):
        # cache miss: query the database, then persist the result for next time
        url = get_db_url('iris_db')
        print('No local file exists\nReading from SQL database...')
        iris = pd.read_sql(sql, url)

        print('Saving to local CSV...')
        iris.to_csv(filename, index=False)
        return iris

    # cache hit
    print('Reading from local CSV...')
    return pd.read_csv(filename)

### 3. Make a function named get_telco_data that returns the data from the telco_churn database in SQL. In your SQL, be sure to join all 4 tables together, so that the resulting dataframe contains all the contract, payment, and internet service options. Obtain your data from the Codeup Data Science Database. 

In [None]:
def get_telco_data():
    """Return the telco customers joined to their lookup tables, cached in telco.csv.

    When telco.csv exists in the working directory it is read and returned.
    Otherwise `customers` is joined to `contract_types`,
    `internet_service_types` and `payment_types` through the `telco_churn`
    connection produced by get_db_url, the result is written to telco.csv
    (without the index) and returned.
    """
    # Cache name as specified by the caching exercise (was misspelled
    # 'telco_chun.csv'); a cache written under the old name is simply ignored.
    filename = 'telco.csv'

    if os.path.exists(filename):
        print('Reading from local CSV...')
        return pd.read_csv(filename)

    url = get_db_url('telco_churn')
    sql = '''
    SELECT *
      FROM customers
        JOIN contract_types USING(contract_type_id)
        JOIN internet_service_types USING(internet_service_type_id)
        JOIN payment_types USING(payment_type_id)
    '''

    print('No local file exists\nReading from SQL database...')
    df = pd.read_sql(sql, url)

    print('Saving to local CSV...')
    df.to_csv(filename, index=False)

    return df

### 4. Once you've got your get_titanic_data, get_iris_data, and get_telco_data functions written, now it's time to add caching to them. To do this, edit the beginning of the function to check for the local filename of telco.csv, titanic.csv, or iris.csv. If they exist, use the .csv file. If the file doesn't exist, then produce the SQL and pandas necessary to create a dataframe, then write the dataframe to a .csv file with the appropriate name. 

# Data Preparation

In [None]:
import acquire

### Using the Iris Data: 

#### 1. Use the function defined in acquire.py to load the iris data.


In [None]:
# Load the iris data through the project's acquire module; `df` is rebound here.
df = acquire.get_iris_data()

#### 2. Drop the species_id and measurement_id columns

In [None]:
# Remove the two id columns from the iris frame.
df = df.drop(['species_id', 'measurement_id'], axis='columns')

#### 3. Rename the species_name column to just species.


In [None]:
# Shorten the species_name column label to species.
df = df.rename({'species_name': 'species'}, axis='columns')

#### 4. Create dummy variables of the species name and concatenate onto the iris dataframe. (This is for practice, we don't always have to encode the target, but if we used species as a feature, we would need to encode it).


In [None]:
# One indicator column per species except the first level (drop_first=True),
# appended to the right of the existing columns.
dummy_df = pd.get_dummies(df.species, drop_first=True)
df = pd.concat((df, dummy_df), axis='columns')

#### 5. Create a function named prep_iris that accepts the untransformed iris data, and returns the data with the transformations above applied.


In [None]:
def prep_iris(df):
    """Return a prepared copy of the raw iris frame.

    Drops the species_id and measurement_id columns, renames species_name to
    species, and appends one indicator column per species except the first
    level (drop_first=True). The input frame is not modified.
    """
    trimmed = (
        df.drop(columns=['species_id', 'measurement_id'])
          .rename(columns={'species_name': 'species'})
    )
    species_dummies = pd.get_dummies(trimmed['species'], drop_first=True)
    return pd.concat([trimmed, species_dummies], axis=1)

### Using the Titanic dataset

#### 1. Use the function defined in acquire.py to load the Titanic data.


In [None]:
# Load the titanic data through the project's acquire module; `df` is rebound here.
df = acquire.get_titanic_data()

In [None]:
# Preview the first three rows of the titanic frame.
df.head(3)

In [None]:
# Column dtypes and non-null counts, used to spot columns with missing values.
df.info()

In [None]:
# Frequency of each value in the `alone` column.
df.alone.value_counts()

In [None]:
# Frequency of `alone` restricted to rows where parch and sibsp are both 0.
df.alone[(df.parch == 0) & (df.sibsp == 0)].value_counts()

In [None]:
# Frequency of `alone` restricted to rows where parch > 0 or sibsp > 0.
df.alone[(df.parch > 0) | (df.sibsp > 0)].value_counts()

#### 2. Drop any unnecessary, unhelpful, or duplicated columns

In [None]:
# drop duplicate rows, if they exist (rows are compared across all columns):
df = df.drop_duplicates()

In [None]:
# dropping columns

# class gives the same info as pclass
# embarked gives the same info as embark_town
# deck has too many missing values
# age has too many missing values
# alone gives duplicate info for the combination of parch and sibsp (see above)
# passenger_id is simply an index

df = df.drop(columns=['class', 'embarked', 'deck', 'age', 'alone', 'passenger_id'])

#### 3. Encode the categorical columns. Create dummy variables of the categorical columns and concatenate them onto the dataframe.

In [None]:
# Preview the remaining columns before encoding.
df.head()

In [None]:
categorical_columns = ['sex', 'embark_town']

# Encode both columns in one call: get_dummies keeps the untouched columns,
# appends `<column>_<level>` indicators (first level of each column dropped)
# and removes the original text columns.
df = pd.get_dummies(df,
                    columns=categorical_columns,
                    drop_first=True,
                    dummy_na=False)

#### 4. Create a function named prep_titanic that accepts the raw titanic data, and returns the data with the transformations above applied.

In [None]:
def prep_titanic(df, drop_after_encoding=True):
    """Return a prepared copy of the raw titanic frame.

    Steps: drop duplicate rows; drop the class, embarked, deck, age, alone and
    passenger_id columns; append `<column>_<level>` indicator columns for sex
    and embark_town (first level of each dropped, missing values not encoded).

    Parameters
    ----------
    df : pandas.DataFrame
        Raw titanic data containing the columns named above.
    drop_after_encoding : bool, default True
        When True (the original behaviour) the text columns sex and embark_town
        are removed once encoded. Pass False to keep them next to their
        indicator columns, e.g. for exploration.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input is not modified.
    """
    # drop duplicate rows, if they exist:
    df = df.drop_duplicates()

    # drop unnecessary columns
    df = df.drop(columns=['class', 'embarked', 'deck', 'age', 'alone', 'passenger_id'])

    # encode categorical columns with dummy variables in a single call rather
    # than concatenating inside a loop
    categorical_columns = ['sex', 'embark_town']
    dummy_df = pd.get_dummies(df[categorical_columns],
                              columns=categorical_columns,
                              drop_first=True,
                              dummy_na=False)
    df = pd.concat([df, dummy_df], axis=1)

    # optionally drop the original text columns
    if drop_after_encoding:
        df = df.drop(columns=categorical_columns)

    return df

### Using the Telco dataset

#### 1. Use the function defined in acquire.py to load the Telco data.


In [None]:
# Load the telco data through the project's acquire module; `df` is rebound here.
df = acquire.get_telco_data()

In [None]:
# Preview the first three rows of the telco frame.
df.head(3)

In [None]:
# Column dtypes and non-null counts for the telco frame.
df.info()

In [None]:
# (rows, columns) of the telco frame before cleaning.
df.shape

In [None]:
# drop duplicate rows, if present (rows are compared across all columns)
df = df.drop_duplicates()

In [None]:
# clean up total_charges column and cast as float
# - any blank-only string (not just a single space) becomes NaN, so stray
#   whitespace cannot make the float cast raise
# - assign() returns a new frame instead of writing into `df`, which may be a
#   filtered copy after drop_duplicates (the resulting chained-assignment
#   warning is hidden by the global warnings filter)
df = df.assign(
    total_charges=df.total_charges.replace(r'^\s*$', np.nan, regex=True).astype(float)
)

In [None]:
# removing brand new customers: keep only the rows whose tenure is not 0
df = df[df.tenure != 0]

#### 2. Drop any unnecessary, unhelpful, or duplicated columns. This could mean dropping foreign key columns but keeping the corresponding string values, for example.


In [None]:
# *_type_id columns are simply foreign key columns that have corresponding string values
# customer_id is a primary key that is not useful for our analysis
df = df.drop(columns=['payment_type_id', 'internet_service_type_id', 'contract_type_id', 'customer_id'])

#### 3. Encode the categorical columns. Create dummy variables of the categorical columns and concatenate them onto the dataframe.


In [None]:
# Names of all object-dtype columns; these are the columns dummy-encoded in the next cell.
categorical_columns = list(df.dtypes[df.dtypes == 'object'].index)
categorical_columns

In [None]:
# Encode every listed column in one call: get_dummies keeps the untouched
# columns, appends `<column>_<level>` indicators (first level of each column
# dropped) and removes the original text columns. Nothing to do for an empty list.
if categorical_columns:
    df = pd.get_dummies(df,
                        columns=categorical_columns,
                        drop_first=True,
                        dummy_na=False)

In [None]:
# Check the column dtypes after encoding.
df.dtypes

#### 4. Create a function named prep_telco that accepts the raw telco data, and returns the data with the transformations above applied.

In [None]:
def prep_telco(df):
    """Return a prepared copy of the raw telco frame.

    Steps: drop duplicate rows; convert total_charges to float with blank
    strings as NaN; drop rows whose tenure is 0; drop the three *_type_id
    foreign-key columns and customer_id; replace every remaining object-dtype
    column by `<column>_<level>` indicator columns (first level of each dropped).

    Parameters
    ----------
    df : pandas.DataFrame
        Raw telco data containing total_charges, tenure, payment_type_id,
        internet_service_type_id, contract_type_id and customer_id.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input is not modified.

    Raises
    ------
    ValueError
        If total_charges holds a non-blank value that cannot be parsed as a float.
    """
    # drop duplicate rows, if present
    df = df.drop_duplicates()

    # clean up total charges column and cast as float:
    # - any blank-only string (not just one space) becomes NaN
    # - assign() builds a new frame rather than writing into what may be a
    #   filtered copy produced by drop_duplicates
    df = df.assign(
        total_charges=df.total_charges.replace(r'^\s*$', np.nan, regex=True).astype(float)
    )

    # removing brand new customers
    df = df[df.tenure != 0]

    # drop columns:
    # *_type_id columns are simply foreign key columns that have corresponding string values
    # customer_id is a primary key that is not useful for our analysis
    df = df.drop(columns=['payment_type_id', 'internet_service_type_id', 'contract_type_id', 'customer_id'])

    # encode categorical (object-dtype) columns with dummy variables in a
    # single call instead of concatenating inside a loop
    categorical_columns = list(df.dtypes[df.dtypes == 'object'].index)
    if categorical_columns:
        df = pd.get_dummies(df,
                            columns=categorical_columns,
                            drop_first=True,
                            dummy_na=False)

    return df

# Exploratory Analysis - Exercises Part I

## Section 1 - Iris Data

### 1. Acquire, Prepare and Split your data

#### Acquire:

In [None]:
# Load the raw iris data through the project's acquire module.
iris = acquire.get_iris_data()

#### Prepare:

In [None]:
# Prepare the raw frame with the project's prepare module.
# NOTE(review): assumes prepare.prep_iris matches the prep_iris defined earlier in this notebook — confirm.
iris = prepare.prep_iris(iris)

#### Split:

In [None]:
# Two-stage split, stratified on species at both stages:
#   1) hold out 20% of the rows as `test`
#   2) carve 30% of the remainder off as `validate`; what is left is `train`
train_validate, test = train_test_split(iris,
                                        test_size=.2,
                                        random_state=123,
                                        stratify=iris.species)
train, validate = train_test_split(train_validate,
                                   test_size=.3,
                                   random_state=123,
                                   stratify=train_validate.species)

In [None]:
# (rows, columns) of the train split.
train.shape

In [None]:
# (rows, columns) of the test split.
test.shape

In [None]:
# (rows, columns) of the validate split.
validate.shape

In [None]:
# Preview the first three rows of the train split.
train.head(3)

In [None]:
# Column dtypes and non-null counts for the train split.
train.info()

### 2. Univariate Stats

#### For each measurement type (quantitative variable): create a histogram, boxplot, and compute descriptive statistics (using `.describe()`)

In [None]:
# identify the columns that contain data for quantitative variables
quantitative_columns = [col for col in train.columns[train.dtypes == 'float64']]

# for each of those columns:
for col in quantitative_columns:

    # display a histogram of that column's distribution
    sns.histplot(train[col], stat='proportion')
    plt.show()

    # display a boxplot of that column's distribution
    # The series is passed as `x=` explicitly: a bare positional argument is
    # read as `x` by older seaborn releases but as wide-form `data` by newer
    # ones, which flips the box orientation. With `x=` the box always lies
    # along the same axis as the histogram above.
    sns.boxplot(x=train[col])
    plt.show()

    # display the summary statistics
    print(pd.DataFrame(train[col].describe()))

#### For each species (categorical variable): create a frequency table and a bar plot of those frequencies.


In [None]:
# identify the columns that contain data for categorical variables
categorical_columns = ['species']

# for each of those columns
for col in categorical_columns:

    # display a frequency table
    # The counts series is renamed before it becomes a frame, so the column is
    # labelled `<col>_counts` regardless of what name value_counts() gives the
    # series (the column's own name in pandas 1.x, 'count' from pandas 2.0 on,
    # where the previous rename(columns={col: ...}) matched nothing).
    print(train[col].value_counts().rename(f'{col}_counts').to_frame())

    # display a bar plot of those frequencies
    sns.countplot(data=train,
                  x=col)
    plt.title(f'{col}_counts')
    plt.show()

#### Document takeaways and any actions

### 3. Bivariate Stats

#### Visualize each measurement type (y-axis) with the species variable (x-axis) using barplots, adding a horizontal line showing the overall mean of the metric (y-axis). 

In [None]:
target = 'species'
quantitative_columns = list(train.columns[train.dtypes == 'float64'])

# One bar chart per measurement: bar height per species, plus a dashed line at
# that measurement's overall mean in the train split.
for col in quantitative_columns:
    ax = sns.barplot(data=train, x=target, y=col)
    ax.axhline(train[col].mean(), ls='--', color='black')
    ax.set_xlabel(None)
    ax.set_title(col, fontsize=14)
    plt.show()

#### For each measurement type, compute the descriptive statistics for each species. 

In [None]:
quantitative_columns = list(train.columns[train.dtypes == 'float64'])
target = 'species'
line_break = '-' * 62

# Group once, then print each measurement's per-species summary followed by a rule.
per_species = train.groupby(by=target)
for col in quantitative_columns:
    print(col, per_species[col].describe(), line_break, sep='\n')

#### For virginica & versicolor: Compare the mean petal_width using the Mann-Whitney test (scipy.stats.mannwhitneyu) to see if there is a significant difference between the two groups. Do the same for the other measurement types.


In [None]:
# get the data
train_virginica = train[train.species == 'virginica']
train_versicolor = train[train.species == 'versicolor']

# identify the columns on which to conduct the test (those that contain data for quantitative variables)
quantitative_columns = [col for col in train.columns[train.dtypes == 'float64']]

# the significance level and the separator are the same for every column, so set them once
alpha = .05
line_break = ('\n' + '=' * 102 + '\n')

# for each of the columns with quantitative variables:
for col in quantitative_columns:

    # establish hypotheses
    H0 = f'mean {col} for train_virginica flowers = mean {col} for train_versicolor flowers'
    H1 = f'mean {col} for train_virginica flowers != mean {col} for train_versicolor flowers'

    # conduct the test
    # H1 is two-sided, so the alternative is stated explicitly instead of
    # relying on the scipy default, which has differed between releases
    u, p = stats.mannwhitneyu(train_virginica[col],
                              train_versicolor[col],
                              alternative='two-sided')

    # display test info and results
    print(f'MANN-WHITNEY U TEST FOR: {col.upper()}')
    print()
    print(f'H0: {H0}')
    print(f'H1: {H1}')
    print()
    print(f'u = {u}')
    print(f'p = {p.round(4)}')
    print()
    if p < alpha:
        print('RESULT: Reject H0\n')
        print(f'Since p < alpha:\n')
        print(f'we reject the null hypothesis that:\n\n\t{H0}\n')
        print(f'and we proceed under the assumption that:\n\n\t{H1}')
    else:
        # covers p == alpha as well, which previously printed no verdict at all
        print('RESULT: Fail to Reject H0\n')
        print(f'Since p >= alpha:\n')
        print(f'we fail to reject the null hypothesis\n')
        print(f'and we proceed under the assumption that:\n\n\t{H0}')
    print(line_break)

#### Document takeaways and any actions

### 4. Multivariate Stats

#### Visualize the interaction of each measurement type with the others using a pairplot (or scatter matrix or something similar) and add color to represent species.


In [None]:
# Pairwise grid over the train split without the two species indicator
# columns, coloured by species: scatter plots above the diagonal, KDE plots
# below it and on it.
g = sns.PairGrid(train.drop(columns=['virginica', 'versicolor']), hue='species')
g.map_upper(sns.scatterplot)
g.map_lower(sns.kdeplot)
g.map_diag(sns.kdeplot, legend=False)
plt.show()

In [None]:
# Absolute pairwise correlations between the measurements.
# Only numeric columns are kept before calling corr(): the frame still holds
# the text `species` column, which makes corr() raise on pandas >= 2.0.
numeric_measurements = (train
                        .drop(columns=['versicolor', 'virginica'])
                        .select_dtypes('number'))
sns.heatmap(numeric_measurements.corr().abs(), annot=True)
plt.show()

#### Visualize two numeric variables by means of the species. Hint: sns.relplot with hue or col


In [None]:
# Petal width against petal length for the train split, one colour per species.
sns.relplot(x='petal_length',
            y='petal_width',
            data=train,
            hue='species')
plt.show()

#### Create a swarmplot using a melted dataframe of all your numeric variables. The x-axis should be the variable name, the y-axis the measure. Add another dimension using color to represent species. Document takeaways from this visualization.


In [None]:
# Reshape to long form: one row per (species, variable, value), leaving out the
# two species indicator columns.
train_melt = train.drop(columns=['versicolor', 'virginica']).melt(id_vars='species')

# One swarm per variable name on the x-axis, values on the y-axis, coloured by species.
sns.swarmplot(data=train_melt,
              x='variable',
              y='value',
              hue='species')
plt.xlabel(None)
plt.show()

#### Ask a specific question of the data, such as: is the sepal area significantly different in virginica compared to setosa? Answer the question through both a plot and using a Mann-Whitney or t-test. If you use a t-test, be sure assumptions are met (independence, normality, equal variance).


Is the petal area significantly different in virginica compared to setosa?

In [None]:
# get the data

# petal_area = petal_width * petal_length
# assign() builds a new frame rather than writing a column into `train`, which
# is a slice produced by train_test_split (the chained-assignment warning that
# would flag this is hidden by the global warnings filter)
train = train.assign(petal_area=train.petal_width * train.petal_length)

train_virginica = train[train.species == 'virginica']
train_setosa = train[train.species == 'setosa']


In [None]:
# establish hypotheses
# These two strings are only printed with the test result further down; they
# take no part in the computation.

H0 = 'mean of petal_area for virginica flowers = mean of petal_area for setosa flowers'
H1 = 'mean of petal_area for virginica flowers != mean of petal_area for setosa flowers'


In [None]:
# validate assumptions - independence
# yes, they are independent (no reason to think they're not independent)

In [None]:
# validate assumptions - variance
# sample variance of petal_area per group, rounded to 2 places
print('virginica variance: ', round(train_virginica.petal_area.var(), 2))
print('setosa variance: ', round(train_setosa.petal_area.var(), 2))

# Levene's test for equality of variances between the two groups
stat, p = stats.levene(train_virginica.petal_area, train_setosa.petal_area)
print('levene test p-value: ', p)

# they do not have equal variances

In [None]:
# validate assumptions - normality
# one histogram of petal_area per group, with the group's row count in the title

plt.hist(train_virginica.petal_area)
plt.title(f'virginica petal area n = {train_virginica.shape[0]}')
plt.show()

plt.hist(train_setosa.petal_area)
plt.title(f'setosa petal area n = {train_setosa.shape[0]}')
plt.show()

# virginica petal area appears approximately normal, with the bulk of observations 
# concentrated in the middle values, and tails on each end. 

# however, setosa appears heavily skewed to the right

# in addition, we only have 28 observations for each category

# therefore, we will not assume normality for testing purposes

In [None]:
# significance level, defined here so the cell does not depend on a value
# left behind by an earlier cell
alpha = .05

# conduct the test - Mann Whitney U
# H1 is two-sided, so the alternative is stated explicitly instead of relying
# on the scipy default, which has differed between releases
u, p = stats.mannwhitneyu(train_virginica.petal_area,
                          train_setosa.petal_area,
                          alternative='two-sided')

# display test info and results
line_break = ('\n' + '=' * 102 + '\n')
# the tested variable is named literally; the previous `col.upper()` read a
# loop variable left over from an earlier cell and so labelled the output
# with the wrong column
print('MANN-WHITNEY U TEST FOR: PETAL_AREA')
print()
print(f'H0: {H0}')
print(f'H1: {H1}')
print()
print(f'u = {u}')
print(f'p = {p.round(4)}')
print()
if p < alpha:
    print('RESULT: Reject H0\n')
    print(f'Since p < alpha:\n')
    print(f'we reject the null hypothesis that:\n\n\t{H0}\n')
    print(f'and we proceed under the assumption that:\n\n\t{H1}')
else:
    # covers p == alpha as well, which previously printed no verdict at all
    print('RESULT: Fail to Reject H0\n')
    print(f'Since p >= alpha:\n')
    print(f'we fail to reject the null hypothesis\n')
    print(f'and we proceed under the assumption that:\n\n\t{H0}')
print(line_break)


#### Document takeaways and any actions.


# Exploratory Analysis - Exercises Part II

### Explore your titanic dataset more completely.

In [None]:
# Load the raw titanic data through the project's acquire module.
titanic = acquire.get_titanic_data()

In [None]:
# Prepare the raw frame with the project's prepare module.
# NOTE(review): this relies on prepare.py's prep_titanic accepting
# `drop_after_encoding`; the cells below also read columns such as family_size
# and alone, so prepare.py presumably differs from the prep_titanic defined in
# this notebook — confirm the two are in sync.
titanic = prepare.prep_titanic(titanic, drop_after_encoding=False)

In [None]:
# Hold out 20% of the rows as `test`, stratified on survived; fixed seed for reproducibility.
train, test = train_test_split(titanic, test_size=.2, random_state=123, stratify=titanic.survived)

In [None]:
# Carve 30% of the remaining rows off as `validate`, again stratified on survived.
# `train` is rebound to the smaller split, so re-running this cell alone shrinks it again.
train, validate = train_test_split(train, test_size=.3, random_state=123, stratify=train.survived)

#### Determine drivers of the target variable

In [None]:
# (rows, columns) of the titanic train split.
train.shape

In [None]:
# Column dtypes and non-null counts for the titanic train split.
train.info()

In [None]:
# Helper from the project's explore module.
# NOTE(review): presumably displays the unique values per column of `train` — confirm in explore.py.
explore.display_uniques_1(train)

In [None]:
# Preview the first three rows of the titanic train split.
train.head(3)

Univariate Stat Exploration:

In [None]:
# identify the columns that contain data for quantitative variables
quantitative_columns = ['fare', 'n_sibs_and_spouse', 'n_parents_and_children', 'family_size']

# for each of those columns:
for col in quantitative_columns:

    # display a histogram of that column's distribution
    sns.histplot(train[col], stat='proportion')
    plt.show()

    # display a boxplot of that column's distribution
    # The series is passed as `x=` explicitly: a bare positional argument is
    # read as `x` by older seaborn releases but as wide-form `data` by newer
    # ones, which flips the box orientation. With `x=` the box always lies
    # along the same axis as the histogram above.
    sns.boxplot(x=train[col])
    plt.show()

    # display the summary statistics
    print(pd.DataFrame(train[col].describe()))

In [None]:
# Preview the train split again before looking at the categorical columns.
train.head()

In [None]:
# identify the columns that contain data for categorical variables
categorical_columns = ['pclass', 'sex', 'alone', 'embark_town']

# for each of those columns
for col in categorical_columns:

    # display a frequency table
    # The counts series is renamed before it becomes a frame, so the column is
    # labelled `<col> counts` regardless of what name value_counts() gives the
    # series (the column's own name in pandas 1.x, 'count' from pandas 2.0 on,
    # where the previous rename(columns={col: ...}) matched nothing).
    print(train[col].value_counts().rename(f'{col} counts').to_frame())

    # display a bar plot of those frequencies
    sns.countplot(data=train,
                  x=col)
    plt.title(f'{col} counts')
    plt.xlabel(None)
    plt.show()

####     Determine if certain columns should be dropped


####     Determine if it would be valuable to bin some numeric columns


####     Determine if it would be valuable to combine multiple columns into one.


#### Does it make sense to combine any features?


#### Do you find any surprises?


#### Document any and all findings and takeaways in your notebook using markdown.