In [1]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from pydataset import data

### 1. Create a pandas dataframe, `df_iris`, from the iris data.

In [None]:
# Load the iris dataset through pydataset's `data` loader, then summarise the
# resulting frame with .info().
df_iris = data('iris')
df_iris.info()

In [None]:
# Display the first three rows of the iris frame.

df_iris.head(3)

In [None]:
# Show the frame's dimensions as a (rows, columns) tuple.

df_iris.shape

In [None]:
# Show the column labels of the iris frame.

df_iris.columns

In [None]:
# Show the summary statistics for each of the numeric variables.
# Q: Would you recommend rescaling the data based on these statistics?
# A: No. The four numeric variables are already on the same scale, so rescaling is unnecessary.

df_iris.describe()

### 2. Read the `Table1_CustDetails` table from the `Excel_Exercises.xlsx` file into a dataframe named `df_excel`

In [None]:
# Build the workbook path from the current user's home directory instead of
# hard-coding one machine's absolute path, so the notebook also runs for anyone
# who keeps the file in the same `codeup-data-science` folder.
# NOTE(review): the exercise heading names `Excel_Exercises.xlsx` / `Table1_CustDetails`,
# while this reads the first sheet of `Spreadsheets_Exercises.xlsx` -- confirm that
# the first sheet is the intended table.
EXCEL_PATH = Path.home() / 'codeup-data-science' / 'Spreadsheets_Exercises.xlsx'

df_excel = pd.read_excel(EXCEL_PATH)
df_excel.info()

In [None]:
# Keep the first 100 rows as a smaller working frame and summarise it.

df_excel_sample = df_excel.iloc[:100]
df_excel_sample.info()

In [None]:
# Show the number of rows in the original (unsampled) dataframe.

len(df_excel)

In [None]:
# Show the names of the first five columns.

df_excel_sample.columns[:5]

In [None]:
# Show the names of the columns whose data type is object.
# select_dtypes filters on dtype directly, and .columns yields just the column
# names rather than a Series of dtypes.

df_excel_sample.select_dtypes(include='object').columns

In [None]:
# Compute the range for each of the numeric variables.
# Every numeric column is covered (not just a hand-picked pair), and the range
# itself (max - min) is reported next to the two extremes.

numeric_ranges = df_excel.select_dtypes(include='number').agg(['min', 'max']).T
numeric_ranges['range'] = numeric_ranges['max'] - numeric_ranges['min']
numeric_ranges

### 3. Read the data from `this google sheet` into a dataframe, df_google

In [None]:
# Sharing link of the Google Sheet; it ends in `/edit#gid=<id>`.
G_sheet_url = 'https://docs.google.com/spreadsheets/d/1Uhtml8KY19LILuZsrDtlsHHDC9wuDGUSe8LTEwvdI5g/edit#gid=341089357'
# Rewrite the edit link into its CSV-export form so it can be passed to pd.read_csv.
csv_export_url = G_sheet_url.replace('/edit#gid=', '/export?format=csv&gid=')

# Read the exported CSV into a dataframe and summarise it.
df_google = pd.read_csv(csv_export_url)
df_google.info()

In [None]:
# Display the first three rows of the Google Sheet data.

df_google.iloc[:3]

In [None]:
# Show the frame's dimensions as a (rows, columns) tuple.

df_google.shape

In [None]:
# Show the column labels.

df_google.columns

In [None]:
# Show the data type of each column.

df_google.dtypes

In [None]:
# Show the summary statistics for each of the numeric variables.

df_google.describe()

In [None]:
# Count the distinct values in every column. Columns with only a handful of
# distinct values are the categorical variables whose values are listed in the
# next cell.

df_google.nunique()

In [None]:
# List the distinct values of each categorical column.
for column_name in ('Sex', 'Survived', 'Pclass', 'Embarked'):
    print(f'The unique values for {column_name}: {df_google[column_name].unique()}')

## Data Preparation Exercises

### 1-a. Use the function defined in `acquire.py` to load the iris data.

In [2]:
import os
from acquire import get_iris_data

# Load the iris data with the helper imported from the project's acquire module,
# then preview the first rows.
iris = get_iris_data()
iris.head()

Unnamed: 0,species_id,measurement_id,sepal_length,sepal_width,petal_length,petal_width,species_name
0,1,1,5.1,3.5,1.4,0.2,setosa
1,1,2,4.9,3.0,1.4,0.2,setosa
2,1,3,4.7,3.2,1.3,0.2,setosa
3,1,4,4.6,3.1,1.5,0.2,setosa
4,1,5,5.0,3.6,1.4,0.2,setosa


### 1-b. Drop the `species_id` and `measurement_id` columns

In [None]:
# Reassign instead of mutating in place, and ignore labels that are already gone
# so that re-running this cell does not raise a KeyError.
iris = iris.drop(columns=['species_id', 'measurement_id'], errors='ignore')
iris.head()

### 1-c. Rename the `species_name` column to just `species`

In [None]:
# rename returns a new frame, so reassign rather than using inplace=True.
# Labels missing from the frame are left untouched by default, which keeps the
# cell safe to re-run.
iris = iris.rename(columns={'species_name': 'species'})
iris.head()

### 1-d. Create dummy variables of the species name

In [None]:
# Select the column to encode with double square brackets so the selection is a
# DataFrame rather than a Series: get_dummies then prefixes every new column with
# the source column name (species_setosa, species_versicolor, ...).
species_dummy = pd.get_dummies(iris[['species']])

# Join the dummy variables to the original frame. Dummy columns left over from an
# earlier run of this cell are dropped first, so re-running does not duplicate them.
iris = pd.concat(
    [iris.drop(columns=species_dummy.columns, errors='ignore'), species_dummy],
    axis=1,
)
iris.sample(5)

### 1-e. Create a function named `prep_iris` that accepts the untransformed iris data, and returns the data with the transformations above applied.

In [5]:
def prep_iris(iris):
    """Return a prepared copy of the untransformed iris data.

    The input frame is left untouched, so the function can be called repeatedly
    on the same raw data.

    Transformations applied:
      1. drop the `species_id` and `measurement_id` columns;
      2. rename `species_name` to `species`;
      3. append one dummy column per species (`species_<name>`).

    Parameters
    ----------
    iris : pandas.DataFrame
        Raw iris data containing `species_id`, `measurement_id` and
        `species_name` columns.

    Returns
    -------
    pandas.DataFrame
        A new frame with the transformations above applied.

    Raises
    ------
    KeyError
        If `species_id` or `measurement_id` is missing from `iris`.
    """
    # drop/rename without inplace=True return new frames, so the caller's
    # DataFrame is never modified.
    prepped = (
        iris
        .drop(columns=['species_id', 'measurement_id'])
        .rename(columns={'species_name': 'species'})
    )
    # Double brackets keep a DataFrame so the dummy columns get the `species_` prefix.
    species_dummy = pd.get_dummies(prepped[['species']])
    return pd.concat([prepped, species_dummy], axis=1)

In [3]:
# Preview `iris` before handing it to prep_iris.
# NOTE(review): the saved output below shows the raw columns, i.e. it was produced
# right after loading; in a top-to-bottom run `iris` has already been transformed
# by the step-by-step cells above -- confirm which state is intended here.
iris.head()

Unnamed: 0,species_id,measurement_id,sepal_length,sepal_width,petal_length,petal_width,species_name
0,1,1,5.1,3.5,1.4,0.2,setosa
1,1,2,4.9,3.0,1.4,0.2,setosa
2,1,3,4.7,3.2,1.3,0.2,setosa
3,1,4,4.6,3.1,1.5,0.2,setosa
4,1,5,5.0,3.6,1.4,0.2,setosa


In [6]:
# Run the pipeline on freshly acquired data: the step-by-step cells above have
# already removed/renamed the raw columns on `iris`, so passing `iris` here would
# raise a KeyError in a top-to-bottom run.
prep_iris(get_iris_data()).head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species,species_setosa,species_versicolor,species_virginica
0,5.1,3.5,1.4,0.2,setosa,1,0,0
1,4.9,3.0,1.4,0.2,setosa,1,0,0
2,4.7,3.2,1.3,0.2,setosa,1,0,0
3,4.6,3.1,1.5,0.2,setosa,1,0,0
4,5.0,3.6,1.4,0.2,setosa,1,0,0


### 2-a. Use the function you defined in `acquire.py` to load the titanic data set. 

In [8]:
from acquire import get_titanic_data

# Load the titanic data with the helper imported from the project's acquire
# module, then preview the first rows.
titanic = get_titanic_data()
titanic.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0
1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1
3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0
4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1


### 2-b. Handle the missing values in the `embark_town` and `embarked` columns.

In [14]:
# Count the missing entries in each of the two embarkation columns.

tuple(titanic[col].isnull().sum() for col in ('embark_town', 'embarked'))

(2, 2)

In [18]:
# Tally every value, NaN included, in the two embarkation columns.

(
    titanic['embark_town'].value_counts(dropna=False),
    titanic['embarked'].value_counts(dropna=False),
)

(Southampton    644
 Cherbourg      168
 Queenstown      77
 NaN              2
 Name: embark_town, dtype: int64,
 S      644
 C      168
 Q       77
 NaN      2
 Name: embarked, dtype: int64)

In [28]:
# Drop the rows where either embarkation column is missing.
# dropna returns a new frame (no in-place mutation), checks both columns rather
# than `embarked` alone, and is safe to re-run.

titanic = titanic.dropna(subset=['embarked', 'embark_town'])

In [30]:
# Re-tally both columns to confirm that no NaN entries remain.
(
    titanic['embark_town'].value_counts(dropna=False),
    titanic['embarked'].value_counts(dropna=False),
)

(Southampton    644
 Cherbourg      168
 Queenstown      77
 Name: embark_town, dtype: int64,
 S    644
 C    168
 Q     77
 Name: embarked, dtype: int64)

### 2-c. Remove the `deck` column

In [34]:
# Reassign instead of mutating in place; errors='ignore' lets the cell be re-run
# after `deck` has already been removed.
titanic = titanic.drop(columns='deck', errors='ignore')

In [38]:
# List the remaining column labels to check that `deck` is gone.
titanic.columns

Index(['passenger_id', 'survived', 'pclass', 'sex', 'age', 'sibsp', 'parch',
       'fare', 'embarked', 'class', 'embark_town', 'alone'],
      dtype='object')

### 2-d. Create a dummy variable of the `embarked` column

In [47]:
# One indicator column per embarkation code, each named `embarked_<code>`.
embarked_dummies = pd.get_dummies(titanic['embarked'], prefix='embarked')
embarked_dummies.head()

Unnamed: 0,embarked_C,embarked_Q,embarked_S
0,0,0,1
1,1,0,0
2,0,0,1
3,0,0,1
4,0,0,1


In [50]:
# Append the dummy columns to the titanic frame. Dummy columns left over from an
# earlier run of this cell are dropped first, so re-running does not duplicate them.
titanic = pd.concat(
    [titanic.drop(columns=embarked_dummies.columns, errors='ignore'), embarked_dummies],
    axis=1,
)
titanic.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,embark_town,alone,embarked_C,embarked_Q,embarked_S
0,0,0,3,male,22.0,1,0,7.25,S,Third,Southampton,0,0,0,1
1,1,1,1,female,38.0,1,0,71.2833,C,First,Cherbourg,0,1,0,0
2,2,1,3,female,26.0,0,0,7.925,S,Third,Southampton,1,0,0,1
3,3,1,1,female,35.0,1,0,53.1,S,First,Southampton,0,0,0,1
4,4,0,3,male,35.0,0,0,8.05,S,Third,Southampton,1,0,0,1


- Two values are missing in each of the two columns (2 of 891 rows), which is a very small fraction.