In [82]:
# import libraries
import numpy as np          
import pandas as pd
import seaborn as sns   
import matplotlib.pyplot as plt                       


In [83]:
# import and analyze the data
df = pd.read_csv('penguins.csv'
                 )


df.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,MALE
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,FEMALE
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,FEMALE
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,FEMALE


In [84]:
# Does the data aontain NAs or any missing data
# What data types exist for analysis

df.dtypes
df.value_counts(dropna= False)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 344 entries, 0 to 343
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   species            344 non-null    object 
 1   island             344 non-null    object 
 2   bill_length_mm     342 non-null    float64
 3   bill_depth_mm      342 non-null    float64
 4   flipper_length_mm  342 non-null    float64
 5   body_mass_g        342 non-null    float64
 6   sex                333 non-null    object 
dtypes: float64(4), object(3)
memory usage: 18.9+ KB


In [85]:
# Sex column seems to have more data missing than the rest of the columns
# Using sex column, drop all na to clean the data

df_clean = df.dropna(subset='sex')
df_clean.info()

<class 'pandas.core.frame.DataFrame'>
Index: 333 entries, 0 to 343
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   species            333 non-null    object 
 1   island             333 non-null    object 
 2   bill_length_mm     333 non-null    float64
 3   bill_depth_mm      333 non-null    float64
 4   flipper_length_mm  333 non-null    float64
 5   body_mass_g        333 non-null    float64
 6   sex                333 non-null    object 
dtypes: float64(4), object(3)
memory usage: 20.8+ KB


In [86]:
# how many species and islands exist in the columns
df_clean['species'].unique()
df_clean['species'].value_counts()

species
Adelie       146
Gentoo       119
Chinstrap     68
Name: count, dtype: int64

In [87]:
# use isin to call Adelie and Gentoo
agentoo = df_clean[df_clean['species'].isin(['Adelie','Gentoo'])]
agentoo.head()

agentoo.info()

<class 'pandas.core.frame.DataFrame'>
Index: 265 entries, 0 to 343
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   species            265 non-null    object 
 1   island             265 non-null    object 
 2   bill_length_mm     265 non-null    float64
 3   bill_depth_mm      265 non-null    float64
 4   flipper_length_mm  265 non-null    float64
 5   body_mass_g        265 non-null    float64
 6   sex                265 non-null    object 
dtypes: float64(4), object(3)
memory usage: 16.6+ KB


In [88]:
df_clean.describe()

Unnamed: 0,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g
count,333.0,333.0,333.0,333.0
mean,43.992793,17.164865,200.966967,4207.057057
std,5.468668,1.969235,14.015765,805.215802
min,32.1,13.1,172.0,2700.0
25%,39.5,15.6,190.0,3550.0
50%,44.5,17.3,197.0,4050.0
75%,48.6,18.7,213.0,4775.0
max,59.6,21.5,231.0,6300.0


In [89]:
# make a copy of agentoo and analyze
agentoo_copy = agentoo.copy()

agentoo_copy['bill_flipper_mm'] = agentoo_copy.apply(lambda row: row['flipper_length_mm'] + row['flipper_length_mm'], axis=1 )

agentoo_copy.head(
    
)

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,bill_flipper_mm
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,MALE,362.0
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,FEMALE,372.0
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,FEMALE,390.0
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,FEMALE,386.0
5,Adelie,Torgersen,39.3,20.6,190.0,3650.0,MALE,380.0


In [90]:
# group the data in the agentoo data by island
agentoo_group = agentoo_copy.groupby("island")[['bill_length_mm', 'bill_depth_mm', 'bill_flipper_mm']].sum()
agentoo_group

Unnamed: 0_level_0,bill_length_mm,bill_depth_mm,bill_flipper_mm
island,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Biscoe,7375.5,2592.9,68316.0
Dream,2118.6,1003.2,20892.0
Torgersen,1834.8,867.2,18004.0


In [91]:
agentoo_copy.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,bill_flipper_mm
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,MALE,362.0
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,FEMALE,372.0
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,FEMALE,390.0
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,FEMALE,386.0
5,Adelie,Torgersen,39.3,20.6,190.0,3650.0,MALE,380.0


In [92]:
# use loc to find data
agentoo_loc = agentoo_copy.loc[agentoo_copy['bill_depth_mm'] >= 10]
agentoo_loc1 = agentoo_copy[agentoo_copy['bill_depth_mm'] >= 10]

print(agentoo_loc.shape)
print(agentoo_loc1.shape)

(265, 8)
(265, 8)


In [93]:
agentoo_copy.describe()

Unnamed: 0,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,bill_flipper_mm
count,265.0,265.0,265.0,265.0,265.0
mean,42.750566,16.842642,202.286792,4328.679245,404.573585
std,5.214804,2.009775,15.018953,839.748875,30.037906
min,32.1,13.1,172.0,2850.0,344.0
25%,38.6,15.0,190.0,3600.0,380.0
50%,42.0,17.0,198.0,4300.0,396.0
75%,46.8,18.5,215.0,5000.0,430.0
max,59.6,21.5,231.0,6300.0,462.0


In [94]:
agentoo_copy.describe(include=['object']).round()

Unnamed: 0,species,island,sex
count,265,265,265
unique,2,3,2
top,Adelie,Biscoe,MALE
freq,146,163,134


In [95]:
agentoo_copy.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,bill_flipper_mm
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,MALE,362.0
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,FEMALE,372.0
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,FEMALE,390.0
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,FEMALE,386.0
5,Adelie,Torgersen,39.3,20.6,190.0,3650.0,MALE,380.0


In [96]:
agentoo_drop = agentoo_copy.drop(columns=['body_mass_g', 'bill_depth_mm'])

agentoo_drop

Unnamed: 0,species,island,bill_length_mm,flipper_length_mm,sex,bill_flipper_mm
0,Adelie,Torgersen,39.1,181.0,MALE,362.0
1,Adelie,Torgersen,39.5,186.0,FEMALE,372.0
2,Adelie,Torgersen,40.3,195.0,FEMALE,390.0
4,Adelie,Torgersen,36.7,193.0,FEMALE,386.0
5,Adelie,Torgersen,39.3,190.0,MALE,380.0
...,...,...,...,...,...,...
338,Gentoo,Biscoe,47.2,214.0,FEMALE,428.0
340,Gentoo,Biscoe,46.8,215.0,FEMALE,430.0
341,Gentoo,Biscoe,50.4,222.0,MALE,444.0
342,Gentoo,Biscoe,45.2,212.0,FEMALE,424.0


In [97]:
agentoo_drop_1 = agentoo_copy.drop(['body_mass_g', 'bill_depth_mm'], axis=1, inplace=False)

print(agentoo_drop_1)

    species     island  bill_length_mm  flipper_length_mm     sex  \
0    Adelie  Torgersen            39.1              181.0    MALE   
1    Adelie  Torgersen            39.5              186.0  FEMALE   
2    Adelie  Torgersen            40.3              195.0  FEMALE   
4    Adelie  Torgersen            36.7              193.0  FEMALE   
5    Adelie  Torgersen            39.3              190.0    MALE   
..      ...        ...             ...                ...     ...   
338  Gentoo     Biscoe            47.2              214.0  FEMALE   
340  Gentoo     Biscoe            46.8              215.0  FEMALE   
341  Gentoo     Biscoe            50.4              222.0    MALE   
342  Gentoo     Biscoe            45.2              212.0  FEMALE   
343  Gentoo     Biscoe            49.9              213.0    MALE   

     bill_flipper_mm  
0              362.0  
1              372.0  
2              390.0  
4              386.0  
5              380.0  
..               ...  
338       

In [98]:
agentoo_drop_1.duplicated(subset='species')

0      False
1       True
2       True
4       True
5       True
       ...  
338     True
340     True
341     True
342     True
343     True
Length: 265, dtype: bool

In [99]:
drop_drop = agentoo_drop_1.drop_duplicates(subset='species',keep='last')
drop_drop.head()

Unnamed: 0,species,island,bill_length_mm,flipper_length_mm,sex,bill_flipper_mm
151,Adelie,Dream,41.5,201.0,MALE,402.0
343,Gentoo,Biscoe,49.9,213.0,MALE,426.0


In [100]:
agentoo_drop_1.head()

Unnamed: 0,species,island,bill_length_mm,flipper_length_mm,sex,bill_flipper_mm
0,Adelie,Torgersen,39.1,181.0,MALE,362.0
1,Adelie,Torgersen,39.5,186.0,FEMALE,372.0
2,Adelie,Torgersen,40.3,195.0,FEMALE,390.0
4,Adelie,Torgersen,36.7,193.0,FEMALE,386.0
5,Adelie,Torgersen,39.3,190.0,MALE,380.0


In [101]:
# rename column titles
agentoo_drop_1_rename = agentoo_drop_1.rename(columns={'species': "penguine_species",
                                                       'island': 'penguine_island',
                                                       'bill_length_mm' : 'bill_length',
                                                       'flipper_lenghth_mm':'flipper_length',
                                                       'sex': 'penguine_gender',
                                                       'bill_flipper_mm':'bill_flipper'
                                                    })

agentoo_drop_1_rename.head()

Unnamed: 0,penguine_species,penguine_island,bill_length,flipper_length_mm,penguine_gender,bill_flipper
0,Adelie,Torgersen,39.1,181.0,MALE,362.0
1,Adelie,Torgersen,39.5,186.0,FEMALE,372.0
2,Adelie,Torgersen,40.3,195.0,FEMALE,390.0
4,Adelie,Torgersen,36.7,193.0,FEMALE,386.0
5,Adelie,Torgersen,39.3,190.0,MALE,380.0


In [102]:
# use assign
agentoo_drop_1_rename.assign(
    new_flipper = agentoo_drop_1_rename['bill_flipper'] * 2,
    
    new_bill = lambda x: x['bill_length'] ** 2
)

Unnamed: 0,penguine_species,penguine_island,bill_length,flipper_length_mm,penguine_gender,bill_flipper,new_flipper,new_bill
0,Adelie,Torgersen,39.1,181.0,MALE,362.0,724.0,1528.81
1,Adelie,Torgersen,39.5,186.0,FEMALE,372.0,744.0,1560.25
2,Adelie,Torgersen,40.3,195.0,FEMALE,390.0,780.0,1624.09
4,Adelie,Torgersen,36.7,193.0,FEMALE,386.0,772.0,1346.89
5,Adelie,Torgersen,39.3,190.0,MALE,380.0,760.0,1544.49
...,...,...,...,...,...,...,...,...
338,Gentoo,Biscoe,47.2,214.0,FEMALE,428.0,856.0,2227.84
340,Gentoo,Biscoe,46.8,215.0,FEMALE,430.0,860.0,2190.24
341,Gentoo,Biscoe,50.4,222.0,MALE,444.0,888.0,2540.16
342,Gentoo,Biscoe,45.2,212.0,FEMALE,424.0,848.0,2043.04


In [103]:
print(agentoo_drop_1_rename['penguine_species'].dtypes)

# convert to category
cat1 = pd.Categorical(agentoo_drop_1_rename['penguine_species'])
print(cat1.dtype)

cat2 = agentoo_drop_1_rename['penguine_species'].astype('category')
print(cat2.dtype)

cat3 = agentoo_drop_1_rename.astype({'penguine_species':'category'},
                                    {'penguine_gender': 'category'})
print(cat3['penguine_species'].dtype)
print(cat3['penguine_species'].dtype)


object
category
category
category
category


In [104]:
agentoo_drop_1_rename.memory_usage(deep=True)

Index                 2120
penguine_species     16695
penguine_island      16781
bill_length           2120
flipper_length_mm     2120
penguine_gender      16427
bill_flipper          2120
dtype: int64

In [105]:
datax = np.arange(10).reshape(2,5)
print(datax)

[[0 1 2 3 4]
 [5 6 7 8 9]]


In [106]:
agentoo.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,MALE
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,FEMALE
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,FEMALE
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,FEMALE
5,Adelie,Torgersen,39.3,20.6,190.0,3650.0,MALE


In [107]:
agentoo_query = agentoo.query('species == "Adelie"')
agentoo_query

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,MALE
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,FEMALE
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,FEMALE
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,FEMALE
5,Adelie,Torgersen,39.3,20.6,190.0,3650.0,MALE
...,...,...,...,...,...,...,...
147,Adelie,Dream,36.6,18.4,184.0,3475.0,FEMALE
148,Adelie,Dream,36.0,17.8,195.0,3450.0,FEMALE
149,Adelie,Dream,37.8,18.1,193.0,3750.0,MALE
150,Adelie,Dream,36.0,17.1,187.0,3700.0,FEMALE


In [None]:
adelie_query1 = agentoo.query('species == "Adelie" and island == "Torgersen" and sex == "MALE" ')
adelie_query1

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,MALE
5,Adelie,Torgersen,39.3,20.6,190.0,3650.0,MALE
7,Adelie,Torgersen,39.2,19.6,195.0,4675.0,MALE
13,Adelie,Torgersen,38.6,21.2,191.0,3800.0,MALE
14,Adelie,Torgersen,34.6,21.1,198.0,4400.0,MALE
17,Adelie,Torgersen,42.5,20.7,197.0,4500.0,MALE
19,Adelie,Torgersen,46.0,21.5,194.0,4200.0,MALE
69,Adelie,Torgersen,41.8,19.4,198.0,4450.0,MALE
71,Adelie,Torgersen,39.7,18.4,190.0,3900.0,MALE
73,Adelie,Torgersen,45.8,18.9,197.0,4150.0,MALE


agentoo_query = agentoo.query('species == "Adelie" and island == "Torgersen" ')
agentoo_query

In [129]:
# grocery list
grocery_list = ['orange', 'eggs', 'juice', 'paper_towel', 'sousage']
prices_list = [23, 34, 55, 20, 70]


new_tarrif = []

# for loop
for index, grocery in enumerate(prices_list):
    if grocery > 50:
        new_tarrif.append(grocery + 0.8)
    else:
        new_tarrif.append(grocery + 0.5)
    # grocery
    # print(index, grocery)
    
print(new_tarrif)
print("$" + str(new_tarrif))

[23.5, 34.5, 55.8, 20.5, 70.8]
$[23.5, 34.5, 55.8, 20.5, 70.8]


In [118]:
print(agentoo['bill_length_mm'])

0      39.1
1      39.5
2      40.3
4      36.7
5      39.3
       ... 
338    47.2
340    46.8
341    50.4
342    45.2
343    49.9
Name: bill_length_mm, Length: 265, dtype: float64


In [124]:
#lambda function 
agentoo['bogus'] = agentoo.apply(lambda x: x['bill_length_mm'] * 2, axis=1)
#agentoo['bogus'] =  agentoo['bill_length_mm'] *2

agentoo.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  agentoo['bogus'] = agentoo.apply(lambda x: x['bill_length_mm'] * 2, axis=1)


Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,bogus
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,MALE,78.2
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,FEMALE,79.0
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,FEMALE,80.6
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,FEMALE,73.4
5,Adelie,Torgersen,39.3,20.6,190.0,3650.0,MALE,78.6
