### Penguen Dataseti Ödevi

In [53]:
# Datasetini içeri aktarma. Üzerinde genel bakış ve fikirler edinme.
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

df = pd.read_csv("penguins.csv")
display(df.head()) # Dataseti ilk 5 satiri

Unnamed: 0,id,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,year
0,0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,male,2007
1,1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,female,2007
2,2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,female,2007
3,3,Adelie,Torgersen,,,,,,2007
4,4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,female,2007


In [54]:
df.describe() 

Unnamed: 0,id,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,year
count,344.0,342.0,342.0,342.0,342.0,344.0
mean,171.5,43.92193,17.15117,200.915205,4201.754386,2008.02907
std,99.448479,5.459584,1.974793,14.061714,801.954536,0.818356
min,0.0,32.1,13.1,172.0,2700.0,2007.0
25%,85.75,39.225,15.6,190.0,3550.0,2007.0
50%,171.5,44.45,17.3,197.0,4050.0,2008.0
75%,257.25,48.5,18.7,213.0,4750.0,2009.0
max,343.0,59.6,21.5,231.0,6300.0,2009.0


In [55]:
df.info(), # Datatypelari hakkinda bilgi verir.
df.nunique() # 3 species oldugunu ve bunlarin 3 adadan, 3 farkli yilda alindigini gosterir 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 344 entries, 0 to 343
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 344 non-null    int64  
 1   species            344 non-null    object 
 2   island             344 non-null    object 
 3   bill_length_mm     342 non-null    float64
 4   bill_depth_mm      342 non-null    float64
 5   flipper_length_mm  342 non-null    float64
 6   body_mass_g        342 non-null    float64
 7   sex                333 non-null    object 
 8   year               344 non-null    int64  
dtypes: float64(4), int64(2), object(3)
memory usage: 24.3+ KB


id                   344
species                3
island                 3
bill_length_mm       164
bill_depth_mm         80
flipper_length_mm     55
body_mass_g           94
sex                    2
year                   3
dtype: int64

In [56]:
df.species.unique(), df.island.unique(), df.year.unique() # Onemli degerlere kisa bir goz atabiliriz.

(array(['Adelie', 'Gentoo', 'Chinstrap'], dtype=object),
 array(['Torgersen', 'Biscoe', 'Dream'], dtype=object),
 array([2007, 2008, 2009]))

In [57]:
df.isnull().sum()

id                    0
species               0
island                0
bill_length_mm        2
bill_depth_mm         2
flipper_length_mm     2
body_mass_g           2
sex                  11
year                  0
dtype: int64

In [61]:
# Verimizi kullanabilir hale getirmek için:
# Cinsiyetin datatype`ını objectten inte cevirmemiz lazim, NaN değerleri ya silmeli ya da düzeltmeliyiz. 
df['sex'] = df['sex'].fillna(df['sex'].mode()[0]) # En cok tekrar eden degeri nan degerlere atadik
df['sex'] = df['sex'].map({'male':0, 'female':1})  # object datatype boola cevrildi.

In [62]:
# Numerik verilerin icinde kayip verileri ise ortalama veriyle doldurur.
df[['bill_length_mm','bill_depth_mm','flipper_length_mm','body_mass_g']] = df[['bill_length_mm','bill_depth_mm','flipper_length_mm','body_mass_g']].fillna(df[['bill_length_mm','bill_depth_mm','flipper_length_mm','body_mass_g']].mean())

In [63]:
# Herhangi bir sorun olup olmadigini kontrol eder.
df.isnull().sum()
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 344 entries, 0 to 343
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 344 non-null    int64  
 1   species            344 non-null    object 
 2   island             344 non-null    object 
 3   bill_length_mm     344 non-null    float64
 4   bill_depth_mm      344 non-null    float64
 5   flipper_length_mm  344 non-null    float64
 6   body_mass_g        344 non-null    float64
 7   sex                344 non-null    int64  
 8   year               344 non-null    int64  
dtypes: float64(4), int64(3), object(2)
memory usage: 24.3+ KB


In [64]:
# Label ve Feature columnlarini ayirmaliyiz.
features = df.drop(labels='species',axis=1)
label = df.species

In [65]:
display(features), display(label)

Unnamed: 0,id,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,year
0,0,Torgersen,39.10000,18.70000,181.000000,3750.000000,0,2007
1,1,Torgersen,39.50000,17.40000,186.000000,3800.000000,1,2007
2,2,Torgersen,40.30000,18.00000,195.000000,3250.000000,1,2007
3,3,Torgersen,43.92193,17.15117,200.915205,4201.754386,0,2007
4,4,Torgersen,36.70000,19.30000,193.000000,3450.000000,1,2007
...,...,...,...,...,...,...,...,...
339,339,Dream,55.80000,19.80000,207.000000,4000.000000,0,2009
340,340,Dream,43.50000,18.10000,202.000000,3400.000000,1,2009
341,341,Dream,49.60000,18.20000,193.000000,3775.000000,0,2009
342,342,Dream,50.80000,19.00000,210.000000,4100.000000,0,2009


0         Adelie
1         Adelie
2         Adelie
3         Adelie
4         Adelie
         ...    
339    Chinstrap
340    Chinstrap
341    Chinstrap
342    Chinstrap
343    Chinstrap
Name: species, Length: 344, dtype: object

(None, None)

In [66]:
x_train, x_test, y_train, y_test = train_test_split(features, label, test_size=0.2, random_state=42)

In [67]:
x_train.shape, x_test.shape, y_train.shape, y_test.shape

((275, 8), (69, 8), (275,), (69,))