In [39]:
import pandas as pd
import numpy as np
import seaborn as sb
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler, QuantileTransformer
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import train_test_split
from IPython.display import Image

In [14]:
data = pd.read_csv('data/penguins_size.csv')
data.head()

Unnamed: 0,species,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,MALE
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,FEMALE
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,FEMALE
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,FEMALE


In [15]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 344 entries, 0 to 343
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   species            344 non-null    object 
 1   island             344 non-null    object 
 2   culmen_length_mm   342 non-null    float64
 3   culmen_depth_mm    342 non-null    float64
 4   flipper_length_mm  342 non-null    float64
 5   body_mass_g        342 non-null    float64
 6   sex                334 non-null    object 
dtypes: float64(4), object(3)
memory usage: 18.9+ KB


In [16]:
data.isnull().sum()

species               0
island                0
culmen_length_mm      2
culmen_depth_mm       2
flipper_length_mm     2
body_mass_g           2
sex                  10
dtype: int64

# Drop data

In [None]:
data = data.dropna()
data.head()

# Fill empty data with 0

In [17]:
data['culmen_length_mm'].fillna((data['culmen_length_mm'].mean()), inplace=True)
data['culmen_depth_mm'].fillna((data['culmen_depth_mm'].median()), inplace=True)
data['flipper_length_mm'].fillna((data['flipper_length_mm'].mean()), inplace=True)
data['body_mass_g'].fillna((data['body_mass_g'].mean()), inplace=True)

data['sex'].fillna((data['sex'].value_counts().index[0]), inplace=True)

data.reset_index()
data.head()

Unnamed: 0,species,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,MALE
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,FEMALE
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,FEMALE
3,Adelie,Torgersen,43.92193,17.3,200.915205,4201.754386,MALE
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,FEMALE


In [18]:
data.isnull().sum()

species              0
island               0
culmen_length_mm     0
culmen_depth_mm      0
flipper_length_mm    0
body_mass_g          0
sex                  0
dtype: int64

# Update a specific row 

In [21]:
data.loc[(data['sex'] != 'FEMALE') & (data['sex'] != 'MALE')]

Unnamed: 0,species,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex
336,Gentoo,Biscoe,44.5,15.7,217.0,4875.0,.


In [22]:
data = data.drop([336])
data.reset_index()

Unnamed: 0,index,species,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex
0,0,Adelie,Torgersen,39.10000,18.7,181.000000,3750.000000,MALE
1,1,Adelie,Torgersen,39.50000,17.4,186.000000,3800.000000,FEMALE
2,2,Adelie,Torgersen,40.30000,18.0,195.000000,3250.000000,FEMALE
3,3,Adelie,Torgersen,43.92193,17.3,200.915205,4201.754386,MALE
4,4,Adelie,Torgersen,36.70000,19.3,193.000000,3450.000000,FEMALE
...,...,...,...,...,...,...,...,...
338,339,Gentoo,Biscoe,43.92193,17.3,200.915205,4201.754386,MALE
339,340,Gentoo,Biscoe,46.80000,14.3,215.000000,4850.000000,FEMALE
340,341,Gentoo,Biscoe,50.40000,15.7,222.000000,5750.000000,MALE
341,342,Gentoo,Biscoe,45.20000,14.8,212.000000,5200.000000,FEMALE


# Category Encoding

In [23]:
data['species'] = data['species'].astype('category')
data['island']  = data['island'].astype('category')
data['sex']     = data['sex'].astype('category')
data.dtypes

species              category
island               category
culmen_length_mm      float64
culmen_depth_mm       float64
flipper_length_mm     float64
body_mass_g           float64
sex                  category
dtype: object

In [25]:
categorical_data = data.drop(['culmen_length_mm', 'culmen_depth_mm', 'flipper_length_mm', 'body_mass_g'], axis=1)
categorical_data.head()

Unnamed: 0,species,island,sex
0,Adelie,Torgersen,MALE
1,Adelie,Torgersen,FEMALE
2,Adelie,Torgersen,FEMALE
3,Adelie,Torgersen,MALE
4,Adelie,Torgersen,FEMALE


# Label Encoding

In [26]:
categorical_data['species'] = categorical_data['species'].cat.codes
categorical_data['island']  = categorical_data['island'].cat.codes
categorical_data['sex']     = categorical_data['sex'].cat.codes
categorical_data.head()

Unnamed: 0,species,island,sex
0,0,2,1
1,0,2,0
2,0,2,0
3,0,2,1
4,0,2,0


# One Hot Encoding

In [30]:
# encoded_spicies = pd.get_dummies(categorical_data['species'])
# encoded_island = pd.get_dummies(categorical_data['island'])
# encoded_sex = pd.get_dummies(categorical_data['sex'])

# categorical_data = categorical_data.join(encoded_spicies)
# categorical_data = categorical_data.join(encoded_island)
# categorical_data = categorical_data.join(encoded_sex)
# categorical_data.head()

# Count Encoding

In [31]:
categorical_data = data.drop(['culmen_length_mm', 'culmen_depth_mm','flipper_length_mm', 'body_mass_g'], axis=1)

species_count = categorical_data['species'].value_counts()
island_count  = categorical_data['island'].value_counts()
sex_count     = categorical_data['sex'].value_counts()

categorical_data['species_count_enc'] = categorical_data['species'].map(species_count)
categorical_data['island_count_enc']  = categorical_data['island'].map(island_count)
categorical_data['sex_count_enc']     = categorical_data['sex'].map(sex_count)

categorical_data.head()

Unnamed: 0,species,island,sex,species_count_enc,island_count_enc,sex_count_enc
0,Adelie,Torgersen,MALE,152,52,178
1,Adelie,Torgersen,FEMALE,152,52,165
2,Adelie,Torgersen,FEMALE,152,52,165
3,Adelie,Torgersen,MALE,152,52,178
4,Adelie,Torgersen,FEMALE,152,52,165


# Target Encoding

In [36]:
categorical_data['species'] = categorical_data['species'].cat.codes
island_means = categorical_data.groupby('island')['species'].mean()
sex_means    = categorical_data.groupby('sex')['species'].mean()

AttributeError: Can only use .cat accessor with a 'category' dtype

In [37]:
# Target 
categorical_data["species"] 

0      0
1      0
2      0
3      0
4      0
      ..
339    2
340    2
341    2
342    2
343    2
Name: species, Length: 343, dtype: int8

# Leave One Out Target Encoding

In [38]:
def leave_one_out_mean(series):
    series = (series.sum() - series)/(len(series) - 1)
    return series

# Handling Outlier

# Date time

In [40]:
data = pd.read_csv("data/datedata.csv")
data.head()
df=data
df.head()

Unnamed: 0,PatientId,AppointmentID,Gender,ScheduledDay,AppointmentDay,Age,Neighbourhood,Scholarship,Hipertension,Diabetes,Alcoholism,Handcap,SMS_received,No-show
0,29872500000000.0,5642903,F,2016-04-29T18:38:08Z,2016-04-29T00:00:00Z,62,JARDIM DA PENHA,0,1,0,0,0,0,No
1,558997800000000.0,5642503,M,2016-04-29T16:08:27Z,2016-04-29T00:00:00Z,56,JARDIM DA PENHA,0,0,0,0,0,0,No
2,4262962000000.0,5642549,F,2016-04-29T16:19:04Z,2016-04-29T00:00:00Z,62,MATA DA PRAIA,0,0,0,0,0,0,No
3,867951200000.0,5642828,F,2016-04-29T17:29:31Z,2016-04-29T00:00:00Z,8,PONTAL DE CAMBURI,0,0,0,0,0,0,No
4,8841186000000.0,5642494,F,2016-04-29T16:07:23Z,2016-04-29T00:00:00Z,56,JARDIM DA PENHA,0,1,1,0,0,0,No


In [41]:
df["ScheduledDay"]   = pd.to_datetime(df["ScheduledDay"], format = "%Y-%m-%dT%H:%M:%SZ",errors ="coerce")
df["AppointmentDay"] = pd.to_datetime(df["AppointmentDay"], format = "%Y-%m-%dT%H:%M:%SZ",errors ="coerce")

df["ScheduledDay_year"]  = df["ScheduledDay"].dt.year
df["ScheduledDay_month"] = df["ScheduledDay"].dt.month
df["ScheduledDay_week"]  = df["ScheduledDay"].dt.week
df["ScheduledDay_day"]   = df["ScheduledDay"].dt.day
df["ScheduledDay_hour"]  = df["ScheduledDay"].dt.hour
df["ScheduledDay_minute"]    = df["ScheduledDay"].dt.minute
df["ScheduledDay_dayofweek"] = df["ScheduledDay"].dt.dayofweek

df.head()

  df["ScheduledDay_week"]  = df["ScheduledDay"].dt.week


Unnamed: 0,PatientId,AppointmentID,Gender,ScheduledDay,AppointmentDay,Age,Neighbourhood,Scholarship,Hipertension,Diabetes,...,Handcap,SMS_received,No-show,ScheduledDay_year,ScheduledDay_month,ScheduledDay_week,ScheduledDay_day,ScheduledDay_hour,ScheduledDay_minute,ScheduledDay_dayofweek
0,29872500000000.0,5642903,F,2016-04-29 18:38:08,2016-04-29,62,JARDIM DA PENHA,0,1,0,...,0,0,No,2016,4,17,29,18,38,4
1,558997800000000.0,5642503,M,2016-04-29 16:08:27,2016-04-29,56,JARDIM DA PENHA,0,0,0,...,0,0,No,2016,4,17,29,16,8,4
2,4262962000000.0,5642549,F,2016-04-29 16:19:04,2016-04-29,62,MATA DA PRAIA,0,0,0,...,0,0,No,2016,4,17,29,16,19,4
3,867951200000.0,5642828,F,2016-04-29 17:29:31,2016-04-29,8,PONTAL DE CAMBURI,0,0,0,...,0,0,No,2016,4,17,29,17,29,4
4,8841186000000.0,5642494,F,2016-04-29 16:07:23,2016-04-29,56,JARDIM DA PENHA,0,1,1,...,0,0,No,2016,4,17,29,16,7,4
