<a href="https://colab.research.google.com/github/adrianciemerych/machine-learning/blob/main/ml_course/preprocessing/05_feature_extraction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import sklearn

# Display the installed scikit-learn version so a run can be tied to a library version.
sklearn.__version__

'1.2.2'

In [None]:
# Load the data

def fetch_financial_data(company = 'AMZN'):
  """Download price data for one ticker from the 'stooq' data source.

  Args:
    company: Ticker symbol passed to the reader as ``name``; defaults to 'AMZN'.

  Returns:
    The object produced by ``pandas_datareader.data.DataReader`` for that ticker.
    NOTE(review): the recorded output suggests a date-indexed frame with
    Open/High/Low/Close/Volume columns, newest row first -- confirm against
    the pandas-datareader documentation.
  """
  # Imported inside the function, so pandas_datareader is only required
  # when a download is actually requested.
  from pandas_datareader.data import DataReader
  return DataReader(name = company, data_source = 'stooq')

# Download the AMZN history (the function's default ticker, spelled out here)
# and preview the first five rows.
df_raw = fetch_financial_data(company = 'AMZN')
df_raw.head(n = 5)

Unnamed: 0_level_0,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2023-05-09,105.48,106.79,105.1567,106.62,43992847
2023-05-08,105.04,106.095,104.7001,105.83,49430909
2023-05-05,104.27,105.76,103.55,105.655,56951744
2023-05-04,104.04,105.39,103.3117,104.0,45345523
2023-05-03,103.735,105.96,103.28,103.65,65051925


In [None]:
# Work on a five-row sample of the raw data.
# Slice first, then copy: only those five rows are duplicated and `df` owns its
# data, instead of being a slice of a throwaway copy of the whole frame.
# `df_raw` itself is left untouched.
df = df_raw.iloc[:5].copy()

# Summarise index, column dtypes and non-null counts of the sample.
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 5 entries, 2023-05-09 to 2023-05-03
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Open    5 non-null      float64
 1   High    5 non-null      float64
 2   Low     5 non-null      float64
 3   Close   5 non-null      float64
 4   Volume  5 non-null      int64  
dtypes: float64(4), int64(1)
memory usage: 240.0 bytes


In [None]:
# Derive calendar features from the DatetimeIndex in one assign call.
# `day_of_week` counts from Monday = 0.
df = df.assign(
    day = df.index.day,
    month = df.index.month,
    year = df.index.year,
    week_day = df.index.day_of_week,
)
df

Unnamed: 0_level_0,Open,High,Low,Close,Volume,day,month,year,week_day
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2023-05-09,105.48,106.79,105.1567,106.62,43992847,9,5,2023,1
2023-05-08,105.04,106.095,104.7001,105.83,49430909,8,5,2023,0
2023-05-05,104.27,105.76,103.55,105.655,56951744,5,5,2023,4
2023-05-04,104.04,105.39,103.3117,104.0,45345523,4,5,2023,3
2023-05-03,103.735,105.96,103.28,103.65,65051925,3,5,2023,2


In [None]:
# Toy sample: eight heights held in a single float column named 'height'.
df = pd.Series(
    [175.0, 178.5, 185.0, 191.0, 184.0, 183.0, 168.0, 169.5],
    name = 'height',
).to_frame()
df

Unnamed: 0,height
0,175.0
1,178.5
2,185.0
3,191.0
4,184.0
5,183.0
6,168.0
7,169.5


In [None]:
# Discretization of a continuous variable

# Bin edges shared by the two manual binnings below. pd.cut builds right-closed
# intervals from them: (160, 170], (170, 180], (180, 195].
height_bins = (160, 170, 180, 195)

# An integer `bins` lets pandas split the observed range into that many
# equal-width intervals.
df['height_cat'] = pd.cut(x = df.height, bins = 3)

# Same variable binned on the hand-picked edges.
df['height_cat_manu'] = pd.cut(x = df.height, bins = height_bins)

# Hand-picked edges again, with one readable label per interval.
df['height_names'] = pd.cut(x = df.height, bins = height_bins, labels = ['low', 'medium', 'high'])

# Only the last expression of a cell is rendered, so the frame is shown once, here.
df



Unnamed: 0,height,height_cat,height_cat_manu,height_names
0,175.0,"(167.977, 175.667]","(170, 180]",medium
1,178.5,"(175.667, 183.333]","(170, 180]",medium
2,185.0,"(183.333, 191.0]","(180, 195]",high
3,191.0,"(183.333, 191.0]","(180, 195]",high
4,184.0,"(183.333, 191.0]","(180, 195]",high
5,183.0,"(175.667, 183.333]","(180, 195]",high
6,168.0,"(167.977, 175.667]","(160, 170]",low
7,169.5,"(167.977, 175.667]","(160, 170]",low


In [None]:
# One-hot encode the labelled height category next to the raw height.
# drop_first=True omits the indicator of the first category, and the 'height'
# prefix names the remaining indicator columns.
df_dummies = df.loc[:, ['height', 'height_names']]
pd.get_dummies(df_dummies, prefix = 'height', drop_first = True)


Unnamed: 0,height,height_medium,height_high
0,175.0,1,0
1,178.5,1,0
2,185.0,0,1
3,191.0,0,1
4,184.0,0,1
5,183.0,0,1
6,168.0,0,0
7,169.5,0,0


In [2]:
# Each row holds a Python list of language codes.
# NOTE(review): the column key '1ang' starts with the digit one and looks like a
# typo for 'lang'; it is left as is because the next cell indexes by this exact key.
df = pd.DataFrame(
    {
        '1ang' : [
            ['PL', 'ENG'],
            ['GER', 'ENG', 'PL', 'FRA'],
            ['RUS'],
        ],
    }
)
df

Unnamed: 0,1ang
0,"[PL, ENG]"
1,"[GER, ENG, PL, FRA]"
2,[RUS]


In [4]:
# Series.apply calls the given function once for every element of the column.

# Number of languages listed in each row (column name spelled 'length').
df['length'] = df['1ang'].apply(len)

# 1 when 'PL' is among the row's languages, otherwise 0.
df['PL_flag'] = df['1ang'].apply(lambda langs: int('PL' in langs))

# Only the last expression of a cell is rendered, so the frame is shown once, here.
df

Unnamed: 0,1ang,lenght,PL_flag
0,"[PL, ENG]",2,1
1,"[GER, ENG, PL, FRA]",4,1
2,[RUS],1,0


In [9]:
# Three example domains to take apart.
df = pd.DataFrame({'website' : ['wp.pl', 'onet.pl', 'google.com']})

# Split every address on the dot; expand=True spreads the pieces over
# integer-labelled columns (0 holds the first piece, 1 the second).
# NOTE(review): 'poratl' is probably a misspelling of 'portal'; the key is kept
# unchanged in case code outside this view reads it.
parts = df['website'].str.split(".", expand = True)
df = df.assign(poratl = parts[0], extension = parts[1])
df

Unnamed: 0,website,poratl,extension
0,wp.pl,wp,pl
1,onet.pl,onet,pl
2,google.com,google,com
