<a href="https://colab.research.google.com/github/bspychalski/machine-learning-bootacamp/blob/main/supervised/01_basics/03_feature_extraction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import numpy as np
import pandas as pd
import sklearn

# Display the installed scikit-learn version so readers can see which release produced this notebook's outputs.
sklearn.__version__


'1.6.1'

In [3]:
def fetch_financial_data(company='AMZN'):
    """Fetch stock market quotations for one ticker from the Stooq data source.

    Args:
        company: Ticker symbol passed to ``DataReader`` as ``name``
            (defaults to ``'AMZN'``).

    Returns:
        Whatever ``pandas_datareader.data.DataReader`` returns for the
        requested ticker; no post-processing is applied here.
    """
    # Imported lazily, so pandas-datareader is only needed when quotes are
    # actually requested.
    import pandas_datareader.data as web

    quotes = web.DataReader(name=company, data_source='stooq')
    return quotes

# Download quotations for the default ticker and preview the first five rows.
df_raw = fetch_financial_data()
df_raw.head(n=5)

Unnamed: 0_level_0,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2025-04-04,167.145,178.1436,166.0,171.0,123159359
2025-04-03,182.995,184.13,176.92,178.41,95553617
2025-04-02,187.66,198.34,187.66,196.01,53679198
2025-04-01,187.86,193.93,187.2,192.17,41267315
2025-03-31,188.19,191.33,184.4,190.26,63547558


In [4]:
# Work on an independent five-row sample so the downloaded frame stays untouched.
# Slicing first and copying second duplicates only the rows that are kept and
# leaves `df` as a standalone frame rather than a slice of a larger copy.
df = df_raw.iloc[:5].copy()
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 5 entries, 2025-04-04 to 2025-03-31
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Open    5 non-null      float64
 1   High    5 non-null      float64
 2   Low     5 non-null      float64
 3   Close   5 non-null      float64
 4   Volume  5 non-null      int64  
dtypes: float64(4), int64(1)
memory usage: 240.0 bytes


In [5]:
# Inspect the row index (a DatetimeIndex named 'Date' in the output below) before deriving date parts from it.
df.index

DatetimeIndex(['2025-04-04', '2025-04-03', '2025-04-02', '2025-04-01',
               '2025-03-31'],
              dtype='datetime64[ns]', name='Date', freq=None)

In [5]:
# Break the DatetimeIndex into calendar components, one feature column each.
df = df.assign(
    day=df.index.day,
    month=df.index.month,
    year=df.index.year,
)

df

Unnamed: 0_level_0,Open,High,Low,Close,Volume,day,month,year
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2025-04-04,167.145,178.1436,166.0,171.0,123159359,4,4,2025
2025-04-03,182.995,184.13,176.92,178.41,95553617,3,4,2025
2025-04-02,187.66,198.34,187.66,196.01,53679198,2,4,2025
2025-04-01,187.86,193.93,187.2,192.17,41267315,1,4,2025
2025-03-31,188.19,191.33,184.4,190.26,63547558,31,3,2025


In [8]:
# Small sample of heights (presumably centimetres) used for the binning examples.
df = pd.DataFrame(
    {
        'height': [175.0, 185.5, 185.0, 191.0, 184.5, 183, 168.0],
    }
)
df

Unnamed: 0,height
0,175.0
1,185.5
2,185.0
3,191.0
4,184.5
5,183.0
6,168.0


In [8]:
# Discretise the heights into three intervals whose edges pandas picks from the data range.
df = df.assign(height_cat=pd.cut(df['height'], bins=3))
df

Unnamed: 0,height,height_cat
0,175.0,"(167.977, 175.667]"
1,185.5,"(183.333, 191.0]"
2,185.0,"(183.333, 191.0]"
3,191.0,"(183.333, 191.0]"
4,184.5,"(183.333, 191.0]"
5,183.0,"(175.667, 183.333]"
6,168.0,"(167.977, 175.667]"


In [11]:
# Re-bin with explicit edges and readable labels; the intervals are
# right-closed, which is why 175.0 ends up in 'small'.
df = df.assign(
    height_cat=pd.cut(
        df['height'],
        bins=(160, 175, 180, 195),
        labels=['small', 'medium', 'high'],
    )
)
df

Unnamed: 0,height,height_cat
0,175.0,small
1,185.5,high
2,185.0,high
3,191.0,high
4,184.5,high
5,183.0,high
6,168.0,small


In [15]:
# One-hot encode the height category only. Naming the column explicitly keeps
# the single 'height' prefix from being applied to any other object/categorical
# column that may be added to the frame later. drop_first omits the first
# category ('small'), which then acts as the all-zeros baseline.
pd.get_dummies(
    data=df,
    columns=['height_cat'],
    prefix='height',
    drop_first=True,
    dtype=int,
)

Unnamed: 0,height,height_medium,height_high
0,175.0,0,0
1,185.5,0,1
2,185.0,0,1
3,191.0,0,1
4,184.5,0,1
5,183.0,0,1
6,168.0,0,0


In [18]:
# Each row holds a list of language codes.
df = pd.DataFrame(
    {
        'lang': [
            ['PL', 'ENG'],
            ['GER', 'ENG', 'PL', 'FRA'],
            ['RUS'],
        ],
    }
)
df

Unnamed: 0,lang
0,"[PL, ENG]"
1,"[GER, ENG, PL, FRA]"
2,[RUS]


In [19]:
# Count how many language codes each row lists.
df = df.assign(lang_number=df['lang'].map(len))
df

Unnamed: 0,lang,lang_number
0,"[PL, ENG]",2
1,"[GER, ENG, PL, FRA]",4
2,[RUS],1


In [22]:
# Flag rows whose language list contains 'PL' (1 = present, 0 = absent).
df = df.assign(pl_lang=df['lang'].map(lambda langs: int('PL' in langs)))
df

Unnamed: 0,lang,lang_number,pl_lang
0,"[PL, ENG]",2,1
1,"[GER, ENG, PL, FRA]",4,1
2,[RUS],1,0


In [23]:
# Website addresses used for the string-splitting example.
df = pd.DataFrame(
    {
        'website': ['wp.pl', 'google.com', 'onet.pl'],
    }
)
df

Unnamed: 0,website
0,wp.pl
1,google.com
2,onet.pl


In [26]:
# Split every address on the dot: piece 0 becomes the portal name and piece 1
# the domain suffix. The column label keeps its existing spelling ('extention').
new = df['website'].str.split('.', expand=True)
df = df.assign(portal=new[0], extention=new[1])

df

Unnamed: 0,website,portal,extention
0,wp.pl,wp,pl
1,google.com,google,com
2,onet.pl,onet,pl
