In [1]:
import pandas as pd
import numpy as np

retail_df = pd.read_csv(
    "retail/retail_2016_2017.csv", skiprows=range(1, 11000), nrows=1000
)

family_array = np.array(retail_df["family"])
sales_array = np.array(retail_df["sales"])

In [2]:
produce_array = sales_array[family_array == "PRODUCE"]

In [3]:
rng = np.random.default_rng(2022)

random_array = rng.random(30)

sampled_array = produce_array[random_array < 0.5]

In [4]:
mean = sampled_array.mean()

mean

2268.102470588235

In [5]:
median = np.median(sampled_array)

median

1272.755

In [6]:
np.where(
    sampled_array < median,
    "below_both",
    np.where(sampled_array > mean, "above_both", "above_median"),
)

array(['above_median', 'below_both', 'below_both', 'below_both',
       'above_both', 'below_both', 'below_both', 'above_both',
       'below_both', 'above_median', 'above_both', 'above_both',
       'below_both', 'above_median', 'above_both', 'below_both',
       'above_both'], dtype='<U12')

In [7]:
sampled_array

array([1662.394,  447.064,  962.866, 1077.44 , 3404.531,  962.96 ,
       1089.319, 7860.031,  446.038, 1272.755, 2775.771, 2339.906,
        722.333, 1567.843, 2458.456,  673.885, 8834.15 ])

In [8]:
oil = pd.read_csv("retail/oil.csv").dropna()

oil_array = np.array(oil["dcoilwtico"].iloc[1000:1100])

oil_array

array([52.22, 51.44, 51.98, 52.01, 52.82, 54.01, 53.8 , 53.75, 52.36,
       53.26, 53.77, 53.98, 51.95, 50.82, 52.19, 53.01, 52.36, 52.45,
       51.12, 51.39, 52.33, 52.77, 52.38, 52.14, 53.24, 53.18, 52.63,
       52.75, 53.9 , 53.55, 53.81, 53.01, 52.19, 52.37, 52.99, 53.84,
       52.96, 53.21, 53.11, 53.41, 53.41, 54.02, 53.61, 54.48, 53.99,
       54.04, 54.  , 53.82, 52.63, 53.33, 53.19, 52.68, 49.83, 48.75,
       48.05, 47.95, 47.24, 48.34, 48.3 , 48.34, 47.79, 47.02, 47.29,
       47.  , 47.3 , 47.02, 48.36, 49.47, 50.3 , 50.54, 50.25, 50.99,
       51.14, 51.69, 52.25, 53.06, 53.38, 53.12, 53.19, 52.62, 52.46,
       50.49, 50.26, 49.64, 48.9 , 49.22, 49.22, 48.96, 49.31, 48.83,
       47.65, 47.79, 45.55, 46.23, 46.46, 45.84, 47.28, 47.81, 47.83,
       48.86])

In [9]:
oil_series = pd.Series(oil_array, name="oil_prices")
              
oil_series

0     52.22
1     51.44
2     51.98
3     52.01
4     52.82
      ...  
95    45.84
96    47.28
97    47.81
98    47.83
99    48.86
Name: oil_prices, Length: 100, dtype: float64

In [10]:
print(f"Name: {oil_series.name}")
print(f"dtype: {oil_series.dtype}")
print(f"size: {oil_series.size}")
print(f"index: {oil_series.index}")

Name: oil_prices
dtype: float64
size: 100
index: RangeIndex(start=0, stop=100, step=1)


In [11]:
oil_series.values.mean()

51.128299999999996

In [12]:
oil_series.index

RangeIndex(start=0, stop=100, step=1)

In [13]:
oil_series.index.dtype

dtype('int64')

In [14]:
oil_series.astype("int").values.mean()

50.66

In [15]:
dates_array = np.array(oil["date"].iloc[1000:1100])

dates_array

array(['2016-12-20', '2016-12-21', '2016-12-22', '2016-12-23',
       '2016-12-27', '2016-12-28', '2016-12-29', '2016-12-30',
       '2017-01-03', '2017-01-04', '2017-01-05', '2017-01-06',
       '2017-01-09', '2017-01-10', '2017-01-11', '2017-01-12',
       '2017-01-13', '2017-01-17', '2017-01-18', '2017-01-19',
       '2017-01-20', '2017-01-23', '2017-01-24', '2017-01-25',
       '2017-01-26', '2017-01-27', '2017-01-30', '2017-01-31',
       '2017-02-01', '2017-02-02', '2017-02-03', '2017-02-06',
       '2017-02-07', '2017-02-08', '2017-02-09', '2017-02-10',
       '2017-02-13', '2017-02-14', '2017-02-15', '2017-02-16',
       '2017-02-17', '2017-02-21', '2017-02-22', '2017-02-23',
       '2017-02-24', '2017-02-27', '2017-02-28', '2017-03-01',
       '2017-03-02', '2017-03-03', '2017-03-06', '2017-03-07',
       '2017-03-08', '2017-03-09', '2017-03-10', '2017-03-13',
       '2017-03-14', '2017-03-15', '2017-03-16', '2017-03-17',
       '2017-03-20', '2017-03-21', '2017-03-22', '2017-

In [16]:
oil_series_with_dates = pd.Series(oil_array, index=dates_array, name="Oil Prices with Dates")

oil_series_with_dates

2016-12-20    52.22
2016-12-21    51.44
2016-12-22    51.98
2016-12-23    52.01
2016-12-27    52.82
              ...  
2017-05-09    45.84
2017-05-10    47.28
2017-05-11    47.81
2017-05-12    47.83
2017-05-15    48.86
Name: Oil Prices with Dates, Length: 100, dtype: float64

In [17]:
first_10 = oil_series_with_dates.iloc[0:10]
last_10 = oil_series_with_dates.iloc[-9:]

In [18]:
avg_first10 = first_10.mean()
print(avg_first10)

52.765


In [19]:
avg_last10 = last_10.mean()
print(avg_last10)

47.07222222222222


In [20]:
oil_prices_january = oil_series_with_dates.loc['2017-01-01':'2017-01-07']
oil_prices_january

2017-01-03    52.36
2017-01-04    53.26
2017-01-05    53.77
2017-01-06    53.98
Name: Oil Prices with Dates, dtype: float64

In [21]:
print(oil_prices_january.reset_index(drop=True))

0    52.36
1    53.26
2    53.77
3    53.98
Name: Oil Prices with Dates, dtype: float64


In [48]:
tenLowestPrices = oil_series_with_dates.sort_values(ascending=False)[-10:].sort_index(ascending=False)

tenLowestPrices

2017-05-10    47.28
2017-05-09    45.84
2017-05-08    46.46
2017-05-05    46.23
2017-05-04    45.55
2017-03-27    47.02
2017-03-23    47.00
2017-03-22    47.29
2017-03-21    47.02
2017-03-14    47.24
Name: Oil Prices with Dates, dtype: float64

In [60]:
given_dates = [
    "2016-12-22", 
    "2017-05-03", 
    "2017-01-06", 
    "2017-03-05", 
    "2017-02-12", 
    "2017-03-21", 
    "2017-04-14", 
    "2017-04-15", 
]

narrowed = (oil_series_with_dates.index.isin(given_dates)) & (oil_series_with_dates <= 50)

oil_series_with_dates[narrowed]

2017-03-21    47.02
2017-05-03    47.79
Name: Oil Prices with Dates, dtype: float64

In [68]:
increased_10 = oil_series_with_dates.multiply(1.10)
increased_10_2 = increased_10.add(2, fill_value=0)

print("Original Prices")
print(oil_series_with_dates)

print()
print("Increased by 10%")
print(increased_10)

print()
print("Added 2$ per barrel")
print(increased_10_2)

Original Prices
2016-12-20    52.22
2016-12-21    51.44
2016-12-22    51.98
2016-12-23    52.01
2016-12-27    52.82
              ...  
2017-05-09    45.84
2017-05-10    47.28
2017-05-11    47.81
2017-05-12    47.83
2017-05-15    48.86
Name: Oil Prices with Dates, Length: 100, dtype: float64

Increased by 10%
2016-12-20    57.442
2016-12-21    56.584
2016-12-22    57.178
2016-12-23    57.211
2016-12-27    58.102
               ...  
2017-05-09    50.424
2017-05-10    52.008
2017-05-11    52.591
2017-05-12    52.613
2017-05-15    53.746
Name: Oil Prices with Dates, Length: 100, dtype: float64

Added 2$ per barrel
2016-12-20    59.442
2016-12-21    58.584
2016-12-22    59.178
2016-12-23    59.211
2016-12-27    60.102
               ...  
2017-05-09    52.424
2017-05-10    54.008
2017-05-11    54.591
2017-05-12    54.613
2017-05-15    55.746
Name: Oil Prices with Dates, Length: 100, dtype: float64


In [74]:
max_price = oil_series_with_dates.max()
max_price

54.48

In [104]:
prices_diff = (oil_series_with_dates - max_price) / max_price
prices_diff.name = "Price difference each price and max price in %"
prices_diff

2016-12-20   -0.041483
2016-12-21   -0.055800
2016-12-22   -0.045888
2016-12-23   -0.045338
2016-12-27   -0.030470
                ...   
2017-05-09   -0.158590
2017-05-10   -0.132159
2017-05-11   -0.122430
2017-05-12   -0.122063
2017-05-15   -0.103157
Name: Price difference each price and max price in %, Length: 100, dtype: float64

In [102]:
months_series = pd.Series(dates_array, name="Months")

months_series = months_series.str.slice(5,7).astype("int")
months_series

0     12
1     12
2     12
3     12
4     12
      ..
95     5
96     5
97     5
98     5
99     5
Name: Months, Length: 100, dtype: int32