### Datasetda mavjud bo'lmagan kunlarni qo'shish

In [3]:
import pandas as pd
from pathlib import Path

In [2]:

pd.set_option('display.float_format', lambda v: f'{v:,.2f}')

In [4]:
data_path = Path('datasets/usd_rates60.csv')
df = pd.read_csv(data_path)
df['Date'] = pd.to_datetime(df['Date'], format='%d.%m.%Y', errors='coerce')
df = df.sort_values(['Ccy', 'Date']).dropna(subset=['Date']).reset_index(drop=True)
print(f'Satrlar: {len(df):,}; Ustunlar: {df.shape[1]}')
df.head()


Satrlar: 58; Ustunlar: 11


Unnamed: 0,id,Code,Ccy,CcyNm_RU,CcyNm_UZ,CcyNm_UZC,CcyNm_EN,Nominal,Rate,Diff,Date
0,1,840,USD,Доллар США,AQSH dollari,АҚШ доллари,US Dollar,1,8079.28,40.28,2018-09-25
1,1,840,USD,Доллар США,AQSH dollari,АҚШ доллари,US Dollar,1,8079.28,40.28,2018-09-25
2,1,840,USD,Доллар США,AQSH dollari,АҚШ доллари,US Dollar,1,8179.66,100.38,2018-10-03
3,1,840,USD,Доллар США,AQSH dollari,АҚШ доллари,US Dollar,1,8179.66,100.38,2018-10-03
4,1,840,USD,Доллар США,AQSH dollari,АҚШ доллари,US Dollar,1,8179.66,100.38,2018-10-03


In [5]:
gap_summary = []
for ccy, g in df.groupby('Ccy'):
    g = g.sort_values('Date')
    gaps = (g['Date'].diff().dt.days.fillna(0) - 1).clip(lower=0)
    gap_summary.append({'Ccy': ccy, 'missing_days': int(gaps.sum())})

gap_df = pd.DataFrame(gap_summary).sort_values('missing_days', ascending=False)
gap_df


Unnamed: 0,Ccy,missing_days
0,USD,54


In [6]:
def fill_missing_days(group: pd.DataFrame) -> pd.DataFrame:
    group = group.sort_values('Date').drop_duplicates(subset='Date', keep='last')
    full_idx = pd.date_range(group['Date'].min(), group['Date'].max(), freq='D')
    filled = group.set_index('Date').reindex(full_idx).ffill()
    filled['Date'] = filled.index
    return filled.reset_index(drop=True)

filled_df = df.groupby('Ccy', group_keys=False).apply(fill_missing_days)
filled_df = filled_df.sort_values(['Ccy', 'Date']).reset_index(drop=True)

print(f'Oldingi satrlar: {len(df):,}; Yangi satrlar: {len(filled_df):,}')
filled_df.tail(10)


Oldingi satrlar: 58; Yangi satrlar: 64


  filled_df = df.groupby('Ccy', group_keys=False).apply(fill_missing_days)


Unnamed: 0,id,Code,Ccy,CcyNm_RU,CcyNm_UZ,CcyNm_UZC,CcyNm_EN,Nominal,Rate,Diff,Date
54,1.0,840.0,USD,Доллар США,AQSH dollari,АҚШ доллари,US Dollar,1.0,8271.27,21.3,2018-11-18
55,1.0,840.0,USD,Доллар США,AQSH dollari,АҚШ доллари,US Dollar,1.0,8271.27,21.3,2018-11-19
56,1.0,840.0,USD,Доллар США,AQSH dollari,АҚШ доллари,US Dollar,1.0,8277.53,6.26,2018-11-20
57,1.0,840.0,USD,Доллар США,AQSH dollari,АҚШ доллари,US Dollar,1.0,8277.53,6.26,2018-11-21
58,1.0,840.0,USD,Доллар США,AQSH dollari,АҚШ доллари,US Dollar,1.0,8277.53,6.26,2018-11-22
59,1.0,840.0,USD,Доллар США,AQSH dollari,АҚШ доллари,US Dollar,1.0,8277.53,6.26,2018-11-23
60,1.0,840.0,USD,Доллар США,AQSH dollari,АҚШ доллари,US Dollar,1.0,8277.53,6.26,2018-11-24
61,1.0,840.0,USD,Доллар США,AQSH dollari,АҚШ доллари,US Dollar,1.0,8277.53,6.26,2018-11-25
62,1.0,840.0,USD,Доллар США,AQSH dollari,АҚШ доллари,US Dollar,1.0,8277.53,6.26,2018-11-26
63,1.0,840.0,USD,Доллар США,AQSH dollari,АҚШ доллари,US Dollar,1.0,8299.2,21.67,2018-11-27


In [7]:
has_gap = (
    filled_df.sort_values(['Ccy', 'Date'])
    .groupby('Ccy')['Date']
    .diff()
    .dt.days
    .fillna(1)
    .gt(1)
    .any()
)
print('Bo`shliq qoldimi?:', bool(has_gap))
filled_df.tail()


Bo`shliq qoldimi?: False


Unnamed: 0,id,Code,Ccy,CcyNm_RU,CcyNm_UZ,CcyNm_UZC,CcyNm_EN,Nominal,Rate,Diff,Date
59,1.0,840.0,USD,Доллар США,AQSH dollari,АҚШ доллари,US Dollar,1.0,8277.53,6.26,2018-11-23
60,1.0,840.0,USD,Доллар США,AQSH dollari,АҚШ доллари,US Dollar,1.0,8277.53,6.26,2018-11-24
61,1.0,840.0,USD,Доллар США,AQSH dollari,АҚШ доллари,US Dollar,1.0,8277.53,6.26,2018-11-25
62,1.0,840.0,USD,Доллар США,AQSH dollari,АҚШ доллари,US Dollar,1.0,8277.53,6.26,2018-11-26
63,1.0,840.0,USD,Доллар США,AQSH dollari,АҚШ доллари,US Dollar,1.0,8299.2,21.67,2018-11-27


In [None]:
output_path = Path('datasets/usd_rates_filled.csv')
filled_df.to_csv(output_path, index=False)
output_path

### Lag, rolling va calendar featurelar 

Quyidagi kod `rate` bo'yicha 1, 7 va 30 kunlik lag va rolling xususiyatlarni hosil qiladi.

In [12]:
# Faylni yuklaymiz
data_path = Path('datasets/usd_rates_filled.csv')
df = pd.read_csv(data_path)

# Ustun nomlarini qulay foydalanish uchun kichik harflarga o'girib olamiz
df.columns = [c.lower() for c in df.columns]

# Sanani datetime formatiga o'tkazib, vaqt bo'yicha tartiblaymiz
df['date'] = pd.to_datetime(df['date'])
df = df.sort_values('date')

# Faqat kerakli ustunlarni olamiz: sana va kurs (rate)
df = df[['rate', 'date', 'diff']].reset_index(drop=True)

df.head()

Unnamed: 0,rate,date,diff
0,8226.1,2018-10-26,18.06
1,8226.1,2018-10-27,18.06
2,8226.1,2018-10-28,18.06
3,8226.1,2018-10-29,18.06
4,8236.87,2018-10-30,10.77


In [13]:
# Lag va rolling oynalar (kunlarda)
lags = [1, 7, 30]
windows = [1, 7, 30]

features = df.copy()

for lag in lags:
    # "lag_{lag}" — kursning avvalgi {lag} kunlik qiymati
    features[f'lag_{lag}'] = features['rate'].shift(lag)

for win in windows:
    # "roll_min_{win}" — so'nggi {win} kun ichidagi minimal kurs
    features[f'roll_min_{win}'] = features['rate'].rolling(win).min()
    # "roll_max_{win}" — so'nggi {win} kun ichidagi maksimal kurs
    features[f'roll_max_{win}'] = features['rate'].rolling(win).max()
    # "roll_mean_{win}" — so'nggi {win} kunlik o'rtacha kurs
    features[f'roll_mean_{win}'] = features['rate'].rolling(win).mean()
    # "roll_median_{win}" — so'nggi {win} kunlik median kurs
    features[f'roll_median_{win}'] = features['rate'].rolling(win).median()
    # "roll_std_{win}" — so'nggi {win} kunlik dispersiya (standart chetlanish)
    #features[f'roll_std_{win}'] = features['rate'].rolling(win).std()

#features.head(35)
# Modelga tayyor bo'lishi uchun NaNlarni tashlab yuboramiz (lag/rolling tufayli paydo bo'lgan)
features_clean = features.dropna().reset_index(drop=True)

# Natijaviy ustunlar ro'yxati
features_clean.head()

Unnamed: 0,rate,date,diff,lag_1,lag_7,lag_30,roll_min_1,roll_max_1,roll_mean_1,roll_median_1,roll_min_7,roll_max_7,roll_mean_7,roll_median_7,roll_min_30,roll_max_30,roll_mean_30,roll_median_30
0,8277.53,2018-11-25,6.26,8277.53,8271.27,8226.1,8277.53,8277.53,8277.53,8277.53,8271.27,8277.53,8276.64,8277.53,8226.1,8277.53,8255.01,8249.97
1,8277.53,2018-11-26,6.26,8277.53,8271.27,8226.1,8277.53,8277.53,8277.53,8277.53,8277.53,8277.53,8277.53,8277.53,8226.1,8277.53,8256.72,8249.97
2,8299.2,2018-11-27,21.67,8277.53,8277.53,8226.1,8299.2,8299.2,8299.2,8299.2,8277.53,8299.2,8280.63,8277.53,8226.1,8299.2,8259.16,8260.62
3,8299.2,2018-11-28,21.67,8299.2,8277.53,8226.1,8299.2,8299.2,8299.2,8299.2,8277.53,8299.2,8283.72,8277.53,8236.87,8299.2,8261.6,8271.27
4,8299.2,2018-11-29,21.67,8299.2,8277.53,8236.87,8299.2,8299.2,8299.2,8299.2,8277.53,8299.2,8286.82,8277.53,8236.87,8299.2,8263.67,8271.27


In [14]:
# Kalendar xususiyatlari: kun/oy/chorak va dam olish flaglari (hammasi raqam)
calendar_features = features.copy()
calendar_features['day_of_week'] = calendar_features['date'].dt.dayofweek  # haftaning kuni (0=dushanba)
calendar_features['day_of_month'] = calendar_features['date'].dt.day       # oy kuni (1-31)
calendar_features['month'] = calendar_features['date'].dt.month            # oy (1-12)
calendar_features['quarter'] = calendar_features['date'].dt.quarter        # chorak (1-4)
calendar_features['is_month_start'] = calendar_features['date'].dt.is_month_start.astype(int)  # oy boshimi (1/0)
calendar_features['is_month_end'] = calendar_features['date'].dt.is_month_end.astype(int)      # oy oxirimi (1/0)
calendar_features['is_weekend'] = calendar_features['date'].dt.dayofweek.isin([5,6]).astype(int)  # dam olishmi (1/0)

# Lag/rolling + kalendar ustunlarini tozalaymiz (NaNlarni olib tashlaymiz)
calendar_features_clean = calendar_features.dropna().reset_index(drop=True)
calendar_features_clean.sample(5)


Unnamed: 0,rate,date,diff,lag_1,lag_7,lag_30,roll_min_1,roll_max_1,roll_mean_1,roll_median_1,...,roll_max_30,roll_mean_30,roll_median_30,day_of_week,day_of_month,month,quarter,is_month_start,is_month_end,is_weekend
1412,11063.05,2022-10-07,34.91,11028.14,11014.01,10971.22,11063.05,11063.05,11063.05,11063.05,...,11063.05,10989.79,10992.41,4,7,10,4,0,0,0
470,9513.84,2020-03-09,-12.44,9513.84,9526.28,9541.74,9513.84,9513.84,9513.84,9513.84,...,9541.74,9527.93,9528.49,0,9,3,1,0,0,0
595,10184.73,2020-07-12,11.35,10184.73,10173.38,10153.19,10184.73,10184.73,10184.73,10184.73,...,10184.73,10168.47,10172.76,6,12,7,3,0,0,1
1109,10769.78,2021-12-08,-5.56,10769.78,10778.98,10705.26,10769.78,10769.78,10769.78,10769.78,...,10787.03,10750.97,10757.57,2,8,12,4,0,0,0
76,8401.53,2019-02-09,16.59,8401.53,8384.94,8337.07,8401.53,8401.53,8401.53,8401.53,...,8401.53,8373.03,8378.18,5,9,2,1,0,0,1


In [15]:
calendar_features_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2572 entries, 0 to 2571
Data columns (total 25 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   rate            2572 non-null   float64       
 1   date            2572 non-null   datetime64[ns]
 2   diff            2572 non-null   float64       
 3   lag_1           2572 non-null   float64       
 4   lag_7           2572 non-null   float64       
 5   lag_30          2572 non-null   float64       
 6   roll_min_1      2572 non-null   float64       
 7   roll_max_1      2572 non-null   float64       
 8   roll_mean_1     2572 non-null   float64       
 9   roll_median_1   2572 non-null   float64       
 10  roll_min_7      2572 non-null   float64       
 11  roll_max_7      2572 non-null   float64       
 12  roll_mean_7     2572 non-null   float64       
 13  roll_median_7   2572 non-null   float64       
 14  roll_min_30     2572 non-null   float64       
 15  roll

In [17]:
output_path = Path('datasets/usd_rates_ready.csv')
calendar_features_clean.to_csv(output_path, index=False)
output_path

WindowsPath('datasets/usd_rates_ready.csv')