In [1]:
# Importing libraries
import pandas as pd
import numpy as np
import re

In [2]:
import lightgbm as lgb
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from catboost import CatBoostRegressor 

In [3]:
# Reading train and test data
df = pd.read_csv(r'C:\MachineHack\Participant_Data_Tea_Story\train.csv')
test = pd.read_csv(r'C:\MachineHack\Participant_Data_Tea_Story\test.csv')

In [4]:
print(f"Train set shape {df.shape}")
print(f"Test set shape {test.shape}")

Train set shape (544, 16)
Test set shape (29, 16)


## EDA, Preprocessing and Feature Engineering

In [5]:
df.head()

Unnamed: 0,WeekEnding_Date,Kolkata_Average_Price,Kolkata_Ref_Price,Bangalore_Average_Price,Bangalore_Ref_Price,Cochin_Average_Price,Cochin_Ref_Price,Darjeeling_Average_Price,Darjeeling_Ref_Price,Ernakulam_Average_Price,Ernakulam_Ref_Price,Siliguri_Average_Price,Siliguri_Ref_Price,Guwahati_Average_Price,Guwahati_Ref_Price,Average
0,03/01/09,99.01,79.79,N.S.,76.19,84.02,70.07,81.66,57.83,68.94,51.67,70.74,53.88,65.55,46.75,69.7
1,10/01/09,97.74,78.73,87.48,73.97,82.72,68.17,83.31,58.02,67.24,52.23,70.47,53.39,67.39,46.84,70.55
2,17/01/09,95.95,71.01,87.66,71.01,80.58,67.16,82.25,57.49,69.64,52.48,71.66,53.18,69.51,48.04,69.83
3,24/01/09,94.14,73.38,85.69,65.66,N.S.,65.57,80.87,54.59,N.S.,53.43,71.12,52.07,69.14,48.5,67.846667
4,31/01/09,91.45,70.39,N.S.,64.99,79.27,62.09,80.76,57.06,69.65,53.38,72.3,52.5,69.39,50.33,67.196923


In [6]:
# Checking for null values
df.isnull().sum()

WeekEnding_Date             0
Kolkata_Average_Price       0
Kolkata_Ref_Price           2
Bangalore_Average_Price     0
Bangalore_Ref_Price         2
Cochin_Average_Price        0
Cochin_Ref_Price            3
Darjeeling_Average_Price    0
Darjeeling_Ref_Price        2
Ernakulam_Average_Price     0
Ernakulam_Ref_Price         1
Siliguri_Average_Price      1
Siliguri_Ref_Price          1
Guwahati_Average_Price      0
Guwahati_Ref_Price          0
Average                     0
dtype: int64

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 544 entries, 0 to 543
Data columns (total 16 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   WeekEnding_Date           544 non-null    object 
 1   Kolkata_Average_Price     544 non-null    object 
 2   Kolkata_Ref_Price         542 non-null    object 
 3   Bangalore_Average_Price   544 non-null    object 
 4   Bangalore_Ref_Price       542 non-null    object 
 5   Cochin_Average_Price      544 non-null    object 
 6   Cochin_Ref_Price          541 non-null    object 
 7   Darjeeling_Average_Price  544 non-null    object 
 8   Darjeeling_Ref_Price      542 non-null    object 
 9   Ernakulam_Average_Price   544 non-null    object 
 10  Ernakulam_Ref_Price       543 non-null    object 
 11  Siliguri_Average_Price    543 non-null    object 
 12  Siliguri_Ref_Price        543 non-null    object 
 13  Guwahati_Average_Price    544 non-null    object 
 14  Guwahati_R

In [8]:
# Converting date to datetime and extracting weekday feature
df['weekday'] = pd.to_datetime(df['WeekEnding_Date']).dt.weekday

test['weekday'] = pd.to_datetime(test['WeekEnding_Date']).dt.weekday

In [9]:
# Setting date as index
df.set_index('WeekEnding_Date', inplace=True)
test.set_index('WeekEnding_Date', inplace=True)

In [10]:
# All numerical columns except target column 
cols = [i for i in df.columns if i not in ['Average']]

In [11]:
# 'avg' feature which is the average of all other independent columns
def avg(cols, df):
    df['avg'] = 0
    for col in cols:
        df['avg'] += df[col].astype('float')
    df['avg'] /= len(cols)
    return df

In [12]:
# Using regex to remove texts like 'No sale', 'No. Sale', 'N. S.' etc.
def remove_text(text):
    no_text = re.sub('[^0-9.]', '', text)
    return no_text

In [13]:
# Removing text and filling the NaN values using forward and backward filling technique
for i in cols:
    df[i] = df[i].apply(lambda x: remove_text(str(x)))
    df[i] = df[i].apply(lambda x: np.nan if x in ['..', '', '.'] else x)
    df[i] = df[i].astype('float')
    df[i].fillna(method='ffill', inplace=True)
    df[i].fillna(method='bfill', inplace=True)
    
    
for i in cols:
    test[i] = test[i].apply(lambda x: remove_text(str(x)))
    test[i] = test[i].apply(lambda x: np.nan if x in ['..', '', '.'] else x)
    test[i] = test[i].astype('float')   
    test[i].fillna(method='ffill', inplace=True)
    test[i].fillna(method='bfill', inplace=True)

In [14]:
# Calling 'avg' function to create average feature
df = avg(cols, df)
test = avg(cols, test)


test.drop('Average', axis=1, inplace=True)

In [15]:
df.describe()

Unnamed: 0,Kolkata_Average_Price,Kolkata_Ref_Price,Bangalore_Average_Price,Bangalore_Ref_Price,Cochin_Average_Price,Cochin_Ref_Price,Darjeeling_Average_Price,Darjeeling_Ref_Price,Ernakulam_Average_Price,Ernakulam_Ref_Price,Siliguri_Average_Price,Siliguri_Ref_Price,Guwahati_Average_Price,Guwahati_Ref_Price,Average,weekday,avg
count,544.0,544.0,544.0,544.0,544.0,544.0,544.0,544.0,544.0,544.0,544.0,544.0,544.0,544.0,544.0,544.0,544.0
mean,137.25489,131.644467,127.478327,121.630588,119.12864,113.86364,100.817463,95.589173,79.787224,76.863364,84.819449,80.965404,71.446562,69.312445,100.660317,4.270221,94.32479
std,23.646185,26.102421,23.124376,25.361527,19.036986,20.637354,16.669939,18.052455,14.671259,14.324043,15.564106,15.003531,14.76651,14.153677,13.711767,1.54686,12.37457
min,67.64,64.79,66.4,55.65,71.29,57.02,62.29,0.0,0.0,50.53,11.6,52.07,0.0,43.11,64.17125,0.0,61.214667
25%,122.885,113.0075,111.715,105.1625,105.99,100.3975,86.23,82.4425,69.3375,65.5875,72.43,70.19,59.4175,57.585,89.386429,4.0,83.88
50%,143.6,133.76,130.655,123.175,121.625,116.18,100.005,95.555,77.575,74.68,83.52,80.16,69.07,66.995,102.88623,5.0,96.655
75%,155.34,152.3925,145.04,141.25,131.8275,128.4775,112.525,109.6025,92.34,86.9825,97.01,92.535,84.23,80.2275,111.793036,5.0,104.574333
max,189.38,189.38,177.29,177.33,165.07,165.07,137.24,137.25,110.75,110.75,117.47,117.47,101.73,101.73,131.453333,6.0,116.106


In [16]:
# Separating predictor and target variables
X, y = df.drop(['Average'], axis=1), df['Average']

In [17]:
X.head()

Unnamed: 0_level_0,Kolkata_Average_Price,Kolkata_Ref_Price,Bangalore_Average_Price,Bangalore_Ref_Price,Cochin_Average_Price,Cochin_Ref_Price,Darjeeling_Average_Price,Darjeeling_Ref_Price,Ernakulam_Average_Price,Ernakulam_Ref_Price,Siliguri_Average_Price,Siliguri_Ref_Price,Guwahati_Average_Price,Guwahati_Ref_Price,weekday,avg
WeekEnding_Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
03/01/09,99.01,79.79,87.48,76.19,84.02,70.07,81.66,57.83,68.94,51.67,70.74,53.88,65.55,46.75,6.0,66.638667
10/01/09,97.74,78.73,87.48,73.97,82.72,68.17,83.31,58.02,67.24,52.23,70.47,53.39,67.39,46.84,3.0,66.046667
17/01/09,95.95,71.01,87.66,71.01,80.58,67.16,82.25,57.49,69.64,52.48,71.66,53.18,69.51,48.04,5.0,65.508
24/01/09,94.14,73.38,85.69,65.66,80.58,65.57,80.87,54.59,69.64,53.43,71.12,52.07,69.14,48.5,5.0,64.625333
31/01/09,91.45,70.39,85.69,64.99,79.27,62.09,80.76,57.06,69.65,53.38,72.3,52.5,69.39,50.33,5.0,64.283333


## LightGBM  

In [18]:
def lgb_model():
    scores = []
    splits=20
    
    oof=np.zeros(len(X))
    test_pred =np.zeros(len(test))
    #holdout_pred = np.zeros(len(test_X))
    
    print('***********************************************************')
    kf = KFold(n_splits=splits, shuffle=False)
    for fold, (train_index, test_index) in enumerate(kf.split(X, y)):
        
        X_train, X_val = X.iloc[train_index], X.iloc[test_index]
        y_train, y_val = y.iloc[train_index], y.iloc[test_index]
        
        model = lgb.LGBMRegressor(n_estimators=10000, learning_rate=0.06, random_state=100, max_depth=10, num_leaves=90)#, colsample_bytree=0.8)

        model.fit(X_train, y_train, eval_set=[(X_val, y_val)], early_stopping_rounds=300, verbose=False)
        pred = model.predict(X_val)                                               
        oof[test_index] = pred
        score = np.sqrt(mean_squared_error(y_val, pred))
        
        print(f'rmse score for fold {fold} is {score}')
        scores.append(score)
        
        test_pred += model.predict(test)
            
    print(f'\nAvg score for all folds is {np.sum(scores)/splits}')
    
    print('***********************************************************')
    print(f'\nOOF Score after completing folds is {np.sqrt(mean_squared_error(y, oof))}')
    test_df = pd.DataFrame(test_pred, columns=['Average'])
    test_df = test_df/splits
    #print(f'\nRMSE Score for HOLDOUT Data is {np.sqrt(mean_squared_log_error(np.expm1(test_y), np.expm1(holdout_pred/8)))}')
    return test_df

In [19]:
lgb_ = lgb_model()

***********************************************************
rmse score for fold 0 is 3.7077418004177902
rmse score for fold 1 is 2.204992093003887
rmse score for fold 2 is 1.6149520792580836
rmse score for fold 3 is 2.176307736411075
rmse score for fold 4 is 2.2903001521054747
rmse score for fold 5 is 4.872771557062859
rmse score for fold 6 is 2.206690383100683
rmse score for fold 7 is 5.033071680449239
rmse score for fold 8 is 3.1179431627630954
rmse score for fold 9 is 5.901846479480736
rmse score for fold 10 is 1.5950511840094912
rmse score for fold 11 is 5.805880429818514
rmse score for fold 12 is 1.765825763617948
rmse score for fold 13 is 4.4165992561761005
rmse score for fold 14 is 2.338170042155939
rmse score for fold 15 is 1.7772528109768972
rmse score for fold 16 is 2.8112437737511393
rmse score for fold 17 is 2.1729179223874073
rmse score for fold 18 is 1.9192807742518456
rmse score for fold 19 is 2.2719924899879347

Avg score for all folds is 3.000041578559307
*************

## XGBoost 

In [20]:
def xgb_model():
    scores = []
    splits=20
    
    oof=np.zeros(len(X))
    test_pred =np.zeros(len(test))
    #holdout_pred = np.zeros(len(test_X))
    
    print('***********************************************************')
    kf = KFold(n_splits=splits, shuffle=False)
    for fold, (train_index, test_index) in enumerate(kf.split(X, y)):
        
        X_train, X_val = X.iloc[train_index], X.iloc[test_index]
        y_train, y_val = y.iloc[train_index], y.iloc[test_index]
        
        model = xgb.XGBRegressor(n_estimators=10000, learning_rate=0.06, random_state=100, max_depth=10)
        model.fit(X_train, y_train, eval_set=[(X_val, y_val)], early_stopping_rounds=30, verbose=False)
        pred = model.predict(X_val)                                              
        oof[test_index] = pred
        score = np.sqrt(mean_squared_error(y_val, pred))
        
        print(f'rmse score for fold {fold} is {score}')
        scores.append(score)

        test_pred += model.predict(test)

    print(f'\nAvg score for all folds is {np.sum(scores)/splits}')
    
    print('***********************************************************')
    print(f'\nOOF Score after completing folds is {np.sqrt(mean_squared_error(y, oof))}')
    
    test_df = pd.DataFrame(test_pred, columns=['Average'])
    test_df = test_df/splits
    #print(f'\nRMSE Score for HOLDOUT Data is {np.sqrt(mean_squared_log_error(np.expm1(test_y), np.expm1(holdout_pred/8)))}')
    return test_df

In [21]:
xgb_ = xgb_model()

***********************************************************
rmse score for fold 0 is 1.782050993723448
rmse score for fold 1 is 2.153147753238721
rmse score for fold 2 is 1.9340701479985536
rmse score for fold 3 is 2.6611497214771274
rmse score for fold 4 is 1.74371058198841
rmse score for fold 5 is 5.148457478283427
rmse score for fold 6 is 1.9059715301582343
rmse score for fold 7 is 5.070603063087166
rmse score for fold 8 is 3.0465399617331133
rmse score for fold 9 is 6.065809180510796
rmse score for fold 10 is 2.3997936140967413
rmse score for fold 11 is 6.33164329646534
rmse score for fold 12 is 1.9767087124999974
rmse score for fold 13 is 4.126280888674146
rmse score for fold 14 is 2.715854968949943
rmse score for fold 15 is 2.0428405768132993
rmse score for fold 16 is 2.8536054312570416
rmse score for fold 17 is 2.983983526158258
rmse score for fold 18 is 1.7485033101303544
rmse score for fold 19 is 1.9602369968318678

Avg score for all folds is 3.032548086703799
****************

#### Weighted Average of LightGBM and XGBoost based predictions

In [22]:
sub = (0.7*lgb_) + (0.3*xgb_)

####   Saving CSV file

In [23]:
sub.to_csv(r'C:\MachineHack\Participant_Data_Tea_Story\submissions\sub_pred.csv', index=False)

In [24]:
sub.head()

Unnamed: 0,Average
0,119.618446
1,119.592254
2,119.562632
3,119.409095
4,119.39621
