In [1]:
import pandas as pd
import numpy as np
import math
from sklearn.model_selection import train_test_split, KFold, RandomizedSearchCV
import lightgbm as lgb
from sklearn.metrics import mean_absolute_error
import warnings

In [2]:
file_train = '/content/counterfeit_train.csv'
file_test = '/content/counterfeit_test.csv'
df_train = pd.read_csv(file_train, delimiter=',', encoding="utf-8-sig")
df_test = pd.read_csv(file_test, delimiter=',', encoding="utf-8-sig")

In [3]:
df_train

Unnamed: 0,Medicine_ID,Counterfeit_Weight,DistArea_ID,Active_Since,Medicine_MRP,Medicine_Type,SidEffect_Level,Availability_rating,Area_Type,Area_City_Type,Area_dist_level,Counterfeit_Sales
0,RRA15,13.100,Area046,1995,160.2366,Antimalarial,critical,0.070422,DownTown,Tier 1,Small,1775.5026
1,YVV26,,Area027,1983,110.4384,Mstablizers,mild,0.013000,CityLimits,Tier 3,Medium,3069.1520
2,LJC15,9.025,Area046,1995,259.4092,Cardiac,mild,0.060783,DownTown,Tier 1,Small,2603.0920
3,GWC40,11.800,Area046,1995,99.9830,OralContraceptives,mild,0.065555,DownTown,Tier 1,Small,1101.7130
4,QMN13,,Area019,1983,56.4402,Hreplacements,critical,0.248859,MidTownResidential,Tier 1,Small,158.9402
...,...,...,...,...,...,...,...,...,...,...,...,...
6813,OYN80,8.535,Area046,1995,204.1452,Hreplacements,mild,0.112963,DownTown,Tier 1,Small,2070.4520
6814,ACW12,20.650,Area046,1995,235.1088,Hreplacements,mild,0.131103,DownTown,Tier 1,Small,2126.3792
6815,OPM10,20.000,Area017,2005,193.6292,Antimalarial,critical,0.105096,DownTown,Tier 2,Unknown,2119.7212
6816,SLY12,10.180,Area045,2000,162.8682,Statins,mild,0.099957,DownTown,Tier 2,Unknown,1485.2138


In [4]:
df_test

Unnamed: 0,Medicine_ID,Counterfeit_Weight,DistArea_ID,Active_Since,Medicine_MRP,Medicine_Type,SidEffect_Level,Availability_rating,Area_Type,Area_City_Type,Area_dist_level
0,HLZ81,,Area027,1983,85.5328,Antibiotics,mild,0.112747,CityLimits,Tier 3,Medium
1,ECE94,13.45,Area045,2000,257.1460,OralContraceptives,mild,0.144446,DownTown,Tier 2,Unknown
2,SAD14,7.10,Area045,2000,98.1172,Antipyretics,mild,0.144221,DownTown,Tier 2,Unknown
3,EQV63,18.30,Area010,1996,135.3730,Tranquilizers,mild,0.100388,MidTownResidential,Tier 3,Unknown
4,AIR10,,Area019,1983,112.8016,OralContraceptives,mild,0.022585,MidTownResidential,Tier 1,Small
...,...,...,...,...,...,...,...,...,...,...,...
1700,KXW10,,Area027,1983,136.5704,Hreplacements,mild,0.050505,CityLimits,Tier 3,Medium
1701,CKE54,21.30,Area035,2002,57.0744,Antibiotics,critical,0.041118,DownTown,Tier 2,Small
1702,HAY13,20.40,Area017,2005,182.7422,Antiseptics,mild,0.191273,DownTown,Tier 2,Unknown
1703,ZEE32,20.00,Area018,2007,266.9672,Hreplacements,mild,0.013000,Industrial,Tier 3,Medium


In [5]:
df_train.isnull().sum()

Unnamed: 0,0
Medicine_ID,0
Counterfeit_Weight,1166
DistArea_ID,0
Active_Since,0
Medicine_MRP,0
Medicine_Type,0
SidEffect_Level,0
Availability_rating,0
Area_Type,0
Area_City_Type,0


In [6]:
df_test.isnull().sum()

Unnamed: 0,0
Medicine_ID,0
Counterfeit_Weight,297
DistArea_ID,0
Active_Since,0
Medicine_MRP,0
Medicine_Type,0
SidEffect_Level,0
Availability_rating,0
Area_Type,0
Area_City_Type,0


In [7]:
df_train['Medicine_ID'].nunique()

1557

In [8]:
df_train.shape

(6818, 12)

In [9]:
train_new=df_train

In [10]:
test_new=df_test

In [11]:
for col in ['Medicine_Type','SidEffect_Level','Area_Type','Area_City_Type','Area_dist_level', 'DistArea_ID']:
   temp=pd.get_dummies(train_new[col],prefix=col,drop_first=True)
   train_new=pd.concat([temp,train_new],axis=1)
   train_new.drop([col],axis=1,inplace=True)

   temp=pd.get_dummies(test_new[col],prefix=col,drop_first=True)
   test_new=pd.concat([temp,test_new],axis=1)
   test_new.drop([col],axis=1,inplace=True)

In [12]:
train_new.isnull().sum()

Unnamed: 0,0
DistArea_ID_Area013,0
DistArea_ID_Area017,0
DistArea_ID_Area018,0
DistArea_ID_Area019,0
DistArea_ID_Area027,0
DistArea_ID_Area035,0
DistArea_ID_Area045,0
DistArea_ID_Area046,0
DistArea_ID_Area049,0
Area_dist_level_Medium,0


In [13]:
train_new['Counterfeit_Weight'].fillna(train_new['Counterfeit_Weight'].mean(),inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_new['Counterfeit_Weight'].fillna(train_new['Counterfeit_Weight'].mean(),inplace=True)


In [14]:
train_new['Counterfeit_Weight'].isnull().sum()

np.int64(0)

In [15]:
train_new.shape

(6818, 39)

In [16]:
X = train_new.drop(['Medicine_ID','Counterfeit_Sales'], axis=1)

In [17]:
y = train_new['Counterfeit_Sales']

In [18]:
X.shape,y.shape

((6818, 37), (6818,))

In [19]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [20]:
model = lgb.LGBMRegressor(objective='regression', device='gpu',  random_state=1)

In [21]:
random_param_grid = [{
    'n_estimators':[250, 500, 1000],
    'learning_rate':[0.0025, 0.005, 0.01],
    'max_depth':[-1],
    'num_leaves':[7, 15],
    'min_child_samples':[1, 10, 20],
    'colsample_bytree':[0.5, 0.7, 1.0],
    'subsample':[0.5, 0.7, 1.0],
    'reg_alpha':[1.6, 2.4]
}]

In [22]:
rs = RandomizedSearchCV(model, random_param_grid, cv = 10, verbose = 2)

In [23]:
rs.fit(X_train,y_train)

Fitting 10 folds for each of 10 candidates, totalling 100 fits
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 811
[LightGBM] [Info] Number of data points in the train set: 4908, number of used features: 37
[LightGBM] [Info] Using GPU Device: Tesla T4, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 10 dense feature groups (0.06 MB) transferred to GPU in 0.000736 secs. 0 sparse feature groups
[LightGBM] [Info] Start training from score 2284.167696
[CV] END colsample_bytree=0.5, learning_rate=0.0025, max_depth=-1, min_child_samples=1, n_estimators=500, num_leaves=7, reg_alpha=1.6, subsample=0.7; total time=   6.6s
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 812
[LightGBM] [Info] Number of data points in the train set: 4908, number of used features: 37
[LightGBM] [Info] Using GPU Devic

In [24]:
rs.best_estimator_

In [25]:
model = rs.best_estimator_

In [26]:
model.fit(X_train, y_train)

[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 812
[LightGBM] [Info] Number of data points in the train set: 5454, number of used features: 37
[LightGBM] [Info] Using GPU Device: Tesla T4, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 10 dense feature groups (0.06 MB) transferred to GPU in 0.000465 secs. 0 sparse feature groups
[LightGBM] [Info] Start training from score 2290.666144


In [27]:
val = model.predict(X_val)

In [28]:
val

array([ 352.27982758, 1028.6381919 , 3370.6385102 , ...,  884.65739089,
       4110.19268802,  813.15896316])

In [29]:
val.shape,y_val.shape

((1364,), (1364,))

In [30]:
err = y_val-val

In [31]:
MAE = mean_absolute_error(y_val,val)

In [32]:
MAE

744.9849449622889

In [33]:
Score = 1-(MAE/1660)

In [34]:
Score

0.5512138885769344

In [35]:
test_new.shape,X_train.shape

((1705, 38), (5454, 37))

In [36]:
X_test = test_new.drop(columns='Medicine_ID')

In [37]:
X_test.shape,X_val.shape

((1705, 37), (1364, 37))

In [38]:
pred=model.predict(X_test)

In [39]:
submissions = pd.DataFrame(list(zip(test_new['Medicine_ID'],list(pred))),
                       columns=['Medicine_ID','Counterfeit_Sales'])

In [40]:
submissions

Unnamed: 0,Medicine_ID,Counterfeit_Sales
0,HLZ81,2176.357713
1,ECE94,3908.346191
2,SAD14,1545.121218
3,EQV63,440.439057
4,AIR10,513.724879
...,...,...
1700,KXW10,3344.007530
1701,CKE54,867.423941
1702,HAY13,3068.519754
1703,ZEE32,3633.700179


In [41]:
submissions.to_csv('proj_submission.csv',index=False)