In [1]:
!pip install lightgbm

Collecting lightgbm
  Downloading lightgbm-4.0.0-py3-none-win_amd64.whl (1.3 MB)
Installing collected packages: lightgbm
Successfully installed lightgbm-4.0.0


In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
from sklearn.metrics import mean_squared_error

In [3]:
# Load your dataset
data = pd.read_csv('train.csv')

In [5]:
# Identify and handle missing values
null_counts = data.isnull().sum()
print(null_counts)

UID        0
col_0    139
col_1     86
col_2      0
col_3      0
col_4      0
col_5      0
col_6    112
y          0
dtype: int64


In [6]:
# Fill missing values in numeric columns with mean
data.fillna(' ', inplace=True)

In [7]:
# Fill missing values in text columns with "Unknown" (you can use any other value)
text_columns = ['UID', 'col_0', 'col_1', 'col_2', 'col_3', 'col_4', 'col_5', 'col_6']
for col in text_columns:
    data[col].fillna("Unknown", inplace=True)

In [8]:
# Label encode text columns
label_encoders = {}
for col in text_columns:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
    label_encoders[col] = le

In [9]:
# Split the data into features (X) and target (y)
X = data.drop('y', axis=1)
X = X.drop('UID', axis=1)
y = data['y']
y = y/1000000

In [10]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a LightGBM dataset
train_data = lgb.Dataset(X_train, label=y_train)

In [11]:
# Set hyperparameters for the LightGBM model (you can tune these)
params = {
    'objective': 'regression',
    'metric': 'mean_squared_error',
    'boosting_type': 'gbdt',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
}

# Train the LightGBM model
num_round = 100
bst = lgb.train(params, train_data, num_round)

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 165
[LightGBM] [Info] Number of data points in the train set: 2102, number of used features: 7
[LightGBM] [Info] Start training from score 0.137180


In [12]:
# Make predictions
y_pred = bst.predict(X_test, num_iteration=bst.best_iteration)

# Calculate the mean squared error
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error on Test Data: {mse}')

# You can use y_pred for further analysis or evaluation

Mean Squared Error on Test Data: 0.002631299575532398


In [16]:
# Prepare new input data in the same format as your original dataset
# You can create a DataFrame with the same column names and encode text columns
new_data = pd.DataFrame({
    'col_0': ['A0'],
    'col_1': ['B0'],
    'col_2': ['C2'],
    'col_3': ['D1'],
    'col_4': [100],
    'col_5': ['E1'],
    'col_6': ['F2']
})

In [17]:
# Label encode text columns using the same label encoders as before
text_columns = ['col_0', 'col_1', 'col_2', 'col_3', 'col_4', 'col_5', 'col_6']
for col in text_columns:
    le = label_encoders[col]
    new_data[col] = le.transform(new_data[col])

In [20]:
# Make predictions with the loaded model
prediction = bst.predict(new_data, num_iteration=bst.best_iteration)

# Print the prediction
print(f'Predicted Output: {prediction[0]*1000000}')

Predicted Output: 164011.62217996744
