In [1]:
import pandas as pd 
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score

In [2]:
startups_df = pd.read_excel('startups.xlsx', sheet_name='Request 2')

In [3]:
# Initialize the TF-IDF Vectorizer
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(startups_df['Description'])

In [4]:
# Define the model
model = lgb.LGBMClassifier()
# Define the parameter grid
#Tried the grid search with much more parameters keeping only the minimal ones here for faster running
param_grid = {
    'num_leaves': [150, 200],
    'learning_rate': [0.1, 0.2],
    'n_estimators': [300, 400, 500],
}
# Set up GridSearchCV
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='accuracy', verbose=1)
X = tfidf_matrix  # The feature matrix
y = startups_df['Industry'] 
# Fit GridSearchCV on the entire dataset
grid_search.fit(X, y)
# Best cross-validation score
best_cv_score = grid_search.best_score_
print(f'Best Cross-Validation Accuracy: {best_cv_score}')

Fitting 5 folds for each of 4 candidates, totalling 20 fits
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.221314 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 339105
[LightGBM] [Info] Number of data points in the train set: 48071, number of used features: 7077
[LightGBM] [Info] Start training from score -0.514041
[LightGBM] [Info] Start training from score -1.946389
[LightGBM] [Info] Start training from score -1.350396
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.250817 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 380432
[LightGBM] [Info] Number of data points in the train set: 48071, number of used features: 7759
[LightGBM] [Info] Start training from score -0.514041
[LightGBM] [Info] Start training from score -1.946389
[LightGBM] [Info] Start training from score -1.350396
[LightGBM] [Info] Auto-choosin

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.300847 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 402332
[LightGBM] [Info] Number of data points in the train set: 48071, number of used features: 8089
[LightGBM] [Info] Start training from score -0.514041
[LightGBM] [Info] Start training from score -1.946389
[LightGBM] [Info] Start training from score -1.350396
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.410397 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 420035
[LightGBM] [Info] Number of data points in the train set: 48071, number of used features: 8361
[LightGBM] [Info] Start training from score -0.514007
[LightGBM] [Info] Start training from score -1.946389
[LightGBM] [Info] Start training from score -1.350477


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.394806 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 435932
[LightGBM] [Info] Number of data points in the train set: 48072, number of used features: 8593
[LightGBM] [Info] Start training from score -0.514027
[LightGBM] [Info] Start training from score -1.946410
[LightGBM] [Info] Start training from score -1.350417


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.447006 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 339105
[LightGBM] [Info] Number of data points in the train set: 48071, number of used features: 7077
[LightGBM] [Info] Start training from score -0.514041
[LightGBM] [Info] Start training from score -1.946389
[LightGBM] [Info] Start training from score -1.350396




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.293674 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 380432
[LightGBM] [Info] Number of data points in the train set: 48071, number of used features: 7759
[LightGBM] [Info] Start training from score -0.514041
[LightGBM] [Info] Start training from score -1.946389
[LightGBM] [Info] Start training from score -1.350396




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.328043 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 402332
[LightGBM] [Info] Number of data points in the train set: 48071, number of used features: 8089
[LightGBM] [Info] Start training from score -0.514041
[LightGBM] [Info] Start training from score -1.946389
[LightGBM] [Info] Start training from score -1.350396


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.327080 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 420035
[LightGBM] [Info] Number of data points in the train set: 48071, number of used features: 8361
[LightGBM] [Info] Start training from score -0.514007
[LightGBM] [Info] Start training from score -1.946389
[LightGBM] [Info] Start training from score -1.350477




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.326334 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 435932
[LightGBM] [Info] Number of data points in the train set: 48072, number of used features: 8593
[LightGBM] [Info] Start training from score -0.514027
[LightGBM] [Info] Start training from score -1.946410
[LightGBM] [Info] Start training from score -1.350417






[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.297872 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 339105
[LightGBM] [Info] Number of data points in the train set: 48071, number of used features: 7077
[LightGBM] [Info] Start training from score -0.514041
[LightGBM] [Info] Start training from score -1.946389
[LightGBM] [Info] Start training from score -1.350396








[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.305688 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 380432
[LightGBM] [Info] Number of data points in the train set: 48071, number of used features: 7759
[LightGBM] [Info] Start training from score -0.514041
[LightGBM] [Info] Start training from score -1.946389
[LightGBM] [Info] Start training from score -1.350396










[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.330740 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 402332
[LightGBM] [Info] Number of data points in the train set: 48071, number of used features: 8089
[LightGBM] [Info] Start training from score -0.514041
[LightGBM] [Info] Start training from score -1.946389
[LightGBM] [Info] Start training from score -1.350396










[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.527668 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 420035
[LightGBM] [Info] Number of data points in the train set: 48071, number of used features: 8361
[LightGBM] [Info] Start training from score -0.514007
[LightGBM] [Info] Start training from score -1.946389
[LightGBM] [Info] Start training from score -1.350477










[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.330675 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 435932
[LightGBM] [Info] Number of data points in the train set: 48072, number of used features: 8593
[LightGBM] [Info] Start training from score -0.514027
[LightGBM] [Info] Start training from score -1.946410
[LightGBM] [Info] Start training from score -1.350417












[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.681178 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 470075
[LightGBM] [Info] Number of data points in the train set: 60089, number of used features: 9056
[LightGBM] [Info] Start training from score -0.514032
[LightGBM] [Info] Start training from score -1.946393
[LightGBM] [Info] Start training from score -1.350417
Best Cross-Validation Accuracy: 0.8770488895100389


In [5]:
# Best parameters
best_params = grid_search.best_params_
print(f'Best Parameters: {best_params}')

Best Parameters: {'learning_rate': 0.1, 'n_estimators': 500, 'num_leaves': 200}


In [6]:
# Best cross-validation score
best_cv_score = grid_search.best_score_
print(f'Best Cross-Validation Accuracy: {best_cv_score * 100:.4f}%')

Best Cross-Validation Accuracy: 87.7049%
