# LightGBM


-------------------------------------------------------------------

----------------------------------------------------------------------


## Imports and functions

In [1]:
# Third-party libraries used by the rest of the notebook.
import numpy as np
import pandas as pd
from tqdm import tqdm

# Hook tqdm into pandas (per the tqdm docs this enables `progress_apply` and friends).
tqdm.pandas()

# Folder holding the Quora CSV files. The trailing slash matters because the
# file names below are appended by plain string concatenation.
DSET_FOLDER_PATH = './dataset/quora/'

# Read both dataset splits into DataFrames.
train_dset_df = pd.read_csv(DSET_FOLDER_PATH + "train.csv")
test_dset_df = pd.read_csv(DSET_FOLDER_PATH + "test.csv")

In [2]:
from sklearn.metrics import f1_score, precision_score, recall_score, confusion_matrix
from sklearn.model_selection import StratifiedKFold

def summarize(model, X, y):
    """Print classification metrics for `model` on `(X, y)` and return the F1 score.

    Predictions are obtained with `model.predict(X)` and passed through
    `np.round` before being compared with `y`.

    Args:
        model: Object exposing `predict(X)`.
        X: Features handed to `model.predict` unchanged.
        y: Ground-truth labels compared against the rounded predictions.

    Returns:
        The value of `f1_score(y, yhat)` for the rounded predictions.
    """
    yhat = np.round(model.predict(X))
    # Compute F1 once and reuse it for both the printed report and the return
    # value instead of scoring the same predictions twice.
    f1 = f1_score(y, yhat)
    print("F1 score:", f1)
    print("Precision:", precision_score(y, yhat))
    print("Recall:", recall_score(y, yhat))
    print("Confusion matrix:")
    print(confusion_matrix(y, yhat))
    return f1

def cross_validate(model, n_folds, X, y):
    """Run stratified k-fold cross-validation and print per-fold metrics.

    For every fold, `model.fit` is called on the training split and
    `summarize` is called on the training split and then on the held-out
    split. Once all folds are done, the mean and median of the held-out F1
    scores are printed.

    Args:
        model: Object exposing `fit(X, y)`; it is also passed to `summarize`.
        n_folds: Number of splits given to `StratifiedKFold`.
        X: Feature matrix, indexed here as `X[rows, :]`.
        y: Label array, indexed here as `y[rows]`.

    Returns:
        None; all results are printed.
    """
    splitter = StratifiedKFold(n_splits=n_folds)
    held_out_f1 = []
    for round_no, (fit_rows, held_out_rows) in enumerate(splitter.split(X, y)):
        print("Round number", round_no)
        # Slice out this fold's training and held-out portions.
        fit_X, fit_y = X[fit_rows, :], y[fit_rows]
        held_out_X, held_out_y = X[held_out_rows, :], y[held_out_rows]
        model.fit(fit_X, fit_y)
        print("\n\nTraining:")
        summarize(model, fit_X, fit_y)
        print("Testing:")
        # Only the held-out F1 feeds the aggregate statistics below.
        held_out_f1.append(summarize(model, held_out_X, held_out_y))
        print("-" * 15)
    held_out_f1 = np.array(held_out_f1)
    print("Mean validation f1 score:", np.mean(held_out_f1))
    print("Median validation f1 score:", np.median(held_out_f1))

## Vectorization

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF features over the raw question text.
vectorizer = TfidfVectorizer(
    # Keep the original casing of the text.
    lowercase=False,
    # A token is either a run of two or more word characters, or one of the
    # single punctuation marks ! ? " '
    token_pattern=r"(?u)\b\w\w+\b|!|\?|\"|\'",
    # Use unigrams, bigrams and trigrams.
    ngram_range=(1, 3),
)

In [4]:
# Fit the TF-IDF vectorizer on the training questions and vectorize them in one pass.
train_X = vectorizer.fit_transform(train_dset_df["question_text"])
# Labels as a NumPy array, so `cross_validate` can index them with fold index arrays.
train_y = train_dset_df["target"].to_numpy()

## Model Cross-Validation

In [5]:
from lightgbm import LGBMClassifier

In [6]:
# LightGBM classifier evaluated by the cross-validation cell below.
# - `class_weight` weights class 1 at 2.5 against 1 for class 0.
# - `verbose=3` produces the [Info]/[Debug] lines seen in the recorded output.
#   The former `silent=False` argument is dropped: it has no effect once
#   `verbose` is given and, as far as I recall, is deprecated and later
#   removed from the scikit-learn wrapper's constructor.
# - The duplicated `model = model = ...` assignment target is collapsed to one.
# NOTE(review): the recorded training log recommends `force_col_wise=true` to
# skip the histogram-layout probe; not applied here so results stay unchanged.
model = LGBMClassifier(
    num_leaves=16,
    learning_rate=0.1,
    n_estimators=1000,
    class_weight={0: 1, 1: 2.5},
    verbose=3,
)

In [7]:
# 10-fold stratified cross-validation of `model` on the TF-IDF training features.
cross_validate(model, 10, train_X, train_y)

Round number 0
[LightGBM] [Info] Number of positive: 43606, number of negative: 661699
[LightGBM] [Debug] Dataset::GetMultiBinFromSparseFeatures: sparse rate 0.999807
[LightGBM] [Debug] Dataset::GetMultiBinFromAllFeatures: sparse rate 0.999798
[LightGBM] [Debug] init for col-wise cost 432.594212 seconds, init for row-wise cost 435.354900 seconds
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1103280
[LightGBM] [Info] Number of data points in the train set: 705305, number of used features: 110577
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.141447 -> initscore=-1.803325
[LightGBM] [Info] Start training from score -1.803325
[LightGBM] [Debug] Trained a tree with leaves = 16 and max_depth = 11
[LightGBM] [Debug] Trained a tree with leaves = 16 and max_depth = 10
[LightGBM] [Debug] Trained a tree with leaves = 16 and max_depth = 12
[LightGBM] [Debug] Trained a tree with leaves = 16 and max_depth = 11
[LightGBM] [Debug] Trained a tree with leaves

from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(lowercase=False, token_pattern=r"(?u)\b\w\w+\b|!|\?|\"|\'", ngram_range=(1,3))

lsvm = LinearSVC(C=0.24, class_weight={0:1,1:1}, max_iter=10000)

logistic = LogisticRegression(C=1, class_weight = {0:1,1:3.8}, max_iter=1000, n_jobs=6)

model=Logistic_Separator(lsvm, logistic)


trigrams

Mean validation f1 score: 0.633484221217779

Median validation f1 score: 0.633362960290482

## Testset Write

In [30]:
# Disabled: vectorize the test questions and fit the model on the full training set.
# test_X  = vectorizer.transform(test_dset_df["question_text"])
# model.fit(train_X, train_y)

Fitting the base model.
Fitting the intermediate


In [31]:
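# Disabled: predict on the test set and write the predictions to a CSV file.
# NOTE: in the visible notebook `output_df` is only created by this commented-out
# code, so any cell that uses `output_df` needs this block re-enabled first.
# test_yhat = model.predict(test_X)
# output_df = test_dset_df.copy()
# output_df.info()
# output_df.drop(inplace=True, axis=1, labels="question_text")
# output_df["preprocessed_joined"] = test_yhat
# output_df = output_df.rename(columns={"qid":"qid", "preprocessed_joined":"target"})
# output_df.target = output_df.target.apply(round)
# output_df.to_csv("./outputs/2020_11_28_c_testset_output.csv", index=False)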


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 522449 entries, 0 to 522448
Data columns (total 2 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   qid            522449 non-null  object
 1   question_text  522449 non-null  object
dtypes: object(2)
memory usage: 8.0+ MB


In [32]:
# Display the prediction frame.
# NOTE(review): in the visible notebook `output_df` is only assigned inside
# commented-out code, so this cell presumably fails on a fresh kernel — confirm.
output_df

Unnamed: 0,qid,target
0,f56a9a31974dc66186e8,0
1,d957c3758060f45da303,0
2,ad822d5abaedb9e247b9,0
3,4e979c23eeb6a4bd1f2e,0
4,333cc031262566b8da49,0
...,...,...
522444,e8e6aa5226f36c27fe41,0
522445,015fd068afcb9d0b4007,0
522446,9f0ef49eff6a3ff9e735,0
522447,d6b02f52f76dc4c22afd,0
