In [None]:
!pip install xgboost
!pip install fastparquet

In [None]:
import pandas as pd
import numpy as no
import matplotlib.pyplot as plt
import fastparquet


from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# Citirea datelor

In [None]:
# Fetch the reviews CSV from the course's GitHub repository and preview
# the first two rows.
url = (
    'https://github.com/berinde/curs-analiza-datelor-complexe/blob/main/data/input/3.input_data_prepped_bow.csv?raw=True'
)
reviews = pd.read_csv(url)
reviews.head(n=2)

In [None]:
# Show the (rows, columns) dimensions of the reviews table
reviews.shape

In [None]:
# Fetch the parquet feature table from the course's GitHub repository
# (presumably the bag-of-words document-term matrix, judging by the file name).
url = (
    'https://github.com/berinde/curs-analiza-datelor-complexe/blob/main/data/input/dtm_1_bow.parquet?raw=True'
)
dtm_bow = pd.read_parquet(path=url, engine='fastparquet')

In [None]:
# Show the (rows, columns) dimensions of the feature table
dtm_bow.shape

# Train test split

In [None]:
# Hold out 20% of the rows for evaluation; the target is the binary
# 'positive' column. The fixed random_state makes the split reproducible.
X_train_bow, X_test_bow, y_train_bow, y_test_bow = train_test_split(
    dtm_bow, reviews['positive'], train_size=0.8, random_state=42,
)

In [None]:
# Sanity check: row counts of the four pieces returned by the split
print(*(len(part) for part in (X_train_bow, X_test_bow, y_train_bow, y_test_bow)))

# Model

In [None]:
# Gradient-boosted tree classifier for the binary 'positive' target.
model = XGBClassifier(
    # task definition
    booster='gbtree',
    objective='binary:logistic',
    eval_metric='auc',
    # boosting schedule
    n_estimators=200,
    learning_rate=0.1,
    # tree size and row sampling
    max_depth=8,
    max_leaves=32,
    subsample=0.95,
    # verbose logging
    verbosity=3,
)

In [None]:
# Train on the training split; the held-out split is passed as eval_set
# so it is evaluated with the model's eval_metric during training.
model.fit(
    X_train_bow,
    y_train_bow,
    eval_set=[(X_test_bow, y_test_bow)],
)

Reference — XGBoost tree-booster parameters: https://xgboost.readthedocs.io/en/stable/parameter.html#parameters-for-tree-booster

In [None]:
# Generate predictions for the held-out split
y_test_bow_preds = model.predict(X_test_bow)

In [None]:
# Display the predictions array
y_test_bow_preds

In [None]:
# Print the classification report for the held-out split
print(
    'Classification Report pe setul de test\n',
    classification_report(y_true=y_test_bow, y_pred=y_test_bow_preds),
)

## Multi-class Classification

## train test split

In [None]:
# Class ids must start counting at 0, so shift every rating down by one.
# NOTE: this mutates `reviews` in place; re-running the cell shifts again.
reviews['rating'] = reviews['rating'].sub(1)

In [None]:
# Fold rating value 3 into class 2.
# Use a single .loc assignment instead of chained indexing
# (reviews['rating'][mask] = ...), which raises SettingWithCopyWarning and
# does not modify `reviews` at all when pandas copy-on-write is enabled.
reviews.loc[reviews['rating'] == 3, 'rating'] = 2

In [None]:
# Relabel rating value 4 as class 3.
# Use a single .loc assignment instead of chained indexing
# (reviews['rating'][mask] = ...), which raises SettingWithCopyWarning and
# does not modify `reviews` at all when pandas copy-on-write is enabled.
reviews.loc[reviews['rating'] == 4, 'rating'] = 3

In [None]:
# Redo the 80/20 split with the relabelled 'rating' column as target.
# This overwrites the train/test variables of the binary task above.
X_train_bow, X_test_bow, y_train_bow, y_test_bow = train_test_split(
    dtm_bow, reviews['rating'], train_size=0.8, random_state=42,
)

## Model

In [None]:
# Gradient-boosted tree classifier for the 4-class rating target.
# 'multi:softprob' is used instead of 'multi:softmax': the 'auc' eval metric
# needs per-class probabilities, which softmax does not output.
# XGBClassifier.predict() still returns the arg-max class label, so the
# prediction and report cells that follow work unchanged.
model = XGBClassifier(booster='gbtree',
                      objective='multi:softprob',
                      num_class=4,
                      verbosity=3,  # verbose logging
                      learning_rate=0.1,
                      n_estimators=200,
                      max_depth=8,
                      max_leaves=32,
                      subsample=0.95,
                      eval_metric='auc'
                     )

In [None]:
# Train on the training split; the held-out split is passed as eval_set
# so it is evaluated with the model's eval_metric during training.
model.fit(
    X_train_bow,
    y_train_bow,
    eval_set=[(X_test_bow, y_test_bow)],
)

In [None]:
# Generate predictions for the held-out split
y_test_bow_preds = model.predict(X_test_bow)

In [None]:
# Print the classification report for the held-out split
print(
    'Classification Report pe setul de test\n',
    classification_report(y_true=y_test_bow, y_pred=y_test_bow_preds),
)