In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Citirea datelor

In [None]:
# Load the prepared reviews table from the course repository on GitHub.
# NOTE(review): this is a GitHub "blob" URL with `?raw=True`, which is expected
# to serve the raw CSV rather than the HTML page — confirm if the download fails.
url = 'https://github.com/berinde/curs-analiza-datelor-complexe/blob/main/data/input/3.input_data_prepped_bow.csv?raw=True'
reviews = pd.read_csv(url)
# Preview the first two rows to check that the file parsed as expected.
reviews.head(2)

In [None]:
# (rows, columns) of the reviews table.
reviews.shape

In [None]:
# Load the feature matrix used to train the models below.
# NOTE(review): the file name suggests a bag-of-words document-term matrix with
# one row per review, in the same order as `reviews` — confirm.
# `url` is reassigned here; it no longer points at the reviews CSV.
url = 'https://github.com/berinde/curs-analiza-datelor-complexe/blob/main/data/input/dtm_1_bow.parquet?raw=True'
dtm_bow = pd.read_parquet(url)

In [None]:
# (rows, columns) of the feature matrix; the row count should equal that of
# `reviews`, since the labels are taken from `reviews['positive']`.
dtm_bow.shape

# Train test split

In [None]:
# Split features (`dtm_bow`) and labels (`reviews['positive']`) into an 80%
# training part and a 20% test part; random_state=42 makes the shuffle
# reproducible.
# assumes row i of `dtm_bow` describes row i of `reviews` — TODO confirm
# NOTE(review): the split is not stratified on the label; consider passing
# `stratify=` if the classes turn out to be imbalanced.
X_train_bow, X_test_bow, y_train_bow, y_test_bow = train_test_split(
    dtm_bow,
    reviews['positive'],
    train_size=0.8,
    random_state=42
    )

In [None]:
# Sanity check: row counts of the train/test features and of the train/test
# labels, in that order (features and labels of a split must match).
print(len(X_train_bow), len(X_test_bow), len(y_train_bow), len(y_test_bow))

# Model

In [None]:
# Initialise the baseline random forest.
# - random_state=42 makes the fitted forest reproducible.
# - n_jobs=-1 uses every available core instead of a hard-coded worker count,
#   matching the grid-search cell; it does not change the fitted model.
# - max_depth=5, n_estimators=100: one hundred shallow trees.
# - oob_score=True makes fit() also compute an out-of-bag accuracy estimate,
#   available afterwards as `classifier_rf.oob_score_`.
classifier_rf = RandomForestClassifier(random_state=42, n_jobs=-1, max_depth=5,
                                       n_estimators=100, oob_score=True)


In [None]:
# Train the baseline forest on the training split.
classifier_rf.fit(X_train_bow, y_train_bow)

Documentația scikit-learn pentru `RandomForestClassifier`: https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html

In [None]:
# Generate predictions for the held-out test split with the baseline forest.
y_test_bow_preds = classifier_rf.predict(X_test_bow)

In [None]:
# Display the predicted labels for the test split.
y_test_bow_preds

In [None]:
# Print the per-class precision / recall / F1 report for the baseline forest
# on the test split.
# sep='' is required: with print's default separator a space is inserted right
# after the heading's '\n', which shifts the report's header line one column to
# the right of the rows below it.
print('Classification Report pe setul de test\n',
      classification_report(y_test_bow, y_test_bow_preds),
      sep=''
      )

## Grid Search

In [None]:
# Base estimator for the search; the hyper-parameters listed in `params` are
# overridden by GridSearchCV for every candidate.
rf = RandomForestClassifier(random_state=42, n_jobs=-1)

# Search space: 3 x 3 x 3 = 27 parameter combinations.
params = {
    'max_depth': [5,10,20],
    'min_samples_leaf': [50,100,200],
    'n_estimators': [30,50,100]
}


# Instantiate the grid search model
# cv=4 evaluates each combination with 4-fold cross-validation (27 x 4 = 108
# fits), ranking the candidates by accuracy.
# NOTE(review): both the forest and the search request all cores (n_jobs=-1);
# confirm this nesting does not oversubscribe the machine.
grid_search = GridSearchCV(estimator=rf,
                           param_grid=params,
                           cv = 4,
                           n_jobs=-1, verbose=1, scoring="accuracy")

# Run the search on the training split only; the test split stays untouched.
grid_search.fit(X_train_bow, y_train_bow)

In [None]:
# Mean cross-validated score (accuracy, per the `scoring` argument of the
# search) of the best parameter combination.
grid_search.best_score_

In [None]:
# Estimator carrying the best parameter combination found by the search;
# with GridSearchCV's default `refit=True` it has been refit on the whole
# training split.
rf_best = grid_search.best_estimator_
rf_best

In [None]:
# Generate predictions for the held-out test split with the tuned forest.
y_test_bow_preds_grid = rf_best.predict(X_test_bow)

In [None]:
# Print the per-class precision / recall / F1 report for the tuned forest
# on the test split.
# sep='' is required: with print's default separator a space is inserted right
# after the heading's '\n', which shifts the report's header line one column to
# the right of the rows below it.
print('Classification Report pe setul de test\n',
      classification_report(y_test_bow, y_test_bow_preds_grid),
      sep=''
      )